Our objective is to take a file that looks like this…
$ cat flatfile.txt
AcctID, IsValid, IsLate, StartDate , NextDate , EndDate , Name , Amount
1 , true , true , 2012-10-05, 2012-12-25 15:55:21 , 1/1/2013 2:54AM , NULL , 10.00
2 , true , false , 2012-10-06, 2012-12-25 17:24 , 2013-01-01 1:44PM , , 5.12
3 , , false , 2011-10-05, 2012-01-25 2:53 PM , 01/01/2013 , Rocky , 6.70
4 , false , true , 2010-10-01, 2012-12-25 01:24 , 2013-01-01 , , 15.30
…and to load it into a table…
COPY MyTable
(
AcctID
,IsValidSource FILLER VARCHAR(5)
,ISValid AS CASE WHEN IsValidSource iLIKE '%TRUE%' THEN 1 ELSE 0 END
,IsLateSource FILLER VARCHAR(5)
,IsLate AS CASE WHEN IsLateSource iLIKE '%TRUE%' THEN 1 ELSE 0 END
,StartDateSource FILLER DATE
,StartDate AS COALESCE(StartDateSource,'9999-12-31')
,NextDateSource FILLER DATETIME
,NextDate AS COALESCE(NextDateSource,'9999-12-31')
,EndDateSource FILLER DATETIME
,EndDate AS COALESCE(EndDateSource,'9999-12-31')
,EndDateYYYYMMDD AS TO_CHAR(COALESCE(EndDateSource,'9999-12-31'),'YYYYMMDD')::INT
,EndDateDateYYYYMMDDHHMMSS AS TO_CHAR(COALESCE(EndDateSource,'9999-12-31'),'YYYYMMDDHH24MISS')::INT
,NameSource FILLER VARCHAR(100)
,Name AS CASE WHEN NameSource ilike '%NULL%' THEN '' ELSE TRIM(NameSource) END
,AmountSource FILLER MONEY
,Amount AS COALESCE(AmountSource,0)
,WeirdNumber AS (AcctID*1000000000 + COALESCE(AmountSource,0)*100)::INT
)
FROM :SourceFileName
WITH DELIMITER ','
SKIP 1
STREAM NAME :SourceFileName
;
…so that it looks like this:
-[ RECORD 1 ]-------------+--------------------------
RowID | 1
AcctID | 1
IsValid | 1
IsLate | 1
StartDate | 2012-10-05
NextDate | 2012-12-25 15:55:21
EndDate | 2013-01-01 02:54:00
EndDateYYYYMMDD | 20130101
EndDateDateYYYYMMDDHHMMSS | 20130101025400
Name |
Amount | 10.0000
InsertDate | 2012-10-21 22:47:30.84269
RandomUniform0To1 | 0.922055047936738
RandomInt100 | 0
WeirdNumber | 1000001000
-[ RECORD 2 ]-------------+--------------------------
RowID | 2
AcctID | 2
IsValid | 1
IsLate | 0
StartDate | 2012-10-06
NextDate | 2012-12-25 17:24:00
EndDate | 2013-01-01 13:44:00
EndDateYYYYMMDD | 20130101
EndDateDateYYYYMMDDHHMMSS | 20130101134400
Name |
Amount | 5.1200
InsertDate | 2012-10-21 22:47:30.84269
RandomUniform0To1 | 0.856217089109123
RandomInt100 | 32
WeirdNumber | 2000000512
-[ RECORD 3 ]-------------+--------------------------
RowID | 3
AcctID | 3
IsValid | 0
IsLate | 0
StartDate | 2011-10-05
NextDate | 2012-01-25 14:53:00
EndDate | 2013-01-01 00:00:00
EndDateYYYYMMDD | 20130101
EndDateDateYYYYMMDDHHMMSS | 20130101000000
Name | Rocky
Amount | 6.7000
InsertDate | 2012-10-21 22:47:30.84269
RandomUniform0To1 | 0.982697854051366
RandomInt100 | 90
WeirdNumber | 3000000670
-[ RECORD 4 ]-------------+--------------------------
RowID | 4
AcctID | 4
IsValid | 0
IsLate | 1
StartDate | 2010-10-01
NextDate | 2012-12-25 01:24:00
EndDate | 2013-01-01 00:00:00
EndDateYYYYMMDD | 20130101
EndDateDateYYYYMMDDHHMMSS | 20130101000000
Name |
Amount | 15.3000
InsertDate | 2012-10-21 22:47:30.84269
RandomUniform0To1 | 0.425485474290326
RandomInt100 | 55
WeirdNumber | 4000001530
Here is the SQL code. Pay attention to the commentary, it’s really important.
-----------------------------------------------------------------------------------
-- How to Load in a File and do some basic QA.
--
-- Note: We want the transaction to roll back if there is an error.
-- Consequently, AUTOCOMMIT is set to OFF
-- and ON_ERROR_STOP is set to ON
-- Otherwise, when an error is encountered Vertica would
-- move on to the next statement instead of crashing and rolling back.
-- \x transposes the output.
-----------------------------------------------------------------------------------
\set AUTOCOMMIT off
\set ON_ERROR_STOP on
\timing
\x
-----------------------------------------------------------------------------------
-- Capture the Start Time
-----------------------------------------------------------------------------------
\set StartDateTime `date "+%Y%m%d%H%M%S"`
\set StepStartDateTime `date "+%Y%m%d%H%M%S"`
-----------------------------------------------------------------------------------
-- Once you've created the table and loaded in some sample data
-- then you need to RUN DATABASE DESIGNER.
-- Look at the DD output and follow it's guidance for the
-- encoding type to use for each column.
-- The easiest way to do this is to create another super projection
-- and drop the original super projection if you need to keep the data in
-- the table. If you don't specify an encoding type for each column then your
-- data will not be compressed, wasting disk space and processing time.
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-- TRANSACTION_TIMESTAMP()
-- This function returns the start time of the current transaction;
-- the value does not change during the transaction. The intent is
-- to allow a single transaction to have a consistent notion of
-- the "current" time, so that multiple modifications within
-- the same transaction bear the same timestamp.
-----------------------------------------------------------------------------------
DROP TABLE IF EXISTS MyTable ;
CREATE TABLE MyTable
(
RowID AUTO_INCREMENT NOT NULL ENCODING COMMONDELTA_COMP
,AcctID INT NOT NULL ENCODING AUTO
-- Prefer INT to Binary because it is easier to SUM them up.
,IsValid INT NOT NULL ENCODING RLE
,IsLate INT NOT NULL ENCODING RLE
,StartDate DATE NOT NULL
,NextDate DATETIME NOT NULL
,EndDate DATETIME
,EndDateYYYYMMDD INT NOT NULL
,EndDateDateYYYYMMDDHHMMSS INT NOT NULL
,Name VARCHAR(100)
,Amount MONEY NOT NULL
,InsertDate DATETIME NOT NULL DEFAULT TRANSACTION_TIMESTAMP()
,RandomUniform0To1 FLOAT NOT NULL DEFAULT RANDOM()
,RandomInt100 INT NOT NULL DEFAULT RANDOMINT(100)
,WeirdNumber INT NOT NULL
,PRIMARY KEY (AcctID)
)
ORDER BY IsValid, IsLate, Amount, Name, StartDate, NextDate, EndDate, AcctID
-- UNSEGMENTED ALL NODES -- Use UNSEGMENTED for Dimension Tables
SEGMENTED BY MODULARHASH(AcctID) ALL NODES -- Use SEGMENTED for Fact Tables
KSAFE 1 -- Put desired K-safety factor here.
;
-----------------------------------------------------------------------------------
-- Log Time
-----------------------------------------------------------------------------------
\set EndDateTime `date "+%Y%m%d%H%M%S"`
SELECT TO_TIMESTAMP( :StartDateTime , 'YYYYMMDDHH24MISS') AS StartDateTime
,TO_TIMESTAMP( :StepStartDateTime , 'YYYYMMDDHH24MISS') AS StepStartDateTime
,DATEDIFF(SECOND, TO_TIMESTAMP( :StepStartDateTime , 'YYYYMMDDHH24MISS')
, TO_TIMESTAMP( :EndDateTime , 'YYYYMMDDHH24MISS')
) AS IncrementalSeconds
,DATEDIFF(SECOND, TO_TIMESTAMP( :StartDateTime , 'YYYYMMDDHH24MISS')
, TO_TIMESTAMP( :EndDateTime , 'YYYYMMDDHH24MISS')
) AS TotalSeconds
;
\set :StepStartDateTime `date "+%Y%m%d%H%M%S"`
-----------------------------------------------------------------------------------
-- I like to use variables for my input files
-- Note, you could also use wildcards such as
-- /home/dbadmin/playground/flat*.txt
-----------------------------------------------------------------------------------
\set SourceFileName '''' /home/dbadmin/playground/flatfile.txt ''''
-----------------------------------------------------------------------------------
-- Read in the file
-- iLIKE is a case insenstive comparison
-- Expressions can only contain source columns, not derived columns
-- StreamName makes it easy to find stats in
-- v_monitor.load_status table
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-- I like qecho because it logs the comments in the log file, making it easy to
-- trace exactly where the program hit an Error.
\qecho =============================================================================
\qecho Load the file
\qecho =============================================================================
COPY MyTable
(
AcctID
,IsValidSource FILLER VARCHAR(5)
,ISValid AS CASE WHEN IsValidSource iLIKE '%TRUE%' THEN 1 ELSE 0 END
,IsLateSource FILLER VARCHAR(5)
,IsLate AS CASE WHEN IsLateSource iLIKE '%TRUE%' THEN 1 ELSE 0 END
,StartDateSource FILLER DATE
,StartDate AS COALESCE(StartDateSource,'9999-12-31')
,NextDateSource FILLER DATETIME
,NextDate AS COALESCE(NextDateSource,'9999-12-31')
,EndDateSource FILLER DATETIME
,EndDate AS COALESCE(EndDateSource,'9999-12-31')
,EndDateYYYYMMDD AS TO_CHAR(COALESCE(EndDateSource,'9999-12-31'),'YYYYMMDD')::INT
,EndDateDateYYYYMMDDHHMMSS AS TO_CHAR(COALESCE(EndDateSource,'9999-12-31'),'YYYYMMDDHH24MISS')::INT
,NameSource FILLER VARCHAR(100)
,Name AS CASE WHEN NameSource ilike '%NULL%' THEN '' ELSE TRIM(NameSource) END
,AmountSource FILLER MONEY
,Amount AS COALESCE(AmountSource,0)
,WeirdNumber AS (AcctID*1000000000 + COALESCE(AmountSource,0)*100)::INT
)
FROM :SourceFileName
WITH DELIMITER ','
SKIP 1
STREAM NAME :SourceFileName
;
-----------------------------------------------------------------------------------
-- Log Time
-----------------------------------------------------------------------------------
\set EndDateTime `date "+%Y%m%d%H%M%S"`
SELECT TO_TIMESTAMP( :StartDateTime , 'YYYYMMDDHH24MISS') AS StartDateTime
,TO_TIMESTAMP( :StepStartDateTime , 'YYYYMMDDHH24MISS') AS StepStartDateTime
,DATEDIFF(SECOND, TO_TIMESTAMP( :StepStartDateTime , 'YYYYMMDDHH24MISS')
, TO_TIMESTAMP( :EndDateTime , 'YYYYMMDDHH24MISS')
) AS IncrementalSeconds
,DATEDIFF(SECOND, TO_TIMESTAMP( :StartDateTime , 'YYYYMMDDHH24MISS')
, TO_TIMESTAMP( :EndDateTime , 'YYYYMMDDHH24MISS')
) AS TotalSeconds
;
\set :StepStartDateTime `date "+%Y%m%d%H%M%S"`
-----------------------------------------------------------------------------------
-- QA: Make sure all rows loaded and rollback if there was
-- an error.
-----------------------------------------------------------------------------------
\qecho =============================================================================
\qecho Raise an Error condition by selecting 1/0 if any rows were rejected
\qecho IF GET_NUM_REJECTED_ROWS()<>0 THEN 1-SIGN(GET_NUM_REJECTED_ROWS())) = 0
\qecho =============================================================================
SELECT
GET_NUM_REJECTED_ROWS() AS NumRejectedRows
,GET_NUM_ACCEPTED_ROWS() AS NumAcceptedRows
;
SELECT 1 / (1-SIGN(GET_NUM_REJECTED_ROWS()));
\qecho =============================================================================
\qecho Look at Load Streams Table
\qecho =============================================================================
SELECT * FROM v_monitor.load_streams
WHERE stream_name like :SourceFileName ;
\qecho =============================================================================
\qecho QA: IsValid,IsLate contain only binary values.
\qecho COUNT(*)=0 if no records returned
\qecho =============================================================================
SELECT 1/(1-SIGN(COUNT(*)))
FROM (SELECT *
FROM MyTable
WHERE IsValid > 1
OR IsValid < 0
OR IsLate > 1
OR IsLate < 0
) AS T1;
\qecho ============================================================================
\qecho Since Vertica does not check constraints when loading data,
\qecho raise an error and rollback if primary key constraint is violated.
\qecho ============================================================================
SELECT 1/(1-SIGN(COUNT(*)))
FROM (SELECT ANALYZE_CONSTRAINTS ('MyTable')) AS T1;
-----------------------------------------------------------------------------------
-- Log Time
-----------------------------------------------------------------------------------
\set EndDateTime `date "+%Y%m%d%H%M%S"`
SELECT TO_TIMESTAMP( :StartDateTime , 'YYYYMMDDHH24MISS') AS StartDateTime
,TO_TIMESTAMP( :StepStartDateTime , 'YYYYMMDDHH24MISS') AS StepStartDateTime
,DATEDIFF(SECOND, TO_TIMESTAMP( :StepStartDateTime , 'YYYYMMDDHH24MISS')
, TO_TIMESTAMP( :EndDateTime , 'YYYYMMDDHH24MISS')
) AS IncrementalSeconds
,DATEDIFF(SECOND, TO_TIMESTAMP( :StartDateTime , 'YYYYMMDDHH24MISS')
, TO_TIMESTAMP( :EndDateTime , 'YYYYMMDDHH24MISS')
) AS TotalSeconds
;
\set :StepStartDateTime `date "+%Y%m%d%H%M%S"`
-- SELECT 1/0 ; -- Uncomment this line to Raise an Error
\qecho ============================================================================
\qecho Run ANALYZE_STATISTICS
\qecho Note:
\qecho THIS IS THE SAME AS A COMMIT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
\qecho ============================================================================
SELECT ANALYZE_STATISTICS('MyTable');
-----------------------------------------------------------------------------------
-- Log Time
-----------------------------------------------------------------------------------
\set EndDateTime `date "+%Y%m%d%H%M%S"`
SELECT TO_TIMESTAMP( :StartDateTime , 'YYYYMMDDHH24MISS') AS StartDateTime
,TO_TIMESTAMP( :StepStartDateTime , 'YYYYMMDDHH24MISS') AS StepStartDateTime
,DATEDIFF(SECOND, TO_TIMESTAMP( :StepStartDateTime , 'YYYYMMDDHH24MISS')
, TO_TIMESTAMP( :EndDateTime , 'YYYYMMDDHH24MISS')
) AS IncrementalSeconds
,DATEDIFF(SECOND, TO_TIMESTAMP( :StartDateTime , 'YYYYMMDDHH24MISS')
, TO_TIMESTAMP( :EndDateTime , 'YYYYMMDDHH24MISS')
) AS TotalSeconds
;
\set :StepStartDateTime `date "+%Y%m%d%H%M%S"`
-----------------------------------------------------------------------------------
-- Did it Work?
-----------------------------------------------------------------------------------
SELECT * FROM MyTable ORDER BY AcctID;
-----------------------------------------------------------------------------------
-- Cleanup (don't do this in production code).
-----------------------------------------------------------------------------------
DROP TABLE IF EXISTS MyTable ;
\q