On Wed, Feb 07, 2018 at 01:59:32PM -0800, Anthony DeBarros wrote:
> Hi, all,
> 
> My favorite solution to this issue is to use the very handy Python library
> csvkit: https://csvkit.readthedocs.io/en/1.0.2/

For the same purpose I have written a script in Julia which I regularly
use.  It creates an SQL-file from a csv similar to the output of a
pg_dump of a table.  Then I can run 'psql -f sqlfile.sql' and it will
create and populate the table.

I have attached the script.  You need to have Julia on your computer
(which is a good thing :)

Regards.

Johann
--
Johann Spies                            Telefoon: 021-808 4699
Databestuurder /  Data manager          Faks: 021-883 3691

Sentrum vir Navorsing oor Evaluasie, Wetenskap en Tegnologie
Centre for Research on Evaluation, Science and Technology 
Universiteit Stellenbosch.


The integrity and confidentiality of this e-mail is governed by these terms / 
Die integriteit en vertroulikheid van hierdie e-pos word deur die volgende 
bepalings gereël. http://www.sun.ac.za/emaildisclaimer
#!/usr/bin/env julia
using DataFrames
using CSV

function reservedwords()
    """Only the first set will be used at this stage
       Thanks to the Web2py code where I got the  data
       in this format.
    """
    POSTGRESQL = Set((
        "FALSE",
        "TRUE",
        "ALL",
        "ANALYSE",
        "ANALYZE",
        "AND",
        "ANY",
        "ARRAY",
        "AS",
        "ASC",
        "ASYMMETRIC",
        "AUTHORIZATION",
        "BETWEEN",
        "BIGINT",
        "BINARY",
        "BIT",
        "BOOLEAN",
        "BOTH",
        "CASE",
        "CAST",
        "CHAR",
        "CHARACTER",
        "CHECK",
        "COALESCE",
        "COLLATE",
        "COLUMN",
        "CONSTRAINT",
        "CREATE",
        "CROSS",
        "CURRENT_CATALOG",
        "CURRENT_DATE",
        "CURRENT_ROLE",
        "CURRENT_SCHEMA",
        "CURRENT_TIME",
        "CURRENT_TIMESTAMP",
        "CURRENT_USER",
        "DEC",
        "DECIMAL",
        "DEFAULT",
        "DEFERRABLE",
        "DESC",
        "DISTINCT",
        "DO",
        "ELSE",
        "END",
        "EXCEPT",
        "EXISTS",
        "EXTRACT",
        "FETCH",
        "FLOAT",
        "FOR",
        "FOREIGN",
        "FREEZE",
        "FROM",
        "FULL",
        "GRANT",
        "GREATEST",
        "GROUP",
        "HAVING",
        "ILIKE",
        "IN",
        "INITIALLY",
        "INNER",
        "INOUT",
        "INT",
        "INTEGER",
        "INTERSECT",
        "INTERVAL",
        "INTO",
        "IS",
        "ISNULL",
        "JOIN",
        "LEADING",
        "LEAST",
        "LEFT",
        "LIKE",
        "LIMIT",
        "LOCALTIME",
        "LOCALTIMESTAMP",
        "NATIONAL",
        "NATURAL",
        "NCHAR",
        "NEW",
        "NONE",
        "NOT",
        "NOTNULL",
        "NULL",
        "NULLIF",
        "NUMERIC",
        "OFF",
        "OFFSET",
        "OLD",
        "ON",
        "ONLY",
        "OR",
        "ORDER",
        "OUT",
        "OUTER",
        "OVERLAPS",
        "OVERLAY",
        "PLACING",
        "POSITION",
        "PRECISION",
        "PRIMARY",
        "REAL",
        "REFERENCES",
        "RETURNING",
        "RIGHT",
        "ROW",
        "SELECT",
        "SESSION_USER",
        "SETOF",
        "SIMILAR",
        "SMALLINT",
        "SOME",
        "SUBSTRING",
        "SYMMETRIC",
        "TABLE",
        "THEN",
        "TIME",
        "TIMESTAMP",
        "TO",
        "TRAILING",
        "TREAT",
        "TRIM",
        "UNION",
        "UNIQUE",
        "USER",
        "USING",
        "VALUES",
        "VARCHAR",
        "VARIADIC",
        "VERBOSE",
        "WHEN",
        "WHERE",
        "WITH",
        "XMLATTRIBUTES",
        "XMLCONCAT",
        "XMLELEMENT",
        "XMLFOREST",
        "XMLPARSE",
        "XMLPI",
        "XMLROOT",
        "XMLSERIALIZE",
    ))


    POSTGRESQL_NONRESERVED = Set((
        "A",
        "ABORT",
        "ABS",
        "ABSENT",
        "ABSOLUTE",
        "ACCESS",
        "ACCORDING",
        "ACTION",
        "ADA",
        "ADD",
        "ADMIN",
        "AFTER",
        "AGGREGATE",
        "ALIAS",
        "ALLOCATE",
        "ALSO",
        "ALTER",
        "ALWAYS",
        "ARE",
        "ARRAY_AGG",
        "ASENSITIVE",
        "ASSERTION",
        "ASSIGNMENT",
        "AT",
        "ATOMIC",
        "ATTRIBUTE",
        "ATTRIBUTES",
        "AVG",
        "BACKWARD",
        "BASE64",
        "BEFORE",
        "BEGIN",
        "BERNOULLI",
        "BIT_LENGTH",
        "BITVAR",
        "BLOB",
        "BOM",
        "BREADTH",
        "BY",
        "C",
        "CACHE",
        "CALL",
        "CALLED",
        "CARDINALITY",
        "CASCADE",
        "CASCADED",
        "CATALOG",
        "CATALOG_NAME",
        "CEIL",
        "CEILING",
        "CHAIN",
        "CHAR_LENGTH",
        "CHARACTER_LENGTH",
        "CHARACTER_SET_CATALOG",
        "CHARACTER_SET_NAME",
        "CHARACTER_SET_SCHEMA",
        "CHARACTERISTICS",
        "CHARACTERS",
        "CHECKED",
        "CHECKPOINT",
        "CLASS",
        "CLASS_ORIGIN",
        "CLOB",
        "CLOSE",
        "CLUSTER",
        "COBOL",
        "COLLATION",
        "COLLATION_CATALOG",
        "COLLATION_NAME",
        "COLLATION_SCHEMA",
        "COLLECT",
        "COLUMN_NAME",
        "COLUMNS",
        "COMMAND_FUNCTION",
        "COMMAND_FUNCTION_CODE",
        "COMMENT",
        "COMMIT",
        "COMMITTED",
        "COMPLETION",
        "CONCURRENTLY",
        "CONDITION",
        "CONDITION_NUMBER",
        "CONFIGURATION",
        "CONNECT",
        "CONNECTION",
        "CONNECTION_NAME",
        "CONSTRAINT_CATALOG",
        "CONSTRAINT_NAME",
        "CONSTRAINT_SCHEMA",
        "CONSTRAINTS",
        "CONSTRUCTOR",
        "CONTAINS",
        "CONTENT",
        "CONTINUE",
        "CONVERSION",
        "CONVERT",
        "COPY",
        "CORR",
        "CORRESPONDING",
        "COST",
        "COUNT",
        "COVAR_POP",
        "COVAR_SAMP",
        "CREATEDB",
        "CREATEROLE",
        "CREATEUSER",
        "CSV",
        "CUBE",
        "CUME_DIST",
        "CURRENT",
        "CURRENT_DEFAULT_TRANSFORM_GROUP",
        "CURRENT_PATH",
        "CURRENT_TRANSFORM_GROUP_FOR_TYPE",
        "CURSOR",
        "CURSOR_NAME",
        "CYCLE",
        "DATA",
        "DATABASE",
        "DATE",
        "DATETIME_INTERVAL_CODE",
        "DATETIME_INTERVAL_PRECISION",
        "DAY",
        "DEALLOCATE",
        "DECLARE",
        "DEFAULTS",
        "DEFERRED",
        "DEFINED",
        "DEFINER",
        "DEGREE",
        "DELETE",
        "DELIMITER",
        "DELIMITERS",
        "DENSE_RANK",
        "DEPTH",
        "DEREF",
        "DERIVED",
        "DESCRIBE",
        "DESCRIPTOR",
        "DESTROY",
        "DESTRUCTOR",
        "DETERMINISTIC",
        "DIAGNOSTICS",
        "DICTIONARY",
        "DISABLE",
        "DISCARD",
        "DISCONNECT",
        "DISPATCH",
        "DOCUMENT",
        "DOMAIN",
        "DOUBLE",
        "DROP",
        "DYNAMIC",
        "DYNAMIC_FUNCTION",
        "DYNAMIC_FUNCTION_CODE",
        "EACH",
        "ELEMENT",
        "EMPTY",
        "ENABLE",
        "ENCODING",
        "ENCRYPTED",
        "END-EXEC",
        "ENUM",
        "EQUALS",
        "ESCAPE",
        "EVERY",
        "EXCEPTION",
        "EXCLUDE",
        "EXCLUDING",
        "EXCLUSIVE",
        "EXEC",
        "EXECUTE",
        "EXISTING",
        "EXP",
        "EXPLAIN",
        "EXTERNAL",
        "FAMILY",
        "FILTER",
        "FINAL",
        "FIRST",
        "FIRST_VALUE",
        "FLAG",
        "FLOOR",
        "FOLLOWING",
        "FORCE",
        "FORTRAN",
        "FORWARD",
        "FOUND",
        "FREE",
        "FUNCTION",
        "FUSION",
        "G",
        "GENERAL",
        "GENERATED",
        "GET",
        "GLOBAL",
        "GO",
        "GOTO",
        "GRANTED",
        "GROUPING",
        "HANDLER",
        "HEADER",
        "HEX",
        "HIERARCHY",
        "HOLD",
        "HOST",
        "HOUR",
        #    "ID",
        "IDENTITY",
        "IF",
        "IGNORE",
        "IMMEDIATE",
        "IMMUTABLE",
        "IMPLEMENTATION",
        "IMPLICIT",
        "INCLUDING",
        "INCREMENT",
        "INDENT",
        "INDEX",
        "INDEXES",
        "INDICATOR",
        "INFIX",
        "INHERIT",
        "INHERITS",
        "INITIALIZE",
        "INPUT",
        "INSENSITIVE",
        "INSERT",
        "INSTANCE",
        "INSTANTIABLE",
        "INSTEAD",
        "INTERSECTION",
        "INVOKER",
        "ISOLATION",
        "ITERATE",
        "K",
        "KEY",
        "KEY_MEMBER",
        "KEY_TYPE",
        "LAG",
        "LANCOMPILER",
        "LANGUAGE",
        "LARGE",
        "LAST",
        "LAST_VALUE",
        "LATERAL",
        "LC_COLLATE",
        "LC_CTYPE",
        "LEAD",
        "LENGTH",
        "LESS",
        "LEVEL",
        "LIKE_REGEX",
        "LISTEN",
        "LN",
        "LOAD",
        "LOCAL",
        "LOCATION",
        "LOCATOR",
        "LOCK",
        "LOGIN",
        "LOWER",
        "M",
        "MAP",
        "MAPPING",
        "MATCH",
        "MATCHED",
        "MAX",
        "MAX_CARDINALITY",
        "MAXVALUE",
        "MEMBER",
        "MERGE",
        "MESSAGE_LENGTH",
        "MESSAGE_OCTET_LENGTH",
        "MESSAGE_TEXT",
        "METHOD",
        "MIN",
        "MINUTE",
        "MINVALUE",
        "MOD",
        "MODE",
        "MODIFIES",
        "MODIFY",
        "MODULE",
        "MONTH",
        "MORE",
        "MOVE",
        "MULTISET",
        "MUMPS",
        #    "NAME",
        "NAMES",
        "NAMESPACE",
        "NCLOB",
        "NESTING",
        "NEXT",
        "NFC",
        "NFD",
        "NFKC",
        "NFKD",
        "NIL",
        "NO",
        "NOCREATEDB",
        "NOCREATEROLE",
        "NOCREATEUSER",
        "NOINHERIT",
        "NOLOGIN",
        "NORMALIZE",
        "NORMALIZED",
        "NOSUPERUSER",
        "NOTHING",
        "NOTIFY",
        "NOWAIT",
        "NTH_VALUE",
        "NTILE",
        "NULLABLE",
        "NULLS",
        "NUMBER",
        "OBJECT",
        "OCCURRENCES_REGEX",
        "OCTET_LENGTH",
        "OCTETS",
        "OF",
        "OIDS",
        "OPEN",
        "OPERATION",
        "OPERATOR",
        "OPTION",
        "OPTIONS",
        "ORDERING",
        "ORDINALITY",
        "OTHERS",
        "OUTPUT",
        "OVER",
        "OVERRIDING",
        "OWNED",
        "OWNER",
        "P",
        "PAD",
        "PARAMETER",
        "PARAMETER_MODE",
        "PARAMETER_NAME",
        "PARAMETER_ORDINAL_POSITION",
        "PARAMETER_SPECIFIC_CATALOG",
        "PARAMETER_SPECIFIC_NAME",
        "PARAMETER_SPECIFIC_SCHEMA",
        "PARAMETERS",
        "PARSER",
        "PARTIAL",
        "PARTITION",
        "PASCAL",
        "PASSING",
        #    "PASSWORD",
        "PATH",
        "PERCENT_RANK",
        "PERCENTILE_CONT",
        "PERCENTILE_DISC",
        "PLANS",
        "PLI",
        "POSITION_REGEX",
        "POSTFIX",
        "POWER",
        "PRECEDING",
        "PREFIX",
        "PREORDER",
        "PREPARE",
        "PREPARED",
        "PRESERVE",
        "PRIOR",
        "PRIVILEGES",
        "PROCEDURAL",
        "PROCEDURE",
        "PUBLIC",
        "QUOTE",
        "RANGE",
        "RANK",
        "READ",
        "READS",
        "REASSIGN",
        "RECHECK",
        "RECURSIVE",
        "REF",
        "REFERENCING",
        "REGR_AVGX",
        "REGR_AVGY",
        "REGR_COUNT",
        "REGR_INTERCEPT",
        "REGR_R2",
        "REGR_SLOPE",
        "REGR_SXX",
        "REGR_SXY",
        "REGR_SYY",
        "REINDEX",
        "RELATIVE",
        "RELEASE",
        "RENAME",
        "REPEATABLE",
        "REPLACE",
        "REPLICA",
        "RESET",
        "RESPECT",
        "RESTART",
        "RESTRICT",
        "RESULT",
        "RETURN",
        "RETURNED_CARDINALITY",
        "RETURNED_LENGTH",
        "RETURNED_OCTET_LENGTH",
        "RETURNED_SQLSTATE",
        "RETURNS",
        "REVOKE",
        "ROLE",
        "ROLLBACK",
        "ROLLUP",
        "ROUTINE",
        "ROUTINE_CATALOG",
        "ROUTINE_NAME",
        "ROUTINE_SCHEMA",
        "ROW_COUNT",
        "ROW_NUMBER",
        "ROWS",
        "RULE",
        "SAVEPOINT",
        "SCALE",
        "SCHEMA",
        "SCHEMA_NAME",
        "SCOPE",
        "SCOPE_CATALOG",
        "SCOPE_NAME",
        "SCOPE_SCHEMA",
        "SCROLL",
        "SEARCH",
        "SECOND",
        "SECTION",
        "SECURITY",
        "SELF",
        "SENSITIVE",
        "SEQUENCE",
        "SERIALIZABLE",
        "SERVER",
        "SERVER_NAME",
        "SESSION",
        "SET",
        "SETS",
        "SHARE",
        "SHOW",
        "SIMPLE",
        "SIZE",
        "SOURCE",
        "SPACE",
        "SPECIFIC",
        "SPECIFIC_NAME",
        "SPECIFICTYPE",
        "SQL",
        "SQLCODE",
        "SQLERROR",
        "SQLEXCEPTION",
        "SQLSTATE",
        "SQLWARNING",
        "SQRT",
        "STABLE",
        "STANDALONE",
        "START",
        "STATE",
        "STATEMENT",
        "STATIC",
        "STATISTICS",
        "STDDEV_POP",
        "STDDEV_SAMP",
        "STDIN",
        "STDOUT",
        "STORAGE",
        "STRICT",
        "STRIP",
        "STRUCTURE",
        "STYLE",
        "SUBCLASS_ORIGIN",
        "SUBLIST",
        "SUBMULTISET",
        "SUBSTRING_REGEX",
        "SUM",
        "SUPERUSER",
        "SYSID",
        "SYSTEM",
        "SYSTEM_USER",
        "T",
        "TABLE_NAME",
        "TABLESAMPLE",
        "TABLESPACE",
        "TEMP",
        "TEMPLATE",
        "TEMPORARY",
        "TERMINATE",
        "TEXT",
        "THAN",
        "TIES",
        "TIMEZONE_HOUR",
        "TIMEZONE_MINUTE",
        "TOP_LEVEL_COUNT",
        "TRANSACTION",
        "TRANSACTION_ACTIVE",
        "TRANSACTIONS_COMMITTED",
        "TRANSACTIONS_ROLLED_BACK",
        "TRANSFORM",
        "TRANSFORMS",
        "TRANSLATE",
        "TRANSLATE_REGEX",
        "TRANSLATION",
        "TRIGGER",
        "TRIGGER_CATALOG",
        "TRIGGER_NAME",
        "TRIGGER_SCHEMA",
        "TRIM_ARRAY",
        "TRUNCATE",
        "TRUSTED",
        "TYPE",
        "UESCAPE",
        "UNBOUNDED",
        "UNCOMMITTED",
        "UNDER",
        "UNENCRYPTED",
        "UNKNOWN",
        "UNLISTEN",
        "UNNAMED",
        "UNNEST",
        "UNTIL",
        "UNTYPED",
        "UPDATE",
        "UPPER",
        "URI",
        "USAGE",
        "USER_DEFINED_TYPE_CATALOG",
        "USER_DEFINED_TYPE_CODE",
        "USER_DEFINED_TYPE_NAME",
        "USER_DEFINED_TYPE_SCHEMA",
        "VACUUM",
        "VALID",
        "VALIDATOR",
        "VALUE",
        "VAR_POP",
        "VAR_SAMP",
        "VARBINARY",
        "VARIABLE",
        "VARYING",
        "VERSION",
        "VIEW",
        "VOLATILE",
        "WHENEVER",
        "WHITESPACE",
        "WIDTH_BUCKET",
        "WINDOW",
        "WITHIN",
        "WITHOUT",
        "WORK",
        "WRAPPER",
        "WRITE",
        "XML",
        "XMLAGG",
        "XMLBINARY",
        "XMLCAST",
        "XMLCOMMENT",
        "XMLDECLARATION",
        "XMLDOCUMENT",
        "XMLEXISTS",
        "XMLITERATE",
        "XMLNAMESPACES",
        "XMLQUERY",
        "XMLSCHEMA",
        "XMLTABLE",
        "XMLTEXT",
        "XMLVALIDATE",
        "YEAR",
        "YES",
        "ZONE",
    ))
        combined = union(POSTGRESQL,  POSTGRESQL_NONRESERVED)
        POSTGRESQL
end

function field_type(df)
    """Get the names and field types from the dataframe"""
    nr,nc = size(df)
    fields = names(df)
    fl = [(string(fields[i]), string(eltype(df[i][1:nr]))) for i in 1:nc]
end

function jt_to_sql(t)
    """Convert the Julia-datatypes to Postgresql"""
    md = Dict(
    "UTF8String"=>"text",
    "Int64"=>"bigint",
    "Int8"=>"smallint",
    "UInt8"=>"smallint",
    "Int16"=>"integer",
    "UInt16"=>"integer",
    "Int32"=>"integer",
    "UInt32"=>"integer",
    "UInt64"=>"bigint",
    "Int128"=>"integer",
    "UInt128"=>"integer",
    "Float64"=>"numeric",
    "Char"=>"char(1)",
    "Bool"=>"boolean")
    if haskey(md,t)
        md[t]
    else
        "text"
    end
end

function correct_fieldvalue(v)
    """ replace xml-values with utf-8
        replace ' with ''
    """
    v = replace(v, r"'", "''")
    v = replace(v, r">", ">")
    v = replace(v, r"&lt;", "<")
end

function row_of_strings(r)
    """Prepare a row of strings to print from one row"""
    nrows,ncols = size(r)
    s = fill("",ncols)
    for col in 1:ncols
        v = r[nrows,col]
        if ismissing(v)
            v = "\\N"
        else
            v = string(v)
        end
        s[col] = correct_fieldvalue(v)
    end
    join(s,"\t")
end

function correct_fieldname(f,pg_keywords)
    """  correct fieldnames that starts with either
         a numerical or a underscore character
         and add a 'n_' in front of the name
    """
    syfers = r"^[0-9_]"
    if uppercase(f) in pg_keywords
        f = "n_$f"
    elseif ismatch(syfers,f)
        f = "n_$f"
    else
        f
    end
    f
end

function create_table_def(df,t)
    nrows,ncols = size(df) #rows and columns
    fls = field_type(df) # get the fieldnames and types
    pg_keywords = reservedwords()
    # Create a list of fieldnames
    fns = [correct_fieldname(x[1],pg_keywords) for x in fls]
    s = "CREATE TABLE $t ("
    for c in 1:length(fls)
        (i,f) = fls[c] #i = fieldname f = type
        if i == "id"
            st = "serial"
        else
            st = jt_to_sql(f) # postgresql-type
        end
        i = correct_fieldname(i,pg_keywords)
        s = s * "$i $st,\n"
    end
    ss = s[1:length(s)-2] * ");\n\n" # remove last ',' and '\n' and add ');'
    # The following header was taken from a file created by pg_dump
    header = """SET statement_timeout = 0;
                SET lock_timeout = 0;
                SET idle_in_transaction_session_timeout = 0;
                SET client_encoding = 'UTF8';
                SET standard_conforming_strings = on;
                SET check_function_bodies = false;
                SET client_min_messages = warning;
                SET row_security = off;
                SET search_path = public, pg_catalog;
                SET default_tablespace = '';
                SET default_with_oids = false;\n\n"""
    print(header)
    print(ss)
    # Create a line like this:
    # COPY aa (article, sa_author, id, last_name, first_name, aa) FROM stdin;
    cs = "COPY $t ("
    z = join(fns, ", ")
    css = cs * z * ") FROM stdin;\n"
    print(css)
    format_rows(df)
end

function format_rows(df)
    """Create rows of tab-delimited data from the values in the dataframe"""
    nrows,ncols = size(df)
    for row in 1:nrows
        s = row_of_strings(df[row,1:ncols])
        println(s)
    end
    println("\\.")
end
function addcolumn(df, c, v )
    # Add a column with the name c and value v(in all the rows) to df
    df[c] = v
    df
end

function add_id(df)
    # if there is a field "id" do nothing else add it
    nrows,  = size(df)
    (:id in names(df)) ||  addcolumn(df, :id, 1:nrows)

end
if length(ARGS) < 2
    println
    
println("-----------------------------------------------------------------------------")
    println("Usage: julia csv_to_sql <csvfilename> <tablename for sql  [> 
outputfile.sql] ")
    println("If you do not redirect it the output will be to stdout")
    
println("-----------------------------------------------------------------------------")
    println
    exit(1)
end
#function __init__()
input_file = ARGS[1]
table = ARGS[2]
df = CSV.read(input_file)
df = add_id(df) # Add id-field if it is not present in dataframe
create_table_def(df,table)
#end

Reply via email to