On Wed, Feb 07, 2018 at 01:59:32PM -0800, Anthony DeBarros wrote:
> Hi, all,
>
> My favorite solution to this issue is to use the very handy Python library
> csvkit: https://csvkit.readthedocs.io/en/1.0.2/
For the same purpose I have written a script in Julia which I regularly
use. It creates an SQL-file from a csv similar to the output of a
pg_dump of a table. Then I can run 'psql -f sqlfile.sql' and it will
create and populate the table.
I have attached the script. You need to have Julia on your computer
(which is a good thing :)
Regards.
Johann
--
Johann Spies Telefoon: 021-808 4699
Databestuurder / Data manager Faks: 021-883 3691
Sentrum vir Navorsing oor Evaluasie, Wetenskap en Tegnologie
Centre for Research on Evaluation, Science and Technology
Universiteit Stellenbosch.
The integrity and confidentiality of this e-mail is governed by these terms /
Die integriteit en vertroulikheid van hierdie e-pos word deur die volgende
bepalings gereël. http://www.sun.ac.za/emaildisclaimer
#!/usr/bin/env julia
using DataFrames
using CSV
function reservedwords()
"""Only the first set will be used at this stage
Thanks to the Web2py code where I got the data
in this format.
"""
POSTGRESQL = Set((
"FALSE",
"TRUE",
"ALL",
"ANALYSE",
"ANALYZE",
"AND",
"ANY",
"ARRAY",
"AS",
"ASC",
"ASYMMETRIC",
"AUTHORIZATION",
"BETWEEN",
"BIGINT",
"BINARY",
"BIT",
"BOOLEAN",
"BOTH",
"CASE",
"CAST",
"CHAR",
"CHARACTER",
"CHECK",
"COALESCE",
"COLLATE",
"COLUMN",
"CONSTRAINT",
"CREATE",
"CROSS",
"CURRENT_CATALOG",
"CURRENT_DATE",
"CURRENT_ROLE",
"CURRENT_SCHEMA",
"CURRENT_TIME",
"CURRENT_TIMESTAMP",
"CURRENT_USER",
"DEC",
"DECIMAL",
"DEFAULT",
"DEFERRABLE",
"DESC",
"DISTINCT",
"DO",
"ELSE",
"END",
"EXCEPT",
"EXISTS",
"EXTRACT",
"FETCH",
"FLOAT",
"FOR",
"FOREIGN",
"FREEZE",
"FROM",
"FULL",
"GRANT",
"GREATEST",
"GROUP",
"HAVING",
"ILIKE",
"IN",
"INITIALLY",
"INNER",
"INOUT",
"INT",
"INTEGER",
"INTERSECT",
"INTERVAL",
"INTO",
"IS",
"ISNULL",
"JOIN",
"LEADING",
"LEAST",
"LEFT",
"LIKE",
"LIMIT",
"LOCALTIME",
"LOCALTIMESTAMP",
"NATIONAL",
"NATURAL",
"NCHAR",
"NEW",
"NONE",
"NOT",
"NOTNULL",
"NULL",
"NULLIF",
"NUMERIC",
"OFF",
"OFFSET",
"OLD",
"ON",
"ONLY",
"OR",
"ORDER",
"OUT",
"OUTER",
"OVERLAPS",
"OVERLAY",
"PLACING",
"POSITION",
"PRECISION",
"PRIMARY",
"REAL",
"REFERENCES",
"RETURNING",
"RIGHT",
"ROW",
"SELECT",
"SESSION_USER",
"SETOF",
"SIMILAR",
"SMALLINT",
"SOME",
"SUBSTRING",
"SYMMETRIC",
"TABLE",
"THEN",
"TIME",
"TIMESTAMP",
"TO",
"TRAILING",
"TREAT",
"TRIM",
"UNION",
"UNIQUE",
"USER",
"USING",
"VALUES",
"VARCHAR",
"VARIADIC",
"VERBOSE",
"WHEN",
"WHERE",
"WITH",
"XMLATTRIBUTES",
"XMLCONCAT",
"XMLELEMENT",
"XMLFOREST",
"XMLPARSE",
"XMLPI",
"XMLROOT",
"XMLSERIALIZE",
))
POSTGRESQL_NONRESERVED = Set((
"A",
"ABORT",
"ABS",
"ABSENT",
"ABSOLUTE",
"ACCESS",
"ACCORDING",
"ACTION",
"ADA",
"ADD",
"ADMIN",
"AFTER",
"AGGREGATE",
"ALIAS",
"ALLOCATE",
"ALSO",
"ALTER",
"ALWAYS",
"ARE",
"ARRAY_AGG",
"ASENSITIVE",
"ASSERTION",
"ASSIGNMENT",
"AT",
"ATOMIC",
"ATTRIBUTE",
"ATTRIBUTES",
"AVG",
"BACKWARD",
"BASE64",
"BEFORE",
"BEGIN",
"BERNOULLI",
"BIT_LENGTH",
"BITVAR",
"BLOB",
"BOM",
"BREADTH",
"BY",
"C",
"CACHE",
"CALL",
"CALLED",
"CARDINALITY",
"CASCADE",
"CASCADED",
"CATALOG",
"CATALOG_NAME",
"CEIL",
"CEILING",
"CHAIN",
"CHAR_LENGTH",
"CHARACTER_LENGTH",
"CHARACTER_SET_CATALOG",
"CHARACTER_SET_NAME",
"CHARACTER_SET_SCHEMA",
"CHARACTERISTICS",
"CHARACTERS",
"CHECKED",
"CHECKPOINT",
"CLASS",
"CLASS_ORIGIN",
"CLOB",
"CLOSE",
"CLUSTER",
"COBOL",
"COLLATION",
"COLLATION_CATALOG",
"COLLATION_NAME",
"COLLATION_SCHEMA",
"COLLECT",
"COLUMN_NAME",
"COLUMNS",
"COMMAND_FUNCTION",
"COMMAND_FUNCTION_CODE",
"COMMENT",
"COMMIT",
"COMMITTED",
"COMPLETION",
"CONCURRENTLY",
"CONDITION",
"CONDITION_NUMBER",
"CONFIGURATION",
"CONNECT",
"CONNECTION",
"CONNECTION_NAME",
"CONSTRAINT_CATALOG",
"CONSTRAINT_NAME",
"CONSTRAINT_SCHEMA",
"CONSTRAINTS",
"CONSTRUCTOR",
"CONTAINS",
"CONTENT",
"CONTINUE",
"CONVERSION",
"CONVERT",
"COPY",
"CORR",
"CORRESPONDING",
"COST",
"COUNT",
"COVAR_POP",
"COVAR_SAMP",
"CREATEDB",
"CREATEROLE",
"CREATEUSER",
"CSV",
"CUBE",
"CUME_DIST",
"CURRENT",
"CURRENT_DEFAULT_TRANSFORM_GROUP",
"CURRENT_PATH",
"CURRENT_TRANSFORM_GROUP_FOR_TYPE",
"CURSOR",
"CURSOR_NAME",
"CYCLE",
"DATA",
"DATABASE",
"DATE",
"DATETIME_INTERVAL_CODE",
"DATETIME_INTERVAL_PRECISION",
"DAY",
"DEALLOCATE",
"DECLARE",
"DEFAULTS",
"DEFERRED",
"DEFINED",
"DEFINER",
"DEGREE",
"DELETE",
"DELIMITER",
"DELIMITERS",
"DENSE_RANK",
"DEPTH",
"DEREF",
"DERIVED",
"DESCRIBE",
"DESCRIPTOR",
"DESTROY",
"DESTRUCTOR",
"DETERMINISTIC",
"DIAGNOSTICS",
"DICTIONARY",
"DISABLE",
"DISCARD",
"DISCONNECT",
"DISPATCH",
"DOCUMENT",
"DOMAIN",
"DOUBLE",
"DROP",
"DYNAMIC",
"DYNAMIC_FUNCTION",
"DYNAMIC_FUNCTION_CODE",
"EACH",
"ELEMENT",
"EMPTY",
"ENABLE",
"ENCODING",
"ENCRYPTED",
"END-EXEC",
"ENUM",
"EQUALS",
"ESCAPE",
"EVERY",
"EXCEPTION",
"EXCLUDE",
"EXCLUDING",
"EXCLUSIVE",
"EXEC",
"EXECUTE",
"EXISTING",
"EXP",
"EXPLAIN",
"EXTERNAL",
"FAMILY",
"FILTER",
"FINAL",
"FIRST",
"FIRST_VALUE",
"FLAG",
"FLOOR",
"FOLLOWING",
"FORCE",
"FORTRAN",
"FORWARD",
"FOUND",
"FREE",
"FUNCTION",
"FUSION",
"G",
"GENERAL",
"GENERATED",
"GET",
"GLOBAL",
"GO",
"GOTO",
"GRANTED",
"GROUPING",
"HANDLER",
"HEADER",
"HEX",
"HIERARCHY",
"HOLD",
"HOST",
"HOUR",
# "ID",
"IDENTITY",
"IF",
"IGNORE",
"IMMEDIATE",
"IMMUTABLE",
"IMPLEMENTATION",
"IMPLICIT",
"INCLUDING",
"INCREMENT",
"INDENT",
"INDEX",
"INDEXES",
"INDICATOR",
"INFIX",
"INHERIT",
"INHERITS",
"INITIALIZE",
"INPUT",
"INSENSITIVE",
"INSERT",
"INSTANCE",
"INSTANTIABLE",
"INSTEAD",
"INTERSECTION",
"INVOKER",
"ISOLATION",
"ITERATE",
"K",
"KEY",
"KEY_MEMBER",
"KEY_TYPE",
"LAG",
"LANCOMPILER",
"LANGUAGE",
"LARGE",
"LAST",
"LAST_VALUE",
"LATERAL",
"LC_COLLATE",
"LC_CTYPE",
"LEAD",
"LENGTH",
"LESS",
"LEVEL",
"LIKE_REGEX",
"LISTEN",
"LN",
"LOAD",
"LOCAL",
"LOCATION",
"LOCATOR",
"LOCK",
"LOGIN",
"LOWER",
"M",
"MAP",
"MAPPING",
"MATCH",
"MATCHED",
"MAX",
"MAX_CARDINALITY",
"MAXVALUE",
"MEMBER",
"MERGE",
"MESSAGE_LENGTH",
"MESSAGE_OCTET_LENGTH",
"MESSAGE_TEXT",
"METHOD",
"MIN",
"MINUTE",
"MINVALUE",
"MOD",
"MODE",
"MODIFIES",
"MODIFY",
"MODULE",
"MONTH",
"MORE",
"MOVE",
"MULTISET",
"MUMPS",
# "NAME",
"NAMES",
"NAMESPACE",
"NCLOB",
"NESTING",
"NEXT",
"NFC",
"NFD",
"NFKC",
"NFKD",
"NIL",
"NO",
"NOCREATEDB",
"NOCREATEROLE",
"NOCREATEUSER",
"NOINHERIT",
"NOLOGIN",
"NORMALIZE",
"NORMALIZED",
"NOSUPERUSER",
"NOTHING",
"NOTIFY",
"NOWAIT",
"NTH_VALUE",
"NTILE",
"NULLABLE",
"NULLS",
"NUMBER",
"OBJECT",
"OCCURRENCES_REGEX",
"OCTET_LENGTH",
"OCTETS",
"OF",
"OIDS",
"OPEN",
"OPERATION",
"OPERATOR",
"OPTION",
"OPTIONS",
"ORDERING",
"ORDINALITY",
"OTHERS",
"OUTPUT",
"OVER",
"OVERRIDING",
"OWNED",
"OWNER",
"P",
"PAD",
"PARAMETER",
"PARAMETER_MODE",
"PARAMETER_NAME",
"PARAMETER_ORDINAL_POSITION",
"PARAMETER_SPECIFIC_CATALOG",
"PARAMETER_SPECIFIC_NAME",
"PARAMETER_SPECIFIC_SCHEMA",
"PARAMETERS",
"PARSER",
"PARTIAL",
"PARTITION",
"PASCAL",
"PASSING",
# "PASSWORD",
"PATH",
"PERCENT_RANK",
"PERCENTILE_CONT",
"PERCENTILE_DISC",
"PLANS",
"PLI",
"POSITION_REGEX",
"POSTFIX",
"POWER",
"PRECEDING",
"PREFIX",
"PREORDER",
"PREPARE",
"PREPARED",
"PRESERVE",
"PRIOR",
"PRIVILEGES",
"PROCEDURAL",
"PROCEDURE",
"PUBLIC",
"QUOTE",
"RANGE",
"RANK",
"READ",
"READS",
"REASSIGN",
"RECHECK",
"RECURSIVE",
"REF",
"REFERENCING",
"REGR_AVGX",
"REGR_AVGY",
"REGR_COUNT",
"REGR_INTERCEPT",
"REGR_R2",
"REGR_SLOPE",
"REGR_SXX",
"REGR_SXY",
"REGR_SYY",
"REINDEX",
"RELATIVE",
"RELEASE",
"RENAME",
"REPEATABLE",
"REPLACE",
"REPLICA",
"RESET",
"RESPECT",
"RESTART",
"RESTRICT",
"RESULT",
"RETURN",
"RETURNED_CARDINALITY",
"RETURNED_LENGTH",
"RETURNED_OCTET_LENGTH",
"RETURNED_SQLSTATE",
"RETURNS",
"REVOKE",
"ROLE",
"ROLLBACK",
"ROLLUP",
"ROUTINE",
"ROUTINE_CATALOG",
"ROUTINE_NAME",
"ROUTINE_SCHEMA",
"ROW_COUNT",
"ROW_NUMBER",
"ROWS",
"RULE",
"SAVEPOINT",
"SCALE",
"SCHEMA",
"SCHEMA_NAME",
"SCOPE",
"SCOPE_CATALOG",
"SCOPE_NAME",
"SCOPE_SCHEMA",
"SCROLL",
"SEARCH",
"SECOND",
"SECTION",
"SECURITY",
"SELF",
"SENSITIVE",
"SEQUENCE",
"SERIALIZABLE",
"SERVER",
"SERVER_NAME",
"SESSION",
"SET",
"SETS",
"SHARE",
"SHOW",
"SIMPLE",
"SIZE",
"SOURCE",
"SPACE",
"SPECIFIC",
"SPECIFIC_NAME",
"SPECIFICTYPE",
"SQL",
"SQLCODE",
"SQLERROR",
"SQLEXCEPTION",
"SQLSTATE",
"SQLWARNING",
"SQRT",
"STABLE",
"STANDALONE",
"START",
"STATE",
"STATEMENT",
"STATIC",
"STATISTICS",
"STDDEV_POP",
"STDDEV_SAMP",
"STDIN",
"STDOUT",
"STORAGE",
"STRICT",
"STRIP",
"STRUCTURE",
"STYLE",
"SUBCLASS_ORIGIN",
"SUBLIST",
"SUBMULTISET",
"SUBSTRING_REGEX",
"SUM",
"SUPERUSER",
"SYSID",
"SYSTEM",
"SYSTEM_USER",
"T",
"TABLE_NAME",
"TABLESAMPLE",
"TABLESPACE",
"TEMP",
"TEMPLATE",
"TEMPORARY",
"TERMINATE",
"TEXT",
"THAN",
"TIES",
"TIMEZONE_HOUR",
"TIMEZONE_MINUTE",
"TOP_LEVEL_COUNT",
"TRANSACTION",
"TRANSACTION_ACTIVE",
"TRANSACTIONS_COMMITTED",
"TRANSACTIONS_ROLLED_BACK",
"TRANSFORM",
"TRANSFORMS",
"TRANSLATE",
"TRANSLATE_REGEX",
"TRANSLATION",
"TRIGGER",
"TRIGGER_CATALOG",
"TRIGGER_NAME",
"TRIGGER_SCHEMA",
"TRIM_ARRAY",
"TRUNCATE",
"TRUSTED",
"TYPE",
"UESCAPE",
"UNBOUNDED",
"UNCOMMITTED",
"UNDER",
"UNENCRYPTED",
"UNKNOWN",
"UNLISTEN",
"UNNAMED",
"UNNEST",
"UNTIL",
"UNTYPED",
"UPDATE",
"UPPER",
"URI",
"USAGE",
"USER_DEFINED_TYPE_CATALOG",
"USER_DEFINED_TYPE_CODE",
"USER_DEFINED_TYPE_NAME",
"USER_DEFINED_TYPE_SCHEMA",
"VACUUM",
"VALID",
"VALIDATOR",
"VALUE",
"VAR_POP",
"VAR_SAMP",
"VARBINARY",
"VARIABLE",
"VARYING",
"VERSION",
"VIEW",
"VOLATILE",
"WHENEVER",
"WHITESPACE",
"WIDTH_BUCKET",
"WINDOW",
"WITHIN",
"WITHOUT",
"WORK",
"WRAPPER",
"WRITE",
"XML",
"XMLAGG",
"XMLBINARY",
"XMLCAST",
"XMLCOMMENT",
"XMLDECLARATION",
"XMLDOCUMENT",
"XMLEXISTS",
"XMLITERATE",
"XMLNAMESPACES",
"XMLQUERY",
"XMLSCHEMA",
"XMLTABLE",
"XMLTEXT",
"XMLVALIDATE",
"YEAR",
"YES",
"ZONE",
))
combined = union(POSTGRESQL, POSTGRESQL_NONRESERVED)
POSTGRESQL
end
function field_type(df)
"""Get the names and field types from the dataframe"""
nr,nc = size(df)
fields = names(df)
fl = [(string(fields[i]), string(eltype(df[i][1:nr]))) for i in 1:nc]
end
function jt_to_sql(t)
"""Convert the Julia-datatypes to Postgresql"""
md = Dict(
"UTF8String"=>"text",
"Int64"=>"bigint",
"Int8"=>"smallint",
"UInt8"=>"smallint",
"Int16"=>"integer",
"UInt16"=>"integer",
"Int32"=>"integer",
"UInt32"=>"integer",
"UInt64"=>"bigint",
"Int128"=>"integer",
"UInt128"=>"integer",
"Float64"=>"numeric",
"Char"=>"char(1)",
"Bool"=>"boolean")
if haskey(md,t)
md[t]
else
"text"
end
end
function correct_fieldvalue(v)
""" replace xml-values with utf-8
replace ' with ''
"""
v = replace(v, r"'", "''")
v = replace(v, r">", ">")
v = replace(v, r"<", "<")
end
function row_of_strings(r)
"""Prepare a row of strings to print from one row"""
nrows,ncols = size(r)
s = fill("",ncols)
for col in 1:ncols
v = r[nrows,col]
if ismissing(v)
v = "\\N"
else
v = string(v)
end
s[col] = correct_fieldvalue(v)
end
join(s,"\t")
end
function correct_fieldname(f,pg_keywords)
""" correct fieldnames that starts with either
a numerical or a underscore character
and add a 'n_' in front of the name
"""
syfers = r"^[0-9_]"
if uppercase(f) in pg_keywords
f = "n_$f"
elseif ismatch(syfers,f)
f = "n_$f"
else
f
end
f
end
function create_table_def(df,t)
nrows,ncols = size(df) #rows and columns
fls = field_type(df) # get the fieldnames and types
pg_keywords = reservedwords()
# Create a list of fieldnames
fns = [correct_fieldname(x[1],pg_keywords) for x in fls]
s = "CREATE TABLE $t ("
for c in 1:length(fls)
(i,f) = fls[c] #i = fieldname f = type
if i == "id"
st = "serial"
else
st = jt_to_sql(f) # postgresql-type
end
i = correct_fieldname(i,pg_keywords)
s = s * "$i $st,\n"
end
ss = s[1:length(s)-2] * ");\n\n" # remove last ',' and '\n' and add ');'
# The following header was taken from a file created by pg_dump
header = """SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SET check_function_bodies = false;
SET client_min_messages = warning;
SET row_security = off;
SET search_path = public, pg_catalog;
SET default_tablespace = '';
SET default_with_oids = false;\n\n"""
print(header)
print(ss)
# Create a line like this:
# COPY aa (article, sa_author, id, last_name, first_name, aa) FROM stdin;
cs = "COPY $t ("
z = join(fns, ", ")
css = cs * z * ") FROM stdin;\n"
print(css)
format_rows(df)
end
function format_rows(df)
"""Create rows of tab-delimited data from the values in the dataframe"""
nrows,ncols = size(df)
for row in 1:nrows
s = row_of_strings(df[row,1:ncols])
println(s)
end
println("\\.")
end
function addcolumn(df, c, v )
# Add a column with the name c and value v(in all the rows) to df
df[c] = v
df
end
function add_id(df)
# if there is a field "id" do nothing else add it
nrows, = size(df)
(:id in names(df)) || addcolumn(df, :id, 1:nrows)
end
if length(ARGS) < 2
println
println("-----------------------------------------------------------------------------")
println("Usage: julia csv_to_sql <csvfilename> <tablename for sql [>
outputfile.sql] ")
println("If you do not redirect it the output will be to stdout")
println("-----------------------------------------------------------------------------")
println
exit(1)
end
#function __init__()
input_file = ARGS[1]
table = ARGS[2]
df = CSV.read(input_file)
df = add_id(df) # Add id-field if it is not present in dataframe
create_table_def(df,table)
#end