We are converting some raw CMS data from csv to parquet using Drill, and
using partitioning as we go.

Query 1 runs OK on a narrower file:

size:
13G Mar 16 18:20 out_revenuej_lds_100_201412.csv (Month 12 file)
13G Mar 16 16:37 out_claimsj_lds_100_2014_q1.csv (Quarter 1 file)

q1 has 198 columns;
month 12 has 32 columns.

Both are partitioned on the same unique ID resulting in 14 buckets.
Resulting parquet for month 12:  2.6G parquet/outpatient_revenue_12

It fails on out_claimsj_lds_100_2014_q1.csv with 198 columns.

Error:
--------

*Error: SYSTEM ERROR: IllegalStateException: Memory was leaked by query.
Memory leaked: (8530784)*
*Allocator(op:0:0:6:ExternalSort) 20000000/8530784/357974944/357913941
(res/actual/peak/limit)*


*Fragment 0:0*

*[Error Id: 0606ff19-1c3a-4611-a3d4-1d28d9b3bd60 on cyclone-backend:31010]
(state=,code=0)*


Environment
-----------------
(upped memory to MAX RAM - 4 GB to try to make it pass)


drill-env.sh
DRILL_MAX_DIRECT_MEMORY="60G"
DRILL_HEAP="48G"

export DRILL_JAVA_OPTS="-Xms$DRILL_HEAP -Xmx$DRILL_HEAP
-XX:MaxDirectMemorySize=$DRILL
_MAX_DIRECT_MEMORY -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=1G
-Ddrill.exec.enab
le-epoll=true"

# Class unloading is disabled by default in Java 7
#
http://hg.openjdk.java.net/jdk7u/jdk7u60/hotspot/file/tip/src/share/vm/runtime/globa
ls.hpp#l1622
export SERVER_GC_OPTS="-XX:+CMSClassUnloadingEnabled -XX:+UseG1GC "

Query 1
-----------

-- Success

--12
CREATE TABLE outpatient_revenue_12
PARTITION BY (UID_CLASS) AS SELECT CAST( SUBSTR(columns[0],1,2) as INT) as
 UID_CLASS, CASE WHEN columns[0] = '' THEN NULL ELSE CAST(columns[0] as
DOUBLE) END  as DSYSRTKY, CASE WHEN columns[1] = '' THEN NULL ELSE
CAST(columns[1] as DOUBLE) END  as CLAIMNO, CASE WHEN columns[2] = '' THEN
NULL ELSE CAST(columns[2] as DOUBLE) END  as CLM_LN, CASE WHEN columns[3] =
'' THEN NULL ELSE TO_DATE(columns[3], 'yyyyMMdd') END  as THRU_DT,
NULLIF(columns[4],'') as CLM_TYPE, NULLIF(columns[5],'') as REV_CNTR, CASE
WHEN columns[6] = '' THEN NULL ELSE TO_DATE(columns[6], 'yyyyMMdd') END  as
REV_DT, NULLIF(columns[7],'') as APCHIPPS, NULLIF(columns[8],'') as
HCPCS_CD, NULLIF(columns[9],'') as MDFR_CD1, NULLIF(columns[10],'') as
MDFR_CD2, NULLIF(columns[11],'') as PMTMTHD, NULLIF(columns[12],'') as
DSCNTIND, NULLIF(columns[13],'') as PACKGIND, NULLIF(columns[14],'') as
OTAF_1, NULLIF(columns[15],'') as IDENDC, CASE WHEN columns[16] = '' THEN
NULL ELSE CAST(columns[16] as DOUBLE) END  as REV_UNIT, CASE WHEN
columns[17] = '' THEN NULL ELSE CAST(columns[17] as DOUBLE) END  as
REV_RATE, CASE WHEN columns[18] = '' THEN NULL ELSE CAST(columns[18] as
DOUBLE) END  as REVBLOOD, CASE WHEN columns[19] = '' THEN NULL ELSE
CAST(columns[19] as DOUBLE) END  as REVDCTBL, CASE WHEN columns[20] = ''
THEN NULL ELSE CAST(columns[20] as DOUBLE) END  as WAGEADJ, CASE WHEN
columns[21] = '' THEN NULL ELSE CAST(columns[21] as DOUBLE) END  as
RDCDCOIN, CASE WHEN columns[22] = '' THEN NULL ELSE CAST(columns[22] as
DOUBLE) END  as REV_MSP1, CASE WHEN columns[23] = '' THEN NULL ELSE
CAST(columns[23] as DOUBLE) END  as REV_MSP2, CASE WHEN columns[24] = ''
THEN NULL ELSE CAST(columns[24] as DOUBLE) END  as RPRVDPMT, CASE WHEN
columns[25] = '' THEN NULL ELSE CAST(columns[25] as DOUBLE) END  as
RBENEPMT, CASE WHEN columns[26] = '' THEN NULL ELSE CAST(columns[26] as
DOUBLE) END  as PTNTRESP, CASE WHEN columns[27] = '' THEN NULL ELSE
CAST(columns[27] as DOUBLE) END  as REVPMT, CASE WHEN columns[28] = '' THEN
NULL ELSE CAST(columns[28] as DOUBLE) END  as REV_CHRG, CASE WHEN
columns[29] = '' THEN NULL ELSE CAST(columns[29] as DOUBLE) END  as
REV_NCVR, NULLIF(columns[30],'') as REVSTIND, NULLIF(columns[31],'') as
REV_CNTR_PRICNG_IND_CD
FROM
dfs.`default`.`/data/cms/2014_outpatient/out_revenuej_lds_100_201412.csv`;


Query 2
-----------

-- Failed

-- Q1
CREATE TABLE base_outpatient_q1
PARTITION BY (UID_CLASS)
AS
SELECT CAST( SUBSTR(columns[0],1,2) as INT) as  UID_CLASS, CASE WHEN
columns[0] = '' THEN NULL ELSE CAST(columns[0] as DOUBLE) END  as
`DSYSRTKY`, CASE WHEN columns[1] = '' THEN NULL ELSE CAST(columns[1] as
DOUBLE) END  as `CLAIMNO`, NULLIF(columns[2],'') as `PROVIDER`, CASE WHEN
columns[3] = '' THEN NULL ELSE TO_DATE(columns[3], 'yyyyMMdd') END  as
`THRU_DT`, NULLIF(columns[4],'') as `RIC_CD`, NULLIF(columns[5],'') as
`CLM_TYPE`, NULLIF(columns[6],'') as `QUERY_CD`, NULLIF(columns[7],'') as
`FAC_TYPE`, NULLIF(columns[8],'') as `TYPESRVC`, NULLIF(columns[9],'') as
`FREQ_CD`, NULLIF(columns[10],'') as `FI_NUM`, NULLIF(columns[11],'') as
`NOPAY_CD`, CASE WHEN columns[12] = '' THEN NULL ELSE CAST(columns[12] as
DOUBLE) END  as `PMT_AMT`, CASE WHEN columns[13] = '' THEN NULL ELSE
CAST(columns[13] as DOUBLE) END  as `PRPAYAMT`, NULLIF(columns[14],'') as
`PRPAY_CD`, NULLIF(columns[15],'') as `PRSTATE`, NULLIF(columns[16],'') as
`ORGNPINM`, NULLIF(columns[17],'') as `AT_UPIN`, NULLIF(columns[18],'') as
`AT_NPI`, NULLIF(columns[19],'') as `OP_UPIN`, NULLIF(columns[20],'') as
`OP_NPI`, NULLIF(columns[21],'') as `OT_UPIN`, NULLIF(columns[22],'') as
`OT_NPI`, NULLIF(columns[23],'') as `MCOPDSW`, NULLIF(columns[24],'') as
`STUS_CD`, CASE WHEN columns[25] = '' THEN NULL ELSE CAST(columns[25] as
DOUBLE) END  as `TOT_CHRG`, CASE WHEN columns[26] = '' THEN NULL ELSE
CAST(columns[26] as DOUBLE) END  as `BLDDEDAM`, CASE WHEN columns[27] = ''
THEN NULL ELSE CAST(columns[27] as DOUBLE) END  as `PCCHGAMT`,
NULLIF(columns[28],'') as `PRNCPAL_DGNS_CD`, NULLIF(columns[29],'') as
`PRNCPAL_DGNS_VRSN_CD`, NULLIF(columns[30],'') as `ICD DGNS CD1`,
NULLIF(columns[31],'') as `ICD DGNS VRSN CD1`, NULLIF(columns[32],'') as
`ICD DGNS CD2`, NULLIF(columns[33],'') as `ICD DGNS VRSN CD2`,
NULLIF(columns[34],'') as `ICD DGNS CD3`, NULLIF(columns[35],'') as `ICD
DGNS VRSN CD3`, NULLIF(columns[36],'') as `ICD DGNS CD4`,
NULLIF(columns[37],'') as `ICD DGNS VRSN CD4`, NULLIF(columns[38],'') as
`ICD DGNS CD5`, NULLIF(columns[39],'') as `ICD DGNS VRSN CD5`,
NULLIF(columns[40],'') as `ICD DGNS CD6`, NULLIF(columns[41],'') as `ICD
DGNS VRSN CD6`, NULLIF(columns[42],'') as `ICD DGNS CD7`,
NULLIF(columns[43],'') as `ICD DGNS VRSN CD7`, NULLIF(columns[44],'') as
`ICD DGNS CD8`, NULLIF(columns[45],'') as `ICD DGNS VRSN CD8`,
NULLIF(columns[46],'') as `ICD DGNS CD9`, NULLIF(columns[47],'') as `ICD
DGNS VRSN CD9`, NULLIF(columns[48],'') as `ICD DGNS CD10`,
NULLIF(columns[49],'') as `ICD DGNS VRSN CD10`, NULLIF(columns[50],'') as
`ICD DGNS CD11`, NULLIF(columns[51],'') as `ICD DGNS VRSN CD11`,
NULLIF(columns[52],'') as `ICD DGNS CD12`, NULLIF(columns[53],'') as `ICD
DGNS VRSN CD12`, NULLIF(columns[54],'') as `ICD DGNS CD13`,
NULLIF(columns[55],'') as `ICD DGNS VRSN CD13`, NULLIF(columns[56],'') as
`ICD DGNS CD14`, NULLIF(columns[57],'') as `ICD DGNS VRSN CD14`,
NULLIF(columns[58],'') as `ICD DGNS CD15`, NULLIF(columns[59],'') as `ICD
DGNS VRSN CD15`, NULLIF(columns[60],'') as `ICD DGNS CD16`,
NULLIF(columns[61],'') as `ICD DGNS VRSN CD16`, NULLIF(columns[62],'') as
`ICD DGNS CD17`, NULLIF(columns[63],'') as `ICD DGNS VRSN CD17`,
NULLIF(columns[64],'') as `ICD DGNS CD18`, NULLIF(columns[65],'') as `ICD
DGNS VRSN CD18`, NULLIF(columns[66],'') as `ICD DGNS CD19`,
NULLIF(columns[67],'') as `ICD DGNS VRSN CD19`, NULLIF(columns[68],'') as
`ICD DGNS CD20`, NULLIF(columns[69],'') as `ICD DGNS VRSN CD20`,
NULLIF(columns[70],'') as `ICD DGNS CD21`, NULLIF(columns[71],'') as `ICD
DGNS VRSN CD21`, NULLIF(columns[72],'') as `ICD DGNS CD22`,
NULLIF(columns[73],'') as `ICD DGNS VRSN CD22`, NULLIF(columns[74],'') as
`ICD DGNS CD23`, NULLIF(columns[75],'') as `ICD DGNS VRSN CD23`,
NULLIF(columns[76],'') as `ICD DGNS CD24`, NULLIF(columns[77],'') as `ICD
DGNS VRSN CD24`, NULLIF(columns[78],'') as `ICD DGNS CD25`,
NULLIF(columns[79],'') as `ICD DGNS VRSN CD25`, NULLIF(columns[80],'') as
`FST_DGNS_E_CD`, NULLIF(columns[81],'') as `FST_DGNS_E_VRSN_CD`,
NULLIF(columns[82],'') as `ICD DGNS E CD1`, NULLIF(columns[83],'') as `ICD
DGNS E VRSN CD1`, NULLIF(columns[84],'') as `ICD DGNS E CD2`,
NULLIF(columns[85],'') as `ICD DGNS E VRSN CD2`, NULLIF(columns[86],'') as
`ICD DGNS E CD3`, NULLIF(columns[87],'') as `ICD DGNS E VRSN CD3`,
NULLIF(columns[88],'') as `ICD DGNS E CD4`, NULLIF(columns[89],'') as `ICD
DGNS E VRSN CD4`, NULLIF(columns[90],'') as `ICD DGNS E CD5`,
NULLIF(columns[91],'') as `ICD DGNS E VRSN CD5`, NULLIF(columns[92],'') as
`ICD DGNS E CD6`, NULLIF(columns[93],'') as `ICD DGNS E VRSN CD6`,
NULLIF(columns[94],'') as `ICD DGNS E CD7`, NULLIF(columns[95],'') as `ICD
DGNS E VRSN CD7`, NULLIF(columns[96],'') as `ICD DGNS E CD8`,
NULLIF(columns[97],'') as `ICD DGNS E VRSN CD8`, NULLIF(columns[98],'') as
`ICD DGNS E CD9`, NULLIF(columns[99],'') as `ICD DGNS E VRSN CD9`,
NULLIF(columns[100],'') as `ICD DGNS E CD10`, NULLIF(columns[101],'') as
`ICD DGNS E VRSN CD10`, NULLIF(columns[102],'') as `ICD DGNS E CD11`,
NULLIF(columns[103],'') as `ICD DGNS E VRSN CD11`, NULLIF(columns[104],'')
as `ICD DGNS E CD12`, NULLIF(columns[105],'') as `ICD DGNS E VRSN CD12`,
NULLIF(columns[106],'') as `ICD_PRCDR_CD1`, NULLIF(columns[107],'') as
`ICD_PRCDR_VRSN_CD1`, CASE WHEN columns[108] = '' THEN NULL ELSE
TO_DATE(columns[108], 'yyyyMMdd') END  as `PRCDR_DT1 `,
NULLIF(columns[109],'') as `ICD_PRCDR_CD2`, NULLIF(columns[110],'') as
`ICD_PRCDR_VRSN_CD2 `, CASE WHEN columns[111] = '' THEN NULL ELSE
TO_DATE(columns[111], 'yyyyMMdd') END  as `PRCDR_DT2`,
NULLIF(columns[112],'') as `ICD_PRCDR_CD3`, NULLIF(columns[113],'') as
`ICD_PRCDR_VRSN_CD3`, CASE WHEN columns[114] = '' THEN NULL ELSE
TO_DATE(columns[114], 'yyyyMMdd') END  as `PRCDR_DT3`,
NULLIF(columns[115],'') as `ICD_PRCDR_CD4`, NULLIF(columns[116],'') as
`ICD_PRCDR_VRSN_CD4`, CASE WHEN columns[117] = '' THEN NULL ELSE
TO_DATE(columns[117], 'yyyyMMdd') END  as `PRCDR_DT4`,
NULLIF(columns[118],'') as `ICD_PRCDR_CD5`, NULLIF(columns[119],'') as
`ICD_PRCDR_VRSN_CD5 `, CASE WHEN columns[120] = '' THEN NULL ELSE
TO_DATE(columns[120], 'yyyyMMdd') END  as `PRCDR_DT5`,
NULLIF(columns[121],'') as `ICD_PRCDR_CD6`, NULLIF(columns[122],'') as
`ICD_PRCDR_VRSN_CD6 `, CASE WHEN columns[123] = '' THEN NULL ELSE
TO_DATE(columns[123], 'yyyyMMdd') END  as `PRCDR_DT6`,
NULLIF(columns[124],'') as `ICD_PRCDR_CD7`, NULLIF(columns[125],'') as
`ICD_PRCDR_VRSN_CD7 `, CASE WHEN columns[126] = '' THEN NULL ELSE
TO_DATE(columns[126], 'yyyyMMdd') END  as `PRCDR_DT7`,
NULLIF(columns[127],'') as `ICD_PRCDR_CD8`, NULLIF(columns[128],'') as
`ICD_PRCDR_VRSN_CD8 `, CASE WHEN columns[129] = '' THEN NULL ELSE
TO_DATE(columns[129], 'yyyyMMdd') END  as `PRCDR_DT8`,
NULLIF(columns[130],'') as `ICD_PRCDR_CD9`, NULLIF(columns[131],'') as
`ICD_PRCDR_VRSN_CD9`, CASE WHEN columns[132] = '' THEN NULL ELSE
TO_DATE(columns[132], 'yyyyMMdd') END  as `PRCDR_DT9`,
NULLIF(columns[133],'') as `ICD_PRCDR_CD10`, NULLIF(columns[134],'') as
`ICD_PRCDR_VRSN_CD10`, CASE WHEN columns[135] = '' THEN NULL ELSE
TO_DATE(columns[135], 'yyyyMMdd') END  as `PRCDR_DT10`,
NULLIF(columns[136],'') as `ICD_PRCDR_CD11`, NULLIF(columns[137],'') as
`ICD_PRCDR_VRSN_CD11 `, CASE WHEN columns[138] = '' THEN NULL ELSE
TO_DATE(columns[138], 'yyyyMMdd') END  as `PRCDR_DT11`,
NULLIF(columns[139],'') as `ICD_PRCDR_CD12`, NULLIF(columns[140],'') as
`ICD_PRCDR_VRSN_CD12`, CASE WHEN columns[141] = '' THEN NULL ELSE
TO_DATE(columns[141], 'yyyyMMdd') END  as `PRCDR_DT12`,
NULLIF(columns[142],'') as `ICD_PRCDR_CD13`, NULLIF(columns[143],'') as
`ICD_PRCDR_VRSN_CD13`, CASE WHEN columns[144] = '' THEN NULL ELSE
TO_DATE(columns[144], 'yyyyMMdd') END  as `PRCDR_DT13`,
NULLIF(columns[145],'') as `ICD_PRCDR_CD14`, NULLIF(columns[146],'') as
`ICD_PRCDR_VRSN_CD14`, CASE WHEN columns[147] = '' THEN NULL ELSE
TO_DATE(columns[147], 'yyyyMMdd') END  as `PRCDR_DT14`,
NULLIF(columns[148],'') as `ICD_PRCDR_CD15`, NULLIF(columns[149],'') as
`ICD_PRCDR_VRSN_CD15 `, CASE WHEN columns[150] = '' THEN NULL ELSE
TO_DATE(columns[150], 'yyyyMMdd') END  as `PRCDR_DT15`,
NULLIF(columns[151],'') as `ICD_PRCDR_CD16`, NULLIF(columns[152],'') as
`ICD_PRCDR_VRSN_CD16`, CASE WHEN columns[153] = '' THEN NULL ELSE
TO_DATE(columns[153], 'yyyyMMdd') END  as `PRCDR_DT16`,
NULLIF(columns[154],'') as `ICD_PRCDR_CD17`, NULLIF(columns[155],'') as
`ICD_PRCDR_VRSN_CD17`, CASE WHEN columns[156] = '' THEN NULL ELSE
TO_DATE(columns[156], 'yyyyMMdd') END  as `PRCDR_DT17`,
NULLIF(columns[157],'') as `ICD_PRCDR_CD18`, NULLIF(columns[158],'') as
`ICD_PRCDR_VRSN_CD18`, CASE WHEN columns[159] = '' THEN NULL ELSE
TO_DATE(columns[159], 'yyyyMMdd') END  as `PRCDR_DT18`,
NULLIF(columns[160],'') as `ICD_PRCDR_CD19`, NULLIF(columns[161],'') as
`ICD_PRCDR_VRSN_CD19 `, CASE WHEN columns[162] = '' THEN NULL ELSE
TO_DATE(columns[162], 'yyyyMMdd') END  as `PRCDR_DT19`,
NULLIF(columns[163],'') as `ICD_PRCDR_CD20`, NULLIF(columns[164],'') as
`ICD_PRCDR_VRSN_CD20`, CASE WHEN columns[165] = '' THEN NULL ELSE
TO_DATE(columns[165], 'yyyyMMdd') END  as `PRCDR_DT20`,
NULLIF(columns[166],'') as `ICD_PRCDR_CD21`, NULLIF(columns[167],'') as
`ICD_PRCDR_VRSN_CD21`, CASE WHEN columns[168] = '' THEN NULL ELSE
TO_DATE(columns[168], 'yyyyMMdd') END  as `PRCDR_DT21`,
NULLIF(columns[169],'') as `ICD_PRCDR_CD22`, NULLIF(columns[170],'') as
`ICD_PRCDR_VRSN_CD22`, CASE WHEN columns[171] = '' THEN NULL ELSE
TO_DATE(columns[171], 'yyyyMMdd') END  as `PRCDR_DT22`,
NULLIF(columns[172],'') as `ICD_PRCDR_CD23`, NULLIF(columns[173],'') as
`ICD_PRCDR_VRSN_CD23`, CASE WHEN columns[174] = '' THEN NULL ELSE
TO_DATE(columns[174], 'yyyyMMdd') END  as `PRCDR_DT23`,
NULLIF(columns[175],'') as `ICD_PRCDR_CD24`, NULLIF(columns[176],'') as
`ICD_PRCDR_VRSN_CD24`, CASE WHEN columns[177] = '' THEN NULL ELSE
TO_DATE(columns[177], 'yyyyMMdd') END  as `PRCDR_DT24`,
NULLIF(columns[178],'') as `ICD_PRCDR_CD25`, NULLIF(columns[179],'') as
`ICD_PRCDR_VRSN_CD25`, CASE WHEN columns[180] = '' THEN NULL ELSE
TO_DATE(columns[180], 'yyyyMMdd') END  as `PRCDR_DT25`,
NULLIF(columns[181],'') as `RSN_VISIT_CD1`, NULLIF(columns[182],'') as
`RSN_VISIT_VRSN_CD1`, NULLIF(columns[183],'') as `RSN_VISIT_CD2`,
NULLIF(columns[184],'') as `RSN_VISIT_VRSN_CD2`, NULLIF(columns[185],'') as
`RSN_VISIT_CD3`, NULLIF(columns[186],'') as `RSN_VISIT_VRSN_CD3`, CASE WHEN
columns[187] = '' THEN NULL ELSE CAST(columns[187] as DOUBLE) END  as
`PTB_DED`, CASE WHEN columns[188] = '' THEN NULL ELSE CAST(columns[188] as
DOUBLE) END  as `PTB_COIN`, CASE WHEN columns[189] = '' THEN NULL ELSE
CAST(columns[189] as DOUBLE) END  as `PRVDRPMT`, CASE WHEN columns[190] =
'' THEN NULL ELSE CAST(columns[190] as DOUBLE) END  as `BENEPMT`, CASE WHEN
columns[191] = '' THEN NULL ELSE CAST(columns[191] as DOUBLE) END  as
`DOB_DT`, NULLIF(columns[192],'') as `GNDR_CD`, NULLIF(columns[193],'') as
`RACE_CD`, NULLIF(columns[194],'') as `CNTY_CD`, NULLIF(columns[195],'') as
`STATE_CD`, NULLIF(columns[196],'') as `CWF_BENE_MDCR_STUS_CD`,
NULLIF(columns[197],'') as `ACTIONCD` FROM
dfs.`default`.`/data/cms/2014_outpatient/out_claimsj_lds_100_2014_q1.csv`;

Thank you,
Edmon

Reply via email to