public void generateParquet(JavaSparkContext sc, String jsonFilePath, String parquetPath) { //int MB_128 = 128*1024*1024; //sc.hadoopConfiguration().setInt("dfs.blocksize", MB_128); //sc.hadoopConfiguration().setInt("parquet.block.size", MB_128); JavaSQLContext sqlCtx = new JavaSQLContext(sc); JavaRDD<Claim> claimRdd = sc.textFile(jsonFilePath).map(new StringToClaimMapper()).filter(new NullFilter()); JavaSchemaRDD claimSchemaRdd = sqlCtx.applySchema(claimRdd, Claim.class); claimSchemaRdd.coalesce(1, true); //tried with false also. Tried repartition(1) too. claimSchemaRdd.saveAsParquetFile(parquetPath); }
-- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/Control-number-of-parquet-generated-from-JavaSchemaRDD-tp19717p19776.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org