[ https://issues.apache.org/jira/browse/SPARK-16720?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15573367#comment-15573367 ]
holdenk commented on SPARK-16720: --------------------------------- Sounds good - go ahead and close this :) > Loading CSV file with 2k+ columns fails during attribute resolution on action > ----------------------------------------------------------------------------- > > Key: SPARK-16720 > URL: https://issues.apache.org/jira/browse/SPARK-16720 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 2.0.0 > Reporter: holdenk > > Example shell for repro: > {quote} > scala> val df =spark.read.format("csv").option("header", > "true").option("inferSchema", "true").load("/home/holden/Downloads/ex*.csv") > df: org.apache.spark.sql.DataFrame = [Date: string, Lifetime Total Likes: int > ... 2125 more fields] > scala> df.schema > res0: org.apache.spark.sql.types.StructType = > StructType(StructField(Date,StringType,true), StructField(Lifetime Total > Likes,IntegerType,true), StructField(Daily New Likes,IntegerType,true), > StructField(Daily Unlikes,IntegerType,true), StructField(Daily Page Engaged > Users,IntegerType,true), StructField(Weekly Page Engaged > Users,IntegerType,true), StructField(28 Days Page Engaged > Users,IntegerType,true), StructField(Daily Like Sources - On Your > Page,IntegerType,true), StructField(Daily Total Reach,IntegerType,true), > StructField(Weekly Total Reach,IntegerType,true), StructField(28 Days Total > Reach,IntegerType,true), StructField(Daily Organic Reach,IntegerType,true), > StructField(Weekly Organic Reach,IntegerType,true), StructField(28 Days > Organic Reach,IntegerType,true), StructField(Daily T... 
> scala> df.take(1) > [GIANT LIST OF COLUMNS] > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134) > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134) > at scala.Option.getOrElse(Option.scala:121) > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:133) > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:129) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) > at scala.collection.Iterator$class.foreach(Iterator.scala:893) > at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) > at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) > at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95) > at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) > at org.apache.spark.sql.types.StructType.map(StructType.scala:95) > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:129) > at > org.apache.spark.sql.execution.datasources.FileSourceStrategy$.apply(FileSourceStrategy.scala:87) > at > org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:60) > at > org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:60) > at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434) > at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) > at > org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:61) > at org.apache.spark.sql.execution.SparkPlanner.plan(SparkPlanner.scala:47) > at > org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:51) > at > 
org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:48) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:300) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:321) > at > org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:179) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:319) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:298) > at > org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48) > at > org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48) > at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) > at > org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:78) > at > org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:76) > at > org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:83) > at > org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:83) > at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2558) > at org.apache.spark.sql.Dataset.head(Dataset.scala:1924) > at org.apache.spark.sql.Dataset.take(Dataset.scala:2139) > ... 
48 elided > {quote} > Interestingly enough, attempting to access a row by index also fails in the column > resolution phase, as does converting to an RDD. > Loading without the header option enabled succeeds. > CSV file for repro (on Dropbox): > https://www.dropbox.com/s/f8453txcej43mz4/example_facebook.csv?dl=0 -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org