Maciej Bryński created SPARK-9970:
-------------------------------------

             Summary: SQLContext.createDataFrame failed to properly determine column names
                 Key: SPARK-9970
                 URL: https://issues.apache.org/jira/browse/SPARK-9970
             Project: Spark
          Issue Type: Bug
          Components: PySpark
    Affects Versions: 1.4.0
            Reporter: Maciej Bryński
            Priority: Minor
Hi,
I'm trying to do a "nested join" of tables. After the first join everything is OK, but the second join loses some of the nested column names.

My code is the following:

import sys

def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType="left_outer"):
    # namestr is a user-defined helper for printing variable names (not shown)
    print "Joining tables %s %s" % (namestr(tableLeft), namestr(tableRight))
    sys.stdout.flush()
    # group the right-hand table by the join column; each element becomes a
    # (key, iterable-of-Rows) pair, which createDataFrame infers as columns _1 and _2
    tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight]))
    tmpTable = tmpTable.select(tmpTable._1.alias("joinColumn"), tmpTable._2.data.alias(columnNested))
    return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable["joinColumn"], joinType).drop("joinColumn")

user = sqlContext.read.json(path + "user.json")
user.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

order = sqlContext.read.json(path + "order.json")
order.printSchema()

root
 |-- id: long (nullable = true)
 |-- price: double (nullable = true)
 |-- userid: long (nullable = true)

lines = sqlContext.read.json(path + "lines.json")
lines.printSchema()

root
 |-- id: long (nullable = true)
 |-- orderid: long (nullable = true)
 |-- product: string (nullable = true)

orders = joinTable(order, lines, "id", "orderid", "lines")
orders.printSchema()

root
 |-- id: long (nullable = true)
 |-- price: double (nullable = true)
 |-- userid: long (nullable = true)
 |-- lines: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- orderid: long (nullable = true)
 |    |    |-- product: string (nullable = true)

clients = joinTable(user, orders, "id", "userid", "orders")
clients.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- orders: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- price: double (nullable = true)
 |    |    |-- userid: long (nullable = true)
 |    |    |-- lines: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- _1: long (nullable = true)
 |    |    |    |    |-- _2: long (nullable = true)
 |    |    |    |    |-- _3: string (nullable = true)

Note that after the second join the inner "lines" struct has lost its field names: id, orderid and product have become _1, _2 and _3.

I checked whether groupBy is the root of the problem, but its output looks correct:

grouped = orders.rdd.groupBy(lambda r: r.userid)
print grouped.map(lambda x: list(x[1])).collect()

[[Row(id=1, price=202.3, userid=1, lines=[Row(id=1, orderid=1, product=u'XXX'), Row(id=2, orderid=1, product=u'YYY')]), Row(id=2, price=343.99, userid=1, lines=[Row(id=3, orderid=2, product=u'XXX')])], [Row(id=3, price=399.99, userid=2, lines=[Row(id=4, orderid=3, product=u'XXX')])]]
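For what it's worth, a workaround that sidesteps the broken inference is to pass an explicit schema to createDataFrame rather than letting it guess the nested types. Below is a minimal sketch for the second join, assuming the field names and types from the JSON files above; lineType, orderType, groupedSchema and grouped are names invented here for illustration:

from pyspark.sql.types import (ArrayType, DoubleType, LongType,
                               StringType, StructField, StructType)

# Explicit nested schema for the grouped orders; field names and types
# mirror order.json and lines.json above.
lineType = StructType([
    StructField("id", LongType(), True),
    StructField("orderid", LongType(), True),
    StructField("product", StringType(), True)])
orderType = StructType([
    StructField("id", LongType(), True),
    StructField("price", DoubleType(), True),
    StructField("userid", LongType(), True),
    StructField("lines", ArrayType(lineType, True), True)])
groupedSchema = StructType([
    StructField("joinColumn", LongType(), True),
    StructField("orders", ArrayType(orderType, True), True)])

# groupBy yields (key, ResultIterable) pairs; materialize the iterable as a
# list so it matches the ArrayType, then bypass schema inference entirely.
grouped = orders.rdd.groupBy(lambda r: r.userid) \
                    .map(lambda kv: (kv[0], list(kv[1])))
tmpTable = sqlCtx.createDataFrame(grouped, groupedSchema)
tmpTable.printSchema()

Since the schema is supplied up front, the nested struct fields keep their names instead of degrading to _1, _2, _3; this is only a workaround, though, and the inference itself still looks buggy.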