[jira] [Updated] (SPARK-9970) SQLContext.createDataFrame failed to properly determine column names
[ https://issues.apache.org/jira/browse/SPARK-9970?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Maciej Bryński updated SPARK-9970: -- Description: Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. My code is following: {code:java} def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) {code} And joining code: Please look for the result of 2nd join. There are columns _1,_2,_3. 
Should be 'id', 'orderid', 'product' {code:java} orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable = true) |||-- userid: long (nullable = true) |||-- lines: array (nullable = true) ||||-- element: struct (containsNull = true) |||||-- _1: long (nullable = true) |||||-- _2: long (nullable = true) |||||-- _3: string (nullable = true) {code} I tried to check if groupBy isn't the cause of the problem but it looks right {code:java} grouped = orders.rdd.groupBy(lambda r: r.userid) print grouped.map(lambda x: list(x[1])).collect() [[Row(id=1, price=202.3, userid=1, lines=[Row(id=1, orderid=1, product=u'XXX'), Row(id=2, orderid=1, product=u'YYY')]), Row(id=2, price=343.99, userid=1, lines=[Row(id=3, orderid=2, product=u'XXX')])], [Row(id=3, price=399.99, userid=2, lines=[Row(id=4, orderid=3, product=u'XXX')])]] {code} So I assume that the problem is with createDataFrame. was: Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. 
My code is following: {code:java} def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(path + user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(path + order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(path + lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) {code} And joining code: Please look for the result of 2nd join. There are columns _1,_2,_3. Should be 'id', 'orderid', 'product' {code:java} orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable = true) |||--
[jira] [Updated] (SPARK-9970) SQLContext.createDataFrame failed to properly determine column names
[ https://issues.apache.org/jira/browse/SPARK-9970?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Maciej Bryński updated SPARK-9970: -- Description: Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. My code is following: {code:java} def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(path + user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(path + order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(path + lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) {code} And joining code: {code:java} orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable = true) |||-- userid: long (nullable = true) |||-- lines: array (nullable = true) 
||||-- element: struct (containsNull = true) |||||-- _1: long (nullable = true) |||||-- _2: long (nullable = true) |||||-- _3: string (nullable = true) {code} I tried to check if groupBy isn't the root of the problem but it looks right {code:java} grouped = orders.rdd.groupBy(lambda r: r.userid) print grouped.map(lambda x: list(x[1])).collect() [[Row(id=1, price=202.3, userid=1, lines=[Row(id=1, orderid=1, product=u'XXX'), Row(id=2, orderid=1, product=u'YYY')]), Row(id=2, price=343.99, userid=1, lines=[Row(id=3, orderid=2, product=u'XXX')])], [Row(id=3, price=399.99, userid=2, lines=[Row(id=4, orderid=3, product=u'XXX')])]] {code} So I assume that the problem is with createDataFrame. was: Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. My code is following: def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): print Joining tables %s %s % (namestr(tableLeft), namestr(tableRight)) sys.stdout.flush() tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(path + user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(path + order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(path + lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long 
(nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable = true) |||-- userid: long (nullable = true) |||-- lines: array (nullable = true) ||||-- element: struct (containsNull = true) ||
[jira] [Updated] (SPARK-9970) SQLContext.createDataFrame failed to properly determine column names
[ https://issues.apache.org/jira/browse/SPARK-9970?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Maciej Bryński updated SPARK-9970: -- Description: Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. My code is following: {code:java} def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(path + user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(path + order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(path + lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) {code} And joining code: Please look for the result of 2nd join. There are columns _1,_2,_3. 
Should be 'id', 'orderid', 'product' {code:java} orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable = true) |||-- userid: long (nullable = true) |||-- lines: array (nullable = true) ||||-- element: struct (containsNull = true) |||||-- _1: long (nullable = true) |||||-- _2: long (nullable = true) |||||-- _3: string (nullable = true) {code} I tried to check if groupBy isn't the cause of the problem but it looks right {code:java} grouped = orders.rdd.groupBy(lambda r: r.userid) print grouped.map(lambda x: list(x[1])).collect() [[Row(id=1, price=202.3, userid=1, lines=[Row(id=1, orderid=1, product=u'XXX'), Row(id=2, orderid=1, product=u'YYY')]), Row(id=2, price=343.99, userid=1, lines=[Row(id=3, orderid=2, product=u'XXX')])], [Row(id=3, price=399.99, userid=2, lines=[Row(id=4, orderid=3, product=u'XXX')])]] {code} So I assume that the problem is with createDataFrame. was: Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. 
My code is following: {code:java} def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(path + user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(path + order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(path + lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) {code} And joining code: Please look for the result of 2nd join. There are columns _1,_2,_3. Should be 'id', 'orderid', 'product' {code:java} orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable =
[jira] [Updated] (SPARK-9970) SQLContext.createDataFrame failed to properly determine column names
[ https://issues.apache.org/jira/browse/SPARK-9970?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Maciej Bryński updated SPARK-9970: -- Description: Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. My code is following: {code:java} def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(path + user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(path + order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(path + lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) {code} And joining code: Please look for the result of 2nd join. There are columns _1,_2,_3. 
Should be 'id', 'orderid', 'product' {code:java} orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable = true) |||-- userid: long (nullable = true) |||-- lines: array (nullable = true) ||||-- element: struct (containsNull = true) |||||-- _1: long (nullable = true) |||||-- _2: long (nullable = true) |||||-- _3: string (nullable = true) {code} I tried to check if groupBy isn't the root of the problem but it looks right {code:java} grouped = orders.rdd.groupBy(lambda r: r.userid) print grouped.map(lambda x: list(x[1])).collect() [[Row(id=1, price=202.3, userid=1, lines=[Row(id=1, orderid=1, product=u'XXX'), Row(id=2, orderid=1, product=u'YYY')]), Row(id=2, price=343.99, userid=1, lines=[Row(id=3, orderid=2, product=u'XXX')])], [Row(id=3, price=399.99, userid=2, lines=[Row(id=4, orderid=3, product=u'XXX')])]] {code} So I assume that the problem is with createDataFrame. was: Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. 
My code is following: {code:java} def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(path + user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(path + order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(path + lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) {code} And joining code: {code:java} orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable = true) |||-- userid: long (nullable = true) |||-- lines: array (nullable = true) ||
[jira] [Updated] (SPARK-9970) SQLContext.createDataFrame failed to properly determine column names
[ https://issues.apache.org/jira/browse/SPARK-9970?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Maciej Bryński updated SPARK-9970: -- Priority: Major (was: Minor) SQLContext.createDataFrame failed to properly determine column names Key: SPARK-9970 URL: https://issues.apache.org/jira/browse/SPARK-9970 Project: Spark Issue Type: Bug Components: PySpark Affects Versions: 1.4.0 Reporter: Maciej Bryński Hi, I'm trying to do nested join of tables. After first join everything is ok, but second join made some of the column names lost. My code is following: {code:java} def joinTable(tableLeft, tableRight, columnLeft, columnRight, columnNested, joinType = left_outer): tmpTable = sqlCtx.createDataFrame(tableRight.rdd.groupBy(lambda r: r.asDict()[columnRight])) tmpTable = tmpTable.select(tmpTable._1.alias(joinColumn), tmpTable._2.data.alias(columnNested)) return tableLeft.join(tmpTable, tableLeft[columnLeft] == tmpTable[joinColumn], joinType).drop(joinColumn) user = sqlContext.read.json(path + user.json) user.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) order = sqlContext.read.json(path + order.json) order.printSchema(); root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) lines = sqlContext.read.json(path + lines.json) lines.printSchema(); root |-- id: long (nullable = true) |-- orderid: long (nullable = true) |-- product: string (nullable = true) {code} And joining code: Please look for the result of 2nd join. There are columns _1,_2,_3. 
Should be 'id', 'orderid', 'product' {code:java} orders = joinTable(order, lines, id, orderid, lines) orders.printSchema() root |-- id: long (nullable = true) |-- price: double (nullable = true) |-- userid: long (nullable = true) |-- lines: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- orderid: long (nullable = true) |||-- product: string (nullable = true) clients = joinTable(user, orders, id, userid, orders) clients.printSchema() root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- orders: array (nullable = true) ||-- element: struct (containsNull = true) |||-- id: long (nullable = true) |||-- price: double (nullable = true) |||-- userid: long (nullable = true) |||-- lines: array (nullable = true) ||||-- element: struct (containsNull = true) |||||-- _1: long (nullable = true) |||||-- _2: long (nullable = true) |||||-- _3: string (nullable = true) {code} I tried to check if groupBy isn't the cause of the problem but it looks right {code:java} grouped = orders.rdd.groupBy(lambda r: r.userid) print grouped.map(lambda x: list(x[1])).collect() [[Row(id=1, price=202.3, userid=1, lines=[Row(id=1, orderid=1, product=u'XXX'), Row(id=2, orderid=1, product=u'YYY')]), Row(id=2, price=343.99, userid=1, lines=[Row(id=3, orderid=2, product=u'XXX')])], [Row(id=3, price=399.99, userid=2, lines=[Row(id=4, orderid=3, product=u'XXX')])]] {code} So I assume that the problem is with createDataFrame. -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org