shao lo created SPARK-12586: ------------------------------- Summary: Wrong answer with registerTempTable and union sql query Key: SPARK-12586 URL: https://issues.apache.org/jira/browse/SPARK-12586 Project: Spark Issue Type: Bug Affects Versions: 1.5.2 Environment: Windows 7 Reporter: shao lo
The following python script gets the wrong answer unless workarounds are used... from pyspark import SparkContext from pyspark.sql import SQLContext if __name__ == "__main__": sc = SparkContext(appName="PythonSQLbug") sqlContext = SQLContext(sc) data = [(v,) for v in range(1, 5)] values = sqlContext.createDataFrame(data, ['v_value']) values.registerTempTable("values") values.show() data = [ (3, 1, 1, 1, None), (2, 1, 1, 1, 3), (3, 2, 1, 1, None), (3, 3, 1, 1, 2), (3, 4, 1, 2, None)] df1 = sqlContext.createDataFrame(data, ['row', 'col', 'foo', 'bar', 'value']) df1.registerTempTable("t0") df1.show() sql_text = """select row, col, foo, bar, value2 value from (select row, col, foo, bar, 8 value2 from t0 where row=1 and col=2) s1 union select row, col, foo, bar, value from t0 where not (row=1 and col=2)""" df2 = sqlContext.sql(sql_text) df2.registerTempTable("t1") # # The following 2 line workaround fixes the problem somehow? # df3 = sqlContext.createDataFrame(df2.collect()) # df3.registerTempTable("t1") # # The following 4 line workaround fixes the problem too..but takes way longer # filename = "t1.json" # df2.write.json(filename, mode='overwrite') # df3 = sqlContext.read.json(filename) # df3.registerTempTable("t1") sql_text2 = """select row, col, v1 value from (select v1 from (select v_value v1 from values) s1 left join (select value v2,foo,bar,row,col from t1 where foo=1 and bar=2 and value is not null) s2 on v1=v2 where v2 is null ) sa join (select row, col from t1 where foo=1 and bar=2 and value is null) sb""" result = sqlContext.sql(sql_text2) result.show() # Expected result # +---+---+-----+ # |row|col|value| # +---+---+-----+ # | 3| 4| 1| # | 3| 4| 2| # | 3| 4| 3| # | 3| 4| 4| # +---+---+-----+ # Getting this wrong result...when not using the workarounds above # +---+---+-----+ # |row|col|value| # +---+---+-----+ # +---+---+-----+ sc.stop() -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org