[ https://issues.apache.org/jira/browse/SPARK-4450?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Josh Rosen updated SPARK-4450: ------------------------------ Description: A simple summary program using spark-submit --master local MyJob.py vs. spark-submit --master yarn MyJob.py produces different answers--the output produced by local has been independently verified and is correct, but the output from yarn is incorrect. It does not appear to happen with smaller files, only large files. MyJob.py is {code} from pyspark import SparkContext, SparkConf from pyspark.sql import * def maybeFloat(x): """Convert NULLs into 0s""" if x=='': return 0. else: return float(x) def maybeInt(x): """Convert NULLs into 0s""" if x=='': return 0 else: return int(x) def mapColl(p): return { "f1": p[0], "f2": p[1], "f3": p[2], "f4": int(p[3]), "f5": int(p[4]), "f6": p[5], "f7": p[6], "f8": p[7], "f9": p[8], "f10": maybeInt(p[9]), "f11": p[10], "f12": p[11], "f13": p[12], "f14": p[13], "f15": maybeFloat(p[14]), "f16": maybeInt(p[15]), "f17": maybeFloat(p[16]) } sc = SparkContext() sqlContext = SQLContext(sc) lines = sc.textFile("sample.csv") fields = lines.map(lambda l: mapColl(l.split(","))) collTable = sqlContext.inferSchema(fields) collTable.registerAsTable("sample") test = sqlContext.sql("SELECT f9, COUNT(*) AS rows, SUM(f15) AS f15sum " \ + "FROM sample " \ + "GROUP BY f9") foo = test.collect() print foo sc.stop() {code} was: A simple summary program using spark-submit --master local MyJob.py vs. spark-submit --master yarn MyJob.py produces different answers--the output produced by local has been independently verified and is correct, but the output from yarn is incorrect. It does not appear to happen with smaller files, only large files. MyJob.py is from pyspark import SparkContext, SparkConf from pyspark.sql import * def maybeFloat(x): """Convert NULLs into 0s""" if x=='': return 0. 
else: return float(x) def maybeInt(x): """Convert NULLs into 0s""" if x=='': return 0 else: return int(x) def mapColl(p): return { "f1": p[0], "f2": p[1], "f3": p[2], "f4": int(p[3]), "f5": int(p[4]), "f6": p[5], "f7": p[6], "f8": p[7], "f9": p[8], "f10": maybeInt(p[9]), "f11": p[10], "f12": p[11], "f13": p[12], "f14": p[13], "f15": maybeFloat(p[14]), "f16": maybeInt(p[15]), "f17": maybeFloat(p[16]) } sc = SparkContext() sqlContext = SQLContext(sc) lines = sc.textFile("sample.csv") fields = lines.map(lambda l: mapColl(l.split(","))) collTable = sqlContext.inferSchema(fields) collTable.registerAsTable("sample") test = sqlContext.sql("SELECT f9, COUNT(*) AS rows, SUM(f15) AS f15sum " \ + "FROM sample " \ + "GROUP BY f9") foo = test.collect() print foo sc.stop() > SparkSQL producing incorrect answer when using --master yarn > ------------------------------------------------------------ > > Key: SPARK-4450 > URL: https://issues.apache.org/jira/browse/SPARK-4450 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 1.0.0 > Environment: CDH 5.1 > Reporter: Rick Bischoff > > A simple summary program using > spark-submit --master local MyJob.py > vs. > spark-submit --master yarn MyJob.py > produces different answers--the output produced by local has been > independently verified and is correct, but the output from yarn is incorrect. > It does not appear to happen with smaller files, only large files. > MyJob.py is > {code} > from pyspark import SparkContext, SparkConf > from pyspark.sql import * > def maybeFloat(x): > """Convert NULLs into 0s""" > if x=='': return 0. 
> else: return float(x) > def maybeInt(x): > """Convert NULLs into 0s""" > if x=='': return 0 > else: return int(x) > def mapColl(p): > return { > "f1": p[0], > "f2": p[1], > "f3": p[2], > "f4": int(p[3]), > "f5": int(p[4]), > "f6": p[5], > "f7": p[6], > "f8": p[7], > "f9": p[8], > "f10": maybeInt(p[9]), > "f11": p[10], > "f12": p[11], > "f13": p[12], > "f14": p[13], > "f15": maybeFloat(p[14]), > "f16": maybeInt(p[15]), > "f17": maybeFloat(p[16]) } > sc = SparkContext() > sqlContext = SQLContext(sc) > lines = sc.textFile("sample.csv") > fields = lines.map(lambda l: mapColl(l.split(","))) > collTable = sqlContext.inferSchema(fields) > collTable.registerAsTable("sample") > test = sqlContext.sql("SELECT f9, COUNT(*) AS rows, SUM(f15) AS f15sum " \ > + "FROM sample " \ > + "GROUP BY f9") > foo = test.collect() > print foo > sc.stop() > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org