spark git commit: [SPARK-17811] SparkR cannot parallelize data.frame with NA or NULL in Date columns
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 a65d40ab6 -> 78458a7eb

[SPARK-17811] SparkR cannot parallelize data.frame with NA or NULL in Date columns

## What changes were proposed in this pull request?

NA date values are serialized as "NA" and NA time values are serialized as NaN from R. In the backend we did not have proper logic to deal with them. As a result we got an IllegalArgumentException for Date and a wrong value for time. This PR adds support for deserializing NA as Date and Time.

## How was this patch tested?

* [x] TODO

Author: Hossein

Closes #15421 from falaki/SPARK-17811.

(cherry picked from commit e371040a0150e4ed748a7c25465965840b61ca63)
Signed-off-by: Felix Cheung

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/78458a7e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/78458a7e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/78458a7e

Branch: refs/heads/branch-2.0
Commit: 78458a7ebeba6758890b01cc2b7417ab2fda221e
Parents: a65d40a
Author: Hossein
Authored: Fri Oct 21 12:38:52 2016 -0700
Committer: Felix Cheung
Committed: Fri Oct 21 12:45:35 2016 -0700

----------------------------------------------------------------------
 R/pkg/inst/tests/testthat/test_sparkSQL.R      | 13
 .../scala/org/apache/spark/api/r/SerDe.scala   | 31
 2 files changed, 38 insertions(+), 6 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/78458a7e/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index d33fcde..b7b9de7 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -373,6 +373,19 @@ test_that("create DataFrame with different data types", {
   expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE))
 })
 
+test_that("SPARK-17811: can create DataFrame containing NA as date and time", {
+  df <- data.frame(
+    id = 1:2,
+    time = c(as.POSIXlt("2016-01-10"), NA),
+    date = c(as.Date("2016-10-01"), NA))
+
+  DF <- collect(createDataFrame(df))
+  expect_true(is.na(DF$date[2]))
+  expect_equal(DF$date[1], as.Date("2016-10-01"))
+  expect_true(is.na(DF$time[2]))
+  expect_equal(DF$time[1], as.POSIXlt("2016-01-10"))
+})
+
 test_that("create DataFrame with complex types", {
   e <- new.env()
   assign("n", 3L, envir = e)

http://git-wip-us.apache.org/repos/asf/spark/blob/78458a7e/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
index e4932a4..550e075 100644
--- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
@@ -125,15 +125,34 @@ private[spark] object SerDe {
   }
 
   def readDate(in: DataInputStream): Date = {
-    Date.valueOf(readString(in))
+    try {
+      val inStr = readString(in)
+      if (inStr == "NA") {
+        null
+      } else {
+        Date.valueOf(inStr)
+      }
+    } catch {
+      // TODO: SPARK-18011 with some versions of R deserializing NA from R results in NASE
+      case _: NegativeArraySizeException => null
+    }
   }
 
   def readTime(in: DataInputStream): Timestamp = {
-    val seconds = in.readDouble()
-    val sec = Math.floor(seconds).toLong
-    val t = new Timestamp(sec * 1000L)
-    t.setNanos(((seconds - sec) * 1e9).toInt)
-    t
+    try {
+      val seconds = in.readDouble()
+      if (java.lang.Double.isNaN(seconds)) {
+        null
+      } else {
+        val sec = Math.floor(seconds).toLong
+        val t = new Timestamp(sec * 1000L)
+        t.setNanos(((seconds - sec) * 1e9).toInt)
+        t
+      }
+    } catch {
+      // TODO: SPARK-18011 with some versions of R deserializing NA from R results in NASE
+      case _: NegativeArraySizeException => null
+    }
   }
 
   def readBytesArr(in: DataInputStream): Array[Array[Byte]] = {
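The two rules the patch adds can be sketched in isolation. Below is a minimal, self-contained Scala sketch, not the Spark SerDe itself; the object and method names (NaDeserializationSketch, dateFromR, timeFromR) are hypothetical and only illustrate the mapping: the marker string "NA" coming from R becomes a null java.sql.Date, a NaN seconds value becomes a null java.sql.Timestamp, and real values keep the existing seconds-to-nanoseconds conversion.

import java.sql.{Date, Timestamp}

// Hypothetical helpers mirroring the null-mapping rules described above.
object NaDeserializationSketch {
  // R serializes NA dates as the string "NA"; map that to null, otherwise parse.
  def dateFromR(inStr: String): Date =
    if (inStr == "NA") null else Date.valueOf(inStr)

  // R serializes NA times as NaN seconds; map that to null, otherwise convert
  // whole seconds to millis and the fractional part to nanos.
  def timeFromR(seconds: Double): Timestamp =
    if (java.lang.Double.isNaN(seconds)) {
      null
    } else {
      val sec = Math.floor(seconds).toLong
      val t = new Timestamp(sec * 1000L)
      t.setNanos(((seconds - sec) * 1e9).toInt)
      t
    }

  def main(args: Array[String]): Unit = {
    println(dateFromR("2016-10-01"))  // 2016-10-01
    println(dateFromR("NA"))          // null
    println(timeFromR(1452384000.5))  // timestamp with 500000000 nanos set
    println(timeFromR(Double.NaN))    // null
  }
}

With this mapping in place, a collected column that contained NA on the R side comes back as null on the JVM side, which SparkR then surfaces as NA again, as the new test_sparkSQL.R test checks.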
spark git commit: [SPARK-17811] SparkR cannot parallelize data.frame with NA or NULL in Date columns
Repository: spark
Updated Branches:
  refs/heads/master e21e1c946 -> e371040a0

[SPARK-17811] SparkR cannot parallelize data.frame with NA or NULL in Date columns

## What changes were proposed in this pull request?

NA date values are serialized as "NA" and NA time values are serialized as NaN from R. In the backend we did not have proper logic to deal with them. As a result we got an IllegalArgumentException for Date and a wrong value for time. This PR adds support for deserializing NA as Date and Time.

## How was this patch tested?

* [x] TODO

Author: Hossein

Closes #15421 from falaki/SPARK-17811.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e371040a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e371040a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e371040a

Branch: refs/heads/master
Commit: e371040a0150e4ed748a7c25465965840b61ca63
Parents: e21e1c9
Author: Hossein
Authored: Fri Oct 21 12:38:52 2016 -0700
Committer: Felix Cheung
Committed: Fri Oct 21 12:38:52 2016 -0700

----------------------------------------------------------------------
 R/pkg/inst/tests/testthat/test_sparkSQL.R      | 13
 .../scala/org/apache/spark/api/r/SerDe.scala   | 31
 2 files changed, 38 insertions(+), 6 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/e371040a/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 3a987cd..b4b43fd 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -390,6 +390,19 @@ test_that("create DataFrame with different data types", {
   expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE))
 })
 
+test_that("SPARK-17811: can create DataFrame containing NA as date and time", {
+  df <- data.frame(
+    id = 1:2,
+    time = c(as.POSIXlt("2016-01-10"), NA),
+    date = c(as.Date("2016-10-01"), NA))
+
+  DF <- collect(createDataFrame(df))
+  expect_true(is.na(DF$date[2]))
+  expect_equal(DF$date[1], as.Date("2016-10-01"))
+  expect_true(is.na(DF$time[2]))
+  expect_equal(DF$time[1], as.POSIXlt("2016-01-10"))
+})
+
 test_that("create DataFrame with complex types", {
   e <- new.env()
   assign("n", 3L, envir = e)

http://git-wip-us.apache.org/repos/asf/spark/blob/e371040a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
index e4932a4..550e075 100644
--- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
@@ -125,15 +125,34 @@ private[spark] object SerDe {
   }
 
   def readDate(in: DataInputStream): Date = {
-    Date.valueOf(readString(in))
+    try {
+      val inStr = readString(in)
+      if (inStr == "NA") {
+        null
+      } else {
+        Date.valueOf(inStr)
+      }
+    } catch {
+      // TODO: SPARK-18011 with some versions of R deserializing NA from R results in NASE
+      case _: NegativeArraySizeException => null
+    }
   }
 
   def readTime(in: DataInputStream): Timestamp = {
-    val seconds = in.readDouble()
-    val sec = Math.floor(seconds).toLong
-    val t = new Timestamp(sec * 1000L)
-    t.setNanos(((seconds - sec) * 1e9).toInt)
-    t
+    try {
+      val seconds = in.readDouble()
+      if (java.lang.Double.isNaN(seconds)) {
+        null
+      } else {
+        val sec = Math.floor(seconds).toLong
+        val t = new Timestamp(sec * 1000L)
+        t.setNanos(((seconds - sec) * 1e9).toInt)
+        t
+      }
+    } catch {
+      // TODO: SPARK-18011 with some versions of R deserializing NA from R results in NASE
+      case _: NegativeArraySizeException => null
+    }
  }
 
   def readBytesArr(in: DataInputStream): Array[Array[Byte]] = {
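The NegativeArraySizeException guard in the catch clauses covers the separate failure mode flagged by the SPARK-18011 TODO: with some R versions the serialized NA apparently arrives with a negative string-length prefix, so allocating the backing byte array fails before the "NA" comparison is ever reached. The following standalone sketch illustrates that scenario under those assumptions; readStringOrNull and NegativeLengthSketch are hypothetical names, not the Spark SerDe's readString.

import java.io.{ByteArrayInputStream, DataInputStream}

object NegativeLengthSketch {
  // Reads a 4-byte length prefix followed by that many UTF-8 bytes. A negative
  // length (the presumed NA encoding from some R versions) makes the array
  // allocation throw NegativeArraySizeException, which is mapped to null here.
  def readStringOrNull(in: DataInputStream): String = {
    try {
      val len = in.readInt()
      val bytes = new Array[Byte](len)  // throws NegativeArraySizeException if len < 0
      in.readFully(bytes)
      new String(bytes, "UTF-8")
    } catch {
      case _: NegativeArraySizeException => null
    }
  }

  def main(args: Array[String]): Unit = {
    // A stream whose length prefix is -1, mimicking the problematic input.
    val payload = java.nio.ByteBuffer.allocate(4).putInt(-1).array()
    val in = new DataInputStream(new ByteArrayInputStream(payload))
    println(readStringOrNull(in))  // prints "null" instead of crashing
  }
}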