[ https://issues.apache.org/jira/browse/ARROW-12542?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17332782#comment-17332782 ]

Jonathan Keane edited comment on ARROW-12542 at 4/26/21, 10:15 PM:
-------------------------------------------------------------------

Oh, and interestingly: when writing a partitioned dataset, it looks like we're
writing all of the R metadata out to every partition, instead of splitting the
metadata so that each partition only carries the entries for the rows it
actually contains.

{code:r}
> df <- tibble::tibble(
+   part = 1:10,
+   a = rep(1:2, 5),
+   x = rep(list(structure(1, foo = "bar"), structure(2, baz = "qux")), 5)
+ )
> 
> # write the dataset, and open it
> tmp <- tempfile()
> write_dataset(df, tmp, partitioning = "part", format = "parquet")
> ds <- open_dataset(tmp)
> 
> read_parquet(file.path(tmp, "part=1/part-0.parquet"))
# A tibble: 1 x 2
      a              x
  <int> <list<double>>
1     1            [1]
Warning message:
Invalid metadata$r 
> 
> tab <- read_parquet(file.path(tmp, "part=1/part-0.parquet"), as_data_frame = FALSE)
> 
> tab$metadata$r
 'arrow_r_metadata' chr "A\n3\n262148\n197888\n5\nUTF-8\n531\n2\n531\n0\n1026\n1\n262153\n5\nnames\n16\n0\n254\n531\n3\n254\n254\n531\n2"| __truncated__
List of 2
 $ attributes: Named list()
 $ columns   :List of 3
  ..$ part: NULL
  ..$ a   : NULL
  ..$ x   :List of 2
  .. ..$ attributes: NULL
  .. ..$ columns   :List of 10
  .. .. ..$ :List of 2
  .. .. .. ..$ attributes:List of 1
  .. .. .. .. ..$ foo: chr "bar"
  .. .. .. ..$ columns   : NULL
  .. .. ..$ :List of 2
  [...]
{code}
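
A quick cross-partition check (a sketch, assuming the {{part=N/part-0.parquet}} layout shown above): if the metadata really is duplicated, the raw {{metadata$r}} string should come back identical from every partition file, even though each file holds only one row.

{code:r}
# Sketch: compare the stored R metadata across two partition files.
# Assumes the partition layout shown above (part=N/part-0.parquet).
m1 <- read_parquet(file.path(tmp, "part=1/part-0.parquet"), as_data_frame = FALSE)$metadata$r
m2 <- read_parquet(file.path(tmp, "part=2/part-0.parquet"), as_data_frame = FALSE)$metadata$r
identical(m1, m2)  # TRUE here would confirm the full metadata gets written to each file
{code}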

Solving that alone probably won't be enough to resolve the {{filter()}} issue,
but it will be part of the solution.
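
For illustration only, here's roughly the shape of the per-partition subsetting (a sketch, not the actual fix: it assumes {{metadata$r}} is a plain ASCII-serialized R list, as the "A\n3\n..." dump above suggests, and the {{keep}} indices are hypothetical):

{code:r}
# Sketch only, not how arrow implements this. Assumes metadata$r is a plain
# ASCII-serialized R list (the "A\n3\n..." dump above suggests it is).
meta <- unserialize(charToRaw(tab$metadata$r))
length(meta$columns$x$columns)  # 10: one entry per row of the original df,
                                # even though this partition holds a single row
# A per-partition write would keep only the entries for the rows that landed here:
keep <- 1L                      # hypothetical: indices of the rows in part=1
meta$columns$x$columns <- meta$columns$x$columns[keep]
{code}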


> [R] SF columns in datasets with filters
> ---------------------------------------
>
>                 Key: ARROW-12542
>                 URL: https://issues.apache.org/jira/browse/ARROW-12542
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: R
>            Reporter: Jonathan Keane
>            Assignee: Jonathan Keane
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 20m
>  Remaining Estimate: 0h
>
> First reported at 
> https://issues.apache.org/jira/browse/ARROW-10386?focusedCommentId=17331668&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17331668
> OK, I actually have recreated a similar issue. In the following code, I 
> create an sf object and write it as a dataset to parquet files. I then call 
> open_dataset() on the files.
> If I collect() the dataset I get back an sf object, no problem.
> But if I first filter() the dataset then collect() I get an error.
> {code:r}
> library(sf)
> library(arrow)
> library(dplyr)
> n <- 10000
> fake <- tibble(
>     ID=seq(n),
>     Date=sample(seq(as.Date('2019-01-01'), as.Date('2021-04-01'), by=1), size=n, replace=TRUE),
>     x=runif(n=n, min=-170, max=170),
>     y=runif(n=n, min=-60, max=70),
>     text1=sample(x=state.name, size=n, replace=TRUE),
>     text2=sample(x=state.name, size=n, replace=TRUE),
>     text3=sample(x=state.division, size=n, replace=TRUE),
>     text4=sample(x=state.region, size=n, replace=TRUE),
>     text5=sample(x=state.abb, size=n, replace=TRUE),
>     num1=sample(x=state.center$x, size=n, replace=TRUE),
>     num2=sample(x=state.center$y, size=n, replace=TRUE),
>     num3=sample(x=state.area, size=n, replace=TRUE),
>     Rand1=rnorm(n=n),
>     Rand2=rnorm(n=n, mean=100, sd=3),
>     Rand3=rbinom(n=n, size=10, prob=0.4)
> )
> # make it into an sf object
> spat <- fake %>% 
>     st_as_sf(coords=c('x', 'y'), remove=FALSE, crs = 4326)
> class(spat)
> class(spat$geometry)
> # create new columns for partitioning and write to disk
> spat %>% 
>     mutate(Year=lubridate::year(Date), Month=lubridate::month(Date)) %>% 
>     group_by(Year, Month) %>% 
>     write_dataset('data/splits/', format='parquet')
> spat_in <- open_dataset('data/splits/')
> class(spat_in)
> # it's an sf as expected
> spat_in %>% collect() %>% class()
> spat_in %>% collect() %>% pull(geometry) %>% class()
> # it even plots
> leaflet::leaflet() %>% 
>     leaflet::addTiles() %>% 
>     leafgl::addGlPoints(data=spat_in %>% collect())
> # but if we filter first
> spat_in %>% 
>     filter(Year == 2020 & Month == 2) %>% 
>     collect()
> # we get this error
> Error in st_geometry.sf(x) : 
>   attr(obj, "sf_column") does not point to a geometry column.
> Did you rename it, without setting st_geometry(obj) <- "newname"?
> In addition: Warning message:
> Invalid metadata$r 
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)
