TPDeramus opened a new issue, #38473:
URL: https://github.com/apache/arrow/issues/38473
### Describe the usage question you have. Please include as many useful
details as possible.
Hi Experts.
So sorry to bother you all with this, but I'm trying to run an arrow query
against a directory of about 4232 `parquet` files with various
different columns generated on a participant level using `fastDummies`. Once
the schema is fully partitioned (roughly 6745697 rows and 30486 columns), each
subject has multiple rows and a variable number of boolean columns (0 = does
not have / 1 = has the character string), which get padded with `NA` once queried.
I'm attempting to use some variation of `mutate` or `across` to replace all
the `NA` values with 0 prior to the `summarize` call so that _all_ columns and
rows of interest are retained and included prior to the calculation of the
sum/frequency of each boolean.
Below is the `open_dataset` call and a few of the replacement attempts I've
tried.
```
DOI <- open_dataset(sources = *<path_to_directory_of_parquet_files>*,
unify_schemas = TRUE)
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
mutate_at(DOI$schema$names[grepl("term_",DOI$schema$names)], ~replace_na(.,0))
|> summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)],sum)) |>
collect() |> as.data.frame()
Error in `map_lgl()`:
ℹ In index: 1.
Caused by error in `if (deparse(expr[[1]]) == name) ...`:
! the condition has length > 1
Run `rlang::last_trace()` to see where the error occurred.
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |> replace(is.na(.),
0) |> summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)],sum))
|> collect() |> as.data.frame()
Error in replace(select(group_by(filter(DOI, treatment_flag == :
object '.' not found
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
mutate(across(everything(), ~replace_na(.x, 0))) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)],sum)) |>
collect() |> as.data.frame()
Error: Expression replace_na(term_ade_anemia, 0) not supported in Arrow
Call collect() first to pull data into R.
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |> replace(is.na(.),
0) |> summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)],sum))
|> collect() |> as.data.frame()
Error in replace(select(group_by(filter(DOI, treatment_flag == :
object '.' not found
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)],sum(na.rm=TRUE)))
|> collect() |> as.data.frame()
Error in `map2()`:
ℹ In index: 1.
ℹ With name: na.rm.
Caused by error in `call2()`:
! Can't create call to non-callable object
Run `rlang::last_trace()` to see where the error occurred.
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
mutate_at(DOI$schema$names[grepl("term_",DOI$schema$names)], ~replace_na(.,0))
|> summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)], ~ sum(.,
na.rm = TRUE ))) |> collect() |> as.data.frame()
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)], ~ sum(.,
na.rm = TRUE ))) |> collect() |> as.data.frame()
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
mutate_if(is.na(.x), ~if_else(is.na(.x), 0)) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)], ~ sum)) |>
collect() |> as.data.frame()
Error in is_logical(.p) : object '.x' not found
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
mutate_all(funs(replace_na(., 0))) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)], ~ sum)) |>
collect() |> as.data.frame()
Warning message:
`funs()` was deprecated in dplyr 0.8.0.
ℹ Please use a list of either functions or lambdas:
# Simple named list: list(mean = mean, median = median)
# Auto named with `tibble::lst()`: tibble::lst(mean, median)
# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |> mutate_if(is.na,
funs(replace_na(., 0))) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)], ~ sum)) |>
collect() |> as.data.frame()
Error in `mutate_if()`:
! `.p` is invalid.
✖ `.p` should return a single logical.
ℹ `.p` returns a size 3368621 <logical> for column `term_ade_anemia`.
Run `rlang::last_trace()` to see where the error occurred.
Warning message:
Default behavior of `pull()` on Arrow data is changing. Current behavior of
returning an R vector is deprecated, and in a future release, it will return an
Arrow `ChunkedArray`. To control this:
ℹ Specify `as_vector = TRUE` (the current default) or `FALSE` (what it will
change to) in `pull()`
ℹ Or, set `options(arrow.pull_as_vector)` globally
This warning is displayed once every 8 hours.
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
mutate(across(is.na(.x), ~replace_na(.x, 0))) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)], ~ sum)) |>
collect() |> as.data.frame()
Error in `column_select()`:
! Problem while evaluating `is.na(.x)`.
Caused by error:
! object '.x' not found
Run `rlang::last_trace()` to see where the error occurred.
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
mutate(across(where(is.na), ~na_if(., 0))) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)], ~ sum)) |>
collect() |> as.data.frame()
Error in `where()`:
! Predicate must return `TRUE` or `FALSE`, not an empty logical vector.
Run `rlang::last_trace()` to see where the error occurred.
> Terms <- DOI |> filter(treatment_flag == 0, comparator_flag == 0)|>
group_by(Participant) |>
select(DOI$schema$names[grepl("term_",DOI$schema$names)]) |>
mutate(across(where(is.na), ~coalesce(., 0))) |>
summarise(across(DOI$schema$names[grepl("term_",DOI$schema$names)], ~ sum)) |>
collect() |> as.data.frame()
Error in `where()`:
! Predicate must return `TRUE` or `FALSE`, not an empty logical vector.
Run `rlang::last_trace()` to see where the error occurred.
```
I understand, based on some Stack Overflow threads I've found, that
functions outside of those listed here
https://arrow.apache.org/docs/r/reference/acero.html do not seem to work
without collecting the data first.
- https://stackoverflow.com/questions/75976247/how-to-write-anonymous-functions-in-r-arrow-across
Any suggestions as to how this may be accomplished using `arrow` in R prior
to `collect()` or pulling everything into memory?
Thank you in advance.
### Component(s)
Parquet, R
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]