thisisnic commented on PR #12826:
URL: https://github.com/apache/arrow/pull/12826#issuecomment-1189078502
OK, more is still needed here than I thought. I've now added it as an NSE
func, but I get an error when trying to print things (likely because we're
looking at the schema, which it hasn't been added to) - as @nealrichardson said
above, "we assume that the schema contains all possible valid field refs". I'm
not sure if there's a better way to get this info that uses something else.
``` r
library(arrow)
library(dplyr)
tf <- tempfile()
dir.create(tf)
write_dataset(mtcars, tf, partitioning = "cyl")
# this works and returns the dataset with the augmented file correctly added
open_dataset(tf) %>%
mutate(filename = add_filename()) %>%
collect()
#> mpg disp hp drat wt qsec vs am gear carb cyl
#> 1 18.7 360.0 175 3.15 3.440 17.02 0 0 3 2 8
#> 2 14.3 360.0 245 3.21 3.570 15.84 0 0 3 4 8
#> 3 16.4 275.8 180 3.07 4.070 17.40 0 0 3 3 8
#> 4 17.3 275.8 180 3.07 3.730 17.60 0 0 3 3 8
#> 5 15.2 275.8 180 3.07 3.780 18.00 0 0 3 3 8
#> 6 10.4 472.0 205 2.93 5.250 17.98 0 0 3 4 8
#> 7 10.4 460.0 215 3.00 5.424 17.82 0 0 3 4 8
#> 8 14.7 440.0 230 3.23 5.345 17.42 0 0 3 4 8
#> 9 15.5 318.0 150 2.76 3.520 16.87 0 0 3 2 8
#> 10 15.2 304.0 150 3.15 3.435 17.30 0 0 3 2 8
#> 11 13.3 350.0 245 3.73 3.840 15.41 0 0 3 4 8
#> 12 19.2 400.0 175 3.08 3.845 17.05 0 0 3 2 8
#> 13 15.8 351.0 264 4.22 3.170 14.50 0 1 5 4 8
#> 14 15.0 301.0 335 3.54 3.570 14.60 0 1 5 8 8
#> 15 22.8 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> 16 24.4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> 17 22.8 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> 18 32.4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> 19 30.4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> 20 33.9 71.1 65 4.22 1.835 19.90 1 1 4 1 4
#> 21 21.5 120.1 97 3.70 2.465 20.01 1 0 3 1 4
#> 22 27.3 79.0 66 4.08 1.935 18.90 1 1 4 1 4
#> 23 26.0 120.3 91 4.43 2.140 16.70 0 1 5 2 4
#> 24 30.4 95.1 113 3.77 1.513 16.90 1 1 5 2 4
#> 25 21.4 121.0 109 4.11 2.780 18.60 1 1 4 2 4
#> 26 21.0 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> 27 21.0 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> 28 21.4 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> 29 18.1 225.0 105 2.76 3.460 20.22 1 0 3 1 6
#> 30 19.2 167.6 123 3.92 3.440 18.30 1 0 4 4 6
#> 31 17.8 167.6 123 3.92 3.440 18.90 1 0 4 4 6
#> 32 19.7 145.0 175 3.62 2.770 15.50 0 1 5 6 6
#> filename
#> 1 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 2 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 3 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 4 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 5 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 6 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 7 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 8 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 9 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 10 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 11 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 12 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 13 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 14 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=8/part-0.parquet
#> 15 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 16 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 17 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 18 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 19 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 20 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 21 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 22 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 23 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 24 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 25 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=4/part-0.parquet
#> 26 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=6/part-0.parquet
#> 27 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=6/part-0.parquet
#> 28 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=6/part-0.parquet
#> 29 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=6/part-0.parquet
#> 30 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=6/part-0.parquet
#> 31 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=6/part-0.parquet
#> 32 /tmp/RtmpzzDK4v/file32c352040ed57/cyl=6/part-0.parquet
# this doesn't - as it tries to print it
open_dataset(tf) %>%
mutate(filename = add_filename())
#> Error in schm$GetFieldByName(name)$type$ToString(): attempt to apply non-function
# if we try it on a table, we get an error message -
# I can look to catch this and raise an error with more context
arrow_table(mtcars) %>% mutate(filename = add_filename()) %>% collect()
#> Error in `collect()`:
#> ! Invalid: No match for FieldRef.Name(__filename) in mpg: double
#> cyl: double
#> disp: double
#> hp: double
#> drat: double
#> wt: double
#> qsec: double
#> vs: double
#> am: double
#> gear: double
#> carb: double
#> /home/nic2/arrow/cpp/src/arrow/type.h:1800 CheckNonEmpty(matches, root)
#> /home/nic2/arrow/cpp/src/arrow/compute/exec/expression.cc:429 ref->FindOne(in)
#> /home/nic2/arrow/cpp/src/arrow/compute/exec/project_node.cc:67 expr.Bind(*inputs[0]->output_schema(), plan->exec_context())
# same error as with the dataset - "attempt to apply non-function"
arrow_table(mtcars) %>% mutate(filename = add_filename())
#> Error in schm$GetFieldByName(name)$type$ToString(): attempt to apply non-function
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]