[ https://issues.apache.org/jira/browse/ARROW-14908?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17492221#comment-17492221 ]
Sam Albers commented on ARROW-14908: ------------------------------------ Some additional bread crumbs. The examples (run on Windows 10, with arrow 7.0.0) are wrapped in reprex::reprex because it catches the R crash nicely. h2. works by specifying join column {code:java} reprex::reprex( { library(arrow) library(dplyr) tf1 <- tempfile() dir.create(tf1) quakes %>% select(stations, lat,long) %>% write_dataset(tf1) tf2 <- tempfile() dir.create(tf2) quakes |> select(stations, mag, depth) %>% write_dataset(tf2) open_dataset(tf1) |> left_join(open_dataset(tf2), by = "stations") |> collect() }, session_info = TRUE, std_out_err = TRUE ){code} h2. doesn't work when one of the joined dfs is in memory {code:java} reprex::reprex( { library(arrow) library(dplyr) tf1 <- tempfile() dir.create(tf1) quakes %>% select(stations, lat,long) %>% write_dataset(tf1) tf2 <- tempfile() dir.create(tf2) quakes |> select(stations, mag, depth) %>% write_dataset(tf2) b <- open_dataset(tf2) %>% collect() open_dataset(tf1) |> left_join(b, by = "stations") |> collect() }, session_info = TRUE, std_out_err = TRUE ){code} h2. doesn't work when join column is not specified {code:java} reprex::reprex( { library(arrow) library(dplyr) tf1 <- tempfile() dir.create(tf1) quakes %>% select(stations, lat,long) %>% write_dataset(tf1) tf2 <- tempfile() dir.create(tf2) quakes |> select(stations, mag, depth) %>% write_dataset(tf2) open_dataset(tf1) |> left_join(open_dataset(tf2)) |> collect() }, session_info = TRUE, std_out_err = TRUE ){code} h2. 
doesn't work when join column is specified but that column is the partition {code:java} reprex::reprex( { library(arrow) library(dplyr) tf1 <- tempfile() dir.create(tf1) quakes %>% select(stations, lat,long) %>% group_by(stations) %>% write_dataset(tf1) tf2 <- tempfile() dir.create(tf2) quakes |> select(stations, mag, depth) %>% group_by(stations) %>% write_dataset(tf2) open_dataset(tf1) |> left_join(open_dataset(tf2), by = "stations") |> collect() }, session_info = TRUE, std_out_err = TRUE ){code} > [R] join on dataset crashes on Windows > -------------------------------------- > > Key: ARROW-14908 > URL: https://issues.apache.org/jira/browse/ARROW-14908 > Project: Apache Arrow > Issue Type: Bug > Components: R > Affects Versions: 6.0.0 > Environment: R version 4.0.4 > Reporter: Fabio Machado > Assignee: Will Jones > Priority: Critical > Labels: pull-request-available > Fix For: 7.0.1 > > Time Spent: 1h 20m > Remaining Estimate: 0h > > {code:java} > library(tidyverse) > library(arrow) > car_info <- rownames_to_column(mtcars, "car_info") > cars_arrow_table <- arrow_table(car_info) > other_mtcars_data <- select(car_info, 1) %>% > mutate(main_color = sample( c("red", "blue", "white", "black"), size = n(), > replace = TRUE)) %>% > arrow::arrow_table() > temp <- tempdir() > par_temp <- paste0(temp, "\\parquet") > car_info %>% arrow::write_dataset(par_temp) > cars_arrow <- arrow::open_dataset(par_temp) > # using arrow tables works > ------------------------------------------------------ > cars_arrow_table %>% left_join(other_mtcars_data) %>% count(main_color) %>% > collect() > # using open dataset crashes R > ------------------------------------------------------------------ > other_mtcars_data %>% > left_join(cars_arrow) %>% > count(main_color) %>% > collect() > #other variation also crash > cars_arrow %>% > left_join(other_mtcars_data) %>% > count(main_color) %>% > collect() > cars_arrow %>% > left_join(other_mtcars_data) %>% > group_by(main_color) %>% > summarise(n = n()) 
%>% > collect() > #compute also crashes > cars_arrow %>% > left_join(other_mtcars_data) %>% > count(main_color) %>% > compute() > # workaround with duckdb > ------------------------------------------------------ > ##this works > cars_duck <- to_duckdb(cars_arrow, auto_disconnect = TRUE) > other_cars_duck <- to_duckdb(other_mtcars_data, auto_disconnect = TRUE) > > cars_duck %>% > left_join(other_cars_duck) %>% > count(main_color) %>% > collect() > ##this doesn't (don't know if expected to work actually) > cars_arrow %>% > left_join(other_mtcars_data) %>% > to_duckdb() {code} -- This message was sent by Atlassian Jira (v8.20.1#820001)