[ https://issues.apache.org/jira/browse/ARROW-15397?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
José F updated ARROW-15397:
---------------------------

Description:

Hi dear Arrow developers,

I tested inner_join with the arrow R package, but R crashed. This is my example with the toy dataset iris (the write_chunk_data helper used to write the parquet files is given below):

library(arrow)
library(dplyr)
library(purrr)

data(iris)
write.csv(iris, "iris.csv")  # write the csv file

# write the parquet files with the write_chunk_data function (below)
walk("C:/Users/Stats/Desktop/ejemplo_join/iris.csv",
     write_chunk_data,
     "C:/Users/Stats/Desktop/ejemplo_join/parquet",
     chunk_size = 50)

iris_arrow <- open_dataset("parquet")

df1_arrow <- iris_arrow %>%
  select(`...1`, Sepal.Length, Sepal.Width, Petal.Length)
df2_arrow <- iris_arrow %>%
  select(`...1`, Petal.Width, Species)

df <- df1_arrow %>%
  inner_join(df2_arrow, by = "...1") %>%
  group_by(Species) %>%
  summarise(prom = mean(Sepal.Length)) %>%
  collect()
print(df)
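For reference, the same pipeline runs fine in plain dplyr on the original data frame, so the expected result is known. This is only a comparison sketch: the id column below stands in for the `...1` row-number column that vroom creates when it reads the csv written with row names.

library(dplyr)

# Plain-dplyr version of the same join, for comparison; `id` plays the
# role of the `...1` column from the parquet files.
iris_df <- mutate(iris, id = row_number())
df1 <- select(iris_df, id, Sepal.Length, Sepal.Width, Petal.Length)
df2 <- select(iris_df, id, Petal.Width, Species)

df1 %>%
  inner_join(df2, by = "id") %>%
  group_by(Species) %>%
  summarise(prom = mean(Sepal.Length))
# expected: three rows, one mean Sepal.Length per species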
# Run this function first to write the parquet files used in this example
write_chunk_data <- function(data_path, output_dir, chunk_size = 1000000) {
  # Create the output_dir if it does not exist
  if (!fs::dir_exists(output_dir)) fs::dir_create(output_dir)
  # Get the file name without its extension
  data_name <- fs::path_ext_remove(fs::path_file(data_path))
  # Start the chunk counter at 0
  chunk_num <- 0
  # Read the file using vroom
  data_chunk <- vroom::vroom(data_path)
  # Get the variable names (kept for reference, not used below)
  data_names <- names(data_chunk)
  # Get the number of rows
  rows <- nrow(data_chunk)

  # The following loop creates a parquet file for every chunk_size rows
  repeat {
    # Check whether a full chunk still fits before the last row
    if (rows > (chunk_num + 1) * chunk_size) {
      arrow::write_parquet(
        data_chunk[(chunk_num * chunk_size + 1):((chunk_num + 1) * chunk_size), ],
        fs::path(output_dir, glue::glue("{data_name}-{chunk_num}.parquet"))
      )
    } else {
      arrow::write_parquet(
        data_chunk[(chunk_num * chunk_size + 1):rows, ],
        fs::path(output_dir, glue::glue("{data_name}-{chunk_num}.parquet"))
      )
      break
    }
    chunk_num <- chunk_num + 1
  }

  # Recover some memory and disk space
  rm(data_chunk)
  tmp_file <- tempdir()
  files <- list.files(tmp_file, full.names = TRUE, pattern = "^vroom")
  file.remove(files)
}
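As a side note, the chunked files can also be produced with arrow's own write_dataset by partitioning on a computed chunk id. This is only a sketch of an alternative, and the chunk column name is my own invention; the files land in hive-style chunk=N subfolders, which the open_dataset("parquet") call above reads by default.

library(arrow)

# Alternative sketch: let arrow write several parquet files itself by
# partitioning on a computed chunk id (the `chunk` column is made up here).
iris_df <- vroom::vroom("iris.csv")                   # keeps the `...1` id column
iris_df$chunk <- (seq_len(nrow(iris_df)) - 1) %/% 50  # 50 rows per chunk
write_dataset(iris_df, "parquet", format = "parquet", partitioning = "chunk")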
> Problem with Join in Apache Arrow in R
> --------------------------------------
>
>                 Key: ARROW-15397
>                 URL: https://issues.apache.org/jira/browse/ARROW-15397
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: R
>    Affects Versions: 6.0.1
>            Reporter: José F
>            Priority: Major
>             Fix For: 6.0.2
>


--
This message was sent by Atlassian Jira
(v8.20.1#820001)