paleolimbot opened a new pull request, #828:
URL: https://github.com/apache/arrow-nanoarrow/pull/828

   This PR moves the process of collecting an array stream from R (where we 
had preserve/protect volume issues that made garbage collection very, very 
slow) into C/C++.
   
   Reproducer for generating an IPC file with a lot of strings:
   
   <details>
   
   ```r
   library(nanoarrow)
   
   # Raw byte values of the lowercase letters a-z, as an unnamed raw vector.
   ascii_bytes <- charToRaw(paste(letters, collapse = ""))
   
   # Build a string array of `n` random strings, each `n_chars` bytes long,
   # by filling in the offsets and data buffers of an empty string array.
   # Bytes are drawn (with replacement) from `ascii_bytes`.
   random_string_array <- function(n = 1, n_chars = 16) {
     total_chars <- n_chars * n
     chars <- sample(ascii_bytes, total_chars, replace = TRUE)
     # n + 1 evenly spaced offsets from 0 to the total byte count.
     offsets <- as.integer(seq(0, total_chars, length.out = n + 1))
     empty_array <- nanoarrow_array_init(na_string())
     nanoarrow_array_modify(
       empty_array,
       list(length = n, null_count = 0, buffers = list(NULL, offsets, chars))
     )
   }
   
   # Build a struct array with `n_cols` string columns (col001, col002, ...),
   # each column holding `n_rows` random strings of `n_chars` bytes.
   random_string_struct <- function(n_rows = 1024, n_cols = 1, n_chars = 16) {
     field_names <- sprintf("col%03d", seq_len(n_cols))
     struct_type <- na_struct(
       setNames(rep(list(na_string()), n_cols), field_names)
     )
     # One independently generated child array per column, in column order.
     children <- lapply(
       seq_len(n_cols),
       function(i) random_string_array(n_rows, n_chars = n_chars)
     )
     nanoarrow_array_modify(
       nanoarrow_array_init(struct_type),
       list(length = n_rows, null_count = 0, children = children)
     )
   }
   
   # Build a list of `n_batches` struct arrays, each generated independently
   # by random_string_struct() with the given row/column/character counts.
   random_string_batches <- function(n_batches = 1, n_rows = 1, n_cols = 1,
                                     n_chars = 16) {
     make_batch <- function(...) random_string_struct(n_rows, n_cols, n_chars)
     lapply(seq_len(n_batches), make_batch)
   }
   
   # Generate 100 batches with 160 string columns each (n_rows and n_chars
   # are left at the defaults of random_string_batches()).
   batches <- random_string_batches(n_batches = 100, n_cols = 160)
   # Wrap the list of batches in an array stream and write it to a file in
   # the working directory.
   stream <- basic_array_stream(batches)
   write_nanoarrow(stream, "many_strings.arrows")
   ```
   
   </details>
   
   ...in a separate R session, the issues around taking a long time for the GC 
to run seemed to go away (but it would be great to have a check!)
   
   ```r
   library(nanoarrow)
   
   # Read back the file written by the reproducer and convert the whole
   # stream in one go.
   df <- read_nanoarrow("many_strings.arrows") |> 
     convert_array_stream()
   # NOTE(review): the bare `f` below looks like a stray keystroke (no `f` is
   # defined in this snippet) -- confirm and drop it if so.
   f
   # Presumably the number of objects nanoarrow still holds preserved after
   # the conversion -- TODO confirm against the package internals.
   nanoarrow:::preserved_count()
   #> [1] 0
   # Time a single garbage collection; gcFirst = FALSE so that no extra
   # collection runs before the timed one.
   system.time(gc(), gcFirst = FALSE)
   #> user  system elapsed 
   #> 0.036   0.001   0.037
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to