paleolimbot opened a new pull request, #390:
URL: https://github.com/apache/arrow-nanoarrow/pull/390

   This PR adds bindings to nanoarrow's IPC reader from R. The entrypoint for a 
user is `read_nanoarrow()`, which accepts raw vectors, connections, and file 
paths (thin wrapper around connections).
   
   These are all slower than the arrow package, which has more tools at its 
disposal to prevent copies.
   
   ``` r
   library(arrow, warn.conflicts = FALSE)
   library(nanoarrow)
   
   # Basic read example
   tf <- tempfile()
   write_ipc_stream(dplyr::starwars, tf)
   read_nanoarrow(tf) |> tibble::as_tibble()
   #> # A tibble: 87 × 14
   #>    name     height  mass hair_color skin_color eye_color birth_year sex   
gender
   #>    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
<chr> 
   #>  1 Luke Sk…    172    77 blond      fair       blue            19   male  
mascu…
   #>  2 C-3PO       167    75 <NA>       gold       yellow         112   none  
mascu…
   #>  3 R2-D2        96    32 <NA>       white, bl… red             33   none  
mascu…
   #>  4 Darth V…    202   136 none       white      yellow          41.9 male  
mascu…
   #>  5 Leia Or…    150    49 brown      light      brown           19   fema… 
femin…
   #>  6 Owen La…    178   120 brown, gr… light      blue            52   male  
mascu…
   #>  7 Beru Wh…    165    75 brown      light      blue            47   fema… 
femin…
   #>  8 R5-D4        97    32 <NA>       white, red red             NA   none  
mascu…
   #>  9 Biggs D…    183    84 black      light      brown           24   male  
mascu…
   #> 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  
mascu…
   #> # ℹ 77 more rows
   #> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list<chr>>,
   #> #   vehicles <list<chr>>, starships <list<chr>>
   
   df_bigish <- nanoarrow:::vec_gen(data.frame(x = character()), n = 1e6)
   write_ipc_stream(df_bigish, tf)
   
   # Wrapper because mmap is apparently not passed through from
   # read_ipc_stream(), and this is pretty significant
   read_ipc_stream_wrap <- function(f, ..., mmap) {
     # Open `f` via arrow's internal make_readable_file() so that `mmap` can be
     # set explicitly (with random_access = FALSE); everything in `...` is
     # forwarded unchanged to arrow::read_ipc_stream().
     arrow::read_ipc_stream(
       arrow:::make_readable_file(f, mmap = mmap, random_access = FALSE),
       ...
     )
   }
   
   tf_raw <- brio::read_file_raw(tf)
   
   # Slower than arrow for raw vector input because of C implementation,
   # which doesn't currently share the global buffer (just shares buffers
   # between columns within a single batch)
   bench::mark(
     nanoarrow = read_nanoarrow(tf_raw) |> collect_array_stream(),
     arrow = read_ipc_stream(buffer(tf_raw), as_data_frame = FALSE),
     check = FALSE
   )
   #> # A tibble: 2 × 6
   #>   expression      min   median `itr/sec` mem_alloc `gc/sec`
   #>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
   #> 1 nanoarrow    1.27ms   1.84ms      439.    41.5KB     2.18
   #> 2 arrow      509.26µs 528.65µs     1821.     3.6MB    79.3
   
   
   # Slower than arrow, maybe because of C implementation, but definitely
   # because it uses base::readBin() which necessitates an extra copy
   bench::mark(
     nanoarrow = read_nanoarrow(tf) |> collect_array_stream(),
     arrow_mmap = read_ipc_stream_wrap(tf, mmap = TRUE, as_data_frame = FALSE),
     arrow = read_ipc_stream_wrap(tf, mmap = FALSE, as_data_frame = FALSE),
     check = FALSE
   )
   #> # A tibble: 3 × 6
   #>   expression      min   median `itr/sec` mem_alloc `gc/sec`
   #>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
   #> 1 nanoarrow    5.18ms   5.66ms      174.    16.2MB   189.  
   #> 2 arrow_mmap 613.48µs  640.5µs     1526.   528.6KB    13.9 
   #> 3 arrow        2.18ms   2.84ms      339.   551.6KB     4.06
   ```
   
   <sup>Created on 2024-02-19 with [reprex 
v2.0.2](https://reprex.tidyverse.org)</sup>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to