tustvold commented on code in PR #3365:
URL: https://github.com/apache/arrow-rs/pull/3365#discussion_r1051685776
##########
arrow-csv/src/reader/mod.rs:
##########
@@ -393,142 +428,57 @@ impl<R: Read> Reader<R> {
projection: Option<Vec<usize>>,
datetime_format: Option<String>,
) -> Self {
- let csv_reader =
- Self::build_csv_reader(reader, has_header, delimiter, None, None, None);
- Self::from_csv_reader(
- csv_reader,
+ Self::new(
+ reader,
schema,
has_header,
+ delimiter,
batch_size,
bounds,
projection,
datetime_format,
)
}
-
- fn build_csv_reader(
- reader: R,
- has_header: bool,
- delimiter: Option<u8>,
- escape: Option<u8>,
- quote: Option<u8>,
- terminator: Option<u8>,
- ) -> csv::Reader<R> {
- let mut reader_builder = csv::ReaderBuilder::new();
- reader_builder.has_headers(has_header);
-
- if let Some(c) = delimiter {
- reader_builder.delimiter(c);
- }
- reader_builder.escape(escape);
- if let Some(c) = quote {
- reader_builder.quote(c);
- }
- if let Some(t) = terminator {
- reader_builder.terminator(csv::Terminator::Any(t));
- }
- reader_builder.from_reader(reader)
- }
-
- fn from_csv_reader(
- mut csv_reader: csv::Reader<R>,
- schema: SchemaRef,
- has_header: bool,
- batch_size: usize,
- bounds: Bounds,
- projection: Option<Vec<usize>>,
- datetime_format: Option<String>,
- ) -> Self {
- let (start, end) = match bounds {
- None => (0, usize::MAX),
- Some((start, end)) => (start, end),
- };
-
- // First we will skip `start` rows
- // note that this skips by iteration. This is because in general it is not possible
- // to seek in CSV. However, skipping still saves the burden of creating arrow arrays,
- // which is a slow operation that scales with the number of columns
-
- let mut record = ByteRecord::new();
- // Skip first start items
- for _ in 0..start {
- let res = csv_reader.read_byte_record(&mut record);
- if !res.unwrap_or(false) {
- break;
- }
- }
-
- // Initialize batch_records with StringRecords so they
- // can be reused across batches
- let mut batch_records = Vec::with_capacity(batch_size);
- batch_records.resize_with(batch_size, Default::default);
-
- Self {
- schema,
- projection,
- reader: csv_reader,
- line_number: if has_header { start + 1 } else { start },
Review Comment:
This is the cause of #3364: when `has_header` is set, it increments the start bound but not the end.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
users@infra.apache.org