scovich commented on code in PR #9097:
URL: https://github.com/apache/arrow-rs/pull/9097#discussion_r2662874314


##########
arrow-json/src/reader/mod.rs:
##########
@@ -313,10 +314,28 @@ impl ReaderBuilder {
 
         let num_fields = self.schema.flattened_fields().len();
 
+        // Extract projection field set from schema for projection-aware 
parsing
+        // - strict_mode: Disabled (unknown fields cause errors anyway)
+        // - non-strict mode: Enabled to skip JSON fields not present in the 
schema
+        // Performance overhead minimized via depth caching and short-circuit 
optimization
+        let projection = if self.strict_mode {
+            None
+        } else {
+            // Non-strict mode: always enable projection to skip unknown fields
+            match &data_type {
+                DataType::Struct(fields) if !fields.is_empty() => {
+                    let field_names: HashSet<String> =
+                        fields.iter().map(|f| f.name().clone()).collect();
+                    Some(field_names)
+                }
+                _ => None,
+            }

Review Comment:
   ```suggestion
           let projection: Option<HashSet<String>> = match &data_type {
               DataType::Struct(fields) if !fields.is_empty() && 
!self.strict_mode => {
                   Some(fields.iter().map(|f| f.name().clone()).collect())
               }
               _ => None,
   ```



##########
arrow-json/src/reader/tape.rs:
##########
@@ -237,8 +238,21 @@ enum DecoderState {
     ///
     /// Consists of `(literal, decoded length)`
     Literal(Literal, u8),
+    /// Skipping a value (for unprojected fields)
+    ///
+    /// Consists of:
+    /// - `depth`: Nesting level of objects/arrays being skipped (u32)
+    /// - `flags`: Bit-packed flags (in_string: bit 0, escape: bit 1)
+    SkipValue {
+        depth: u32,
+        flags: u8,
+    },
 }
 
+// Bit flags for SkipValue state
+const SKIP_IN_STRING: u8 = 1 << 0; // 0x01
+const SKIP_ESCAPE: u8 = 1 << 1; // 0x02

Review Comment:
   Why not just use the hex constants directly, out of curiosity?



##########
arrow-json/src/reader/tape.rs:
##########
@@ -449,7 +481,52 @@ impl TapeDecoder {
                 DecoderState::Colon => {
                     iter.skip_whitespace();
                     match next!(iter) {
-                        b':' => self.stack.pop(),
+                        b':' => {
+                            self.stack.pop();
+
+                            // Check projection: if the field is not in the 
projection set,
+                            // replace the Value state with SkipValue
+                            // NOTE: Only apply projection at the top level 
(nesting_depth == 1)
+                            // This means direct fields of the root object, 
not nested objects
+                            if self.current_nesting_depth == 1 {
+                                if let Some(ref projection) = self.projection {
+                                    // Get the field name from the last String 
element
+                                    if let 
Some(TapeElement::String(string_idx)) =
+                                        self.elements.last()
+                                    {
+                                        let string_idx = *string_idx as usize;
+                                        let start = self.offsets[string_idx];
+                                        let end = self.offsets[string_idx + 1];
+                                        let field_name = std::str::from_utf8(
+                                            &self.bytes[start..end],
+                                        )
+                                        .map_err(|e| {
+                                            ArrowError::JsonError(format!(
+                                                "Invalid UTF-8 in field name: 
{}",
+                                                e

Review Comment:
   nit
   ```suggestion
                                                   "Invalid UTF-8 in field 
name: {e}"
   ```



##########
arrow-json/src/reader/tape.rs:
##########
@@ -519,6 +596,118 @@ impl TapeDecoder {
                     }
                     *idx += 1;
                 },
+                // Skip a value for unprojected fields (optimized 
batch-processing version)
+                DecoderState::SkipValue { depth, flags } => {
+                    loop {
+                        if iter.is_empty() {
+                            break; // Need more data, preserve state
+                        }
+
+                        let in_string = (*flags & SKIP_IN_STRING) != 0;
+                        let escape = (*flags & SKIP_ESCAPE) != 0;
+
+                        if in_string {
+                            // Fast skip to next \ or " using SIMD
+                            let _ = iter.skip_chrs(b'\\', b'"');
+
+                            if iter.is_empty() {
+                                break;
+                            }
+
+                            let b = next!(iter);
+                            match b {
+                                b'\\' => *flags ^= SKIP_ESCAPE, // Toggle 
escape flag
+                                b'"' if !escape => {

Review Comment:
   I don't think I understand how this works? 
   If `flags` is changing but `escape` has already captured a stale value of 
the flag?



##########
arrow-json/src/reader/tape.rs:
##########
@@ -237,8 +238,21 @@ enum DecoderState {
     ///
     /// Consists of `(literal, decoded length)`
     Literal(Literal, u8),
+    /// Skipping a value (for unprojected fields)
+    ///
+    /// Consists of:
+    /// - `depth`: Nesting level of objects/arrays being skipped (u32)
+    /// - `flags`: Bit-packed flags (in_string: bit 0, escape: bit 1)
+    SkipValue {
+        depth: u32,
+        flags: u8,
+    },
 }
 
+// Bit flags for SkipValue state
+const SKIP_IN_STRING: u8 = 1 << 0; // 0x01
+const SKIP_ESCAPE: u8 = 1 << 1; // 0x02

Review Comment:
   Also -- my intuition is that these two flags are only needed because 
`SkipValue` does too much. The newly introduced code has a _lot_ of looping and 
nesting, where the existing enum variants are quite flat. The difference seems 
to be that the existing variants hand off to a new state whenever they detect a 
state change?
   
   So e.g. instead of messing with flags, one might declare three new enum 
variants, `SkipValue`, `SkipString` and `SkipEscape`, where each nests 
exclusively inside the one before it? e.g. if the projection skipped field 
`foo`, then the following JSON fragment:
   ```json
   { 
     "foo": {
       "bar": "hello\nworld!"
     }
   }
   ```
   would:
   * push a `SkipValue` as soon as `:` detects that `foo` is not selected
   * push a `SkipString` as soon as it hits the opening `"` of the string
   * push a `SkipEscape` as soon as it hits the `\` inside the string
   * pop once the escape was processed
   * pop once the closing `"` is found
   * pop once the next field starts (or whatever is currently the ending 
condition for `SkipValue`)



##########
arrow-json/src/reader/tape.rs:
##########
@@ -237,8 +238,21 @@ enum DecoderState {
     ///
     /// Consists of `(literal, decoded length)`
     Literal(Literal, u8),
+    /// Skipping a value (for unprojected fields)
+    ///
+    /// Consists of:
+    /// - `depth`: Nesting level of objects/arrays being skipped (u32)
+    /// - `flags`: Bit-packed flags (in_string: bit 0, escape: bit 1)
+    SkipValue {
+        depth: u32,
+        flags: u8,
+    },
 }
 
+// Bit flags for SkipValue state
+const SKIP_IN_STRING: u8 = 1 << 0; // 0x01
+const SKIP_ESCAPE: u8 = 1 << 1; // 0x02

Review Comment:
   By the same token, one would arguably want to push multiple `SkipValue` 
states instead of tracking nesting depth with a new variable? But then enum 
variants start to proliferate (basically need two of each).
   
   Would it instead make sense to have a single skip offset that is the first 
stack index being skipped? 
   And then have pairs of match arms that decide what state gets pushed vs. 
merely traversed?



##########
arrow-json/src/reader/tape.rs:
##########
@@ -519,6 +596,118 @@ impl TapeDecoder {
                     }
                     *idx += 1;
                 },
+                // Skip a value for unprojected fields (optimized 
batch-processing version)
+                DecoderState::SkipValue { depth, flags } => {
+                    loop {
+                        if iter.is_empty() {
+                            break; // Need more data, preserve state
+                        }

Review Comment:
   nit: why not just
   ```suggestion
                       while !iter.is_empty() {
   ```



##########
arrow-json/src/reader/tape.rs:
##########
@@ -519,6 +596,118 @@ impl TapeDecoder {
                     }
                     *idx += 1;
                 },
+                // Skip a value for unprojected fields (optimized 
batch-processing version)
+                DecoderState::SkipValue { depth, flags } => {
+                    loop {
+                        if iter.is_empty() {
+                            break; // Need more data, preserve state
+                        }
+
+                        let in_string = (*flags & SKIP_IN_STRING) != 0;
+                        let escape = (*flags & SKIP_ESCAPE) != 0;
+
+                        if in_string {
+                            // Fast skip to next \ or " using SIMD
+                            let _ = iter.skip_chrs(b'\\', b'"');
+
+                            if iter.is_empty() {
+                                break;
+                            }
+
+                            let b = next!(iter);

Review Comment:
   The `next!` macro handles empty iterators with a `break`, no? 
   Preceding it with an `is_empty` check seems redundant?



##########
arrow-json/src/reader/tape.rs:
##########
@@ -449,7 +481,52 @@ impl TapeDecoder {
                 DecoderState::Colon => {
                     iter.skip_whitespace();
                     match next!(iter) {
-                        b':' => self.stack.pop(),
+                        b':' => {
+                            self.stack.pop();
+
+                            // Check projection: if the field is not in the 
projection set,
+                            // replace the Value state with SkipValue
+                            // NOTE: Only apply projection at the top level 
(nesting_depth == 1)
+                            // This means direct fields of the root object, 
not nested objects
+                            if self.current_nesting_depth == 1 {
+                                if let Some(ref projection) = self.projection {
+                                    // Get the field name from the last String 
element
+                                    if let 
Some(TapeElement::String(string_idx)) =
+                                        self.elements.last()
+                                    {
+                                        let string_idx = *string_idx as usize;
+                                        let start = self.offsets[string_idx];
+                                        let end = self.offsets[string_idx + 1];
+                                        let field_name = std::str::from_utf8(
+                                            &self.bytes[start..end],
+                                        )
+                                        .map_err(|e| {
+                                            ArrowError::JsonError(format!(
+                                                "Invalid UTF-8 in field name: 
{}",
+                                                e
+                                            ))
+                                        })?;
+
+                                        if !projection.contains(field_name) {
+                                            // Field not in projection: skip 
its value
+                                            // CRITICAL: Remove the field name 
from tape to maintain structure
+                                            // The tape must have paired 
field_name:value entries
+                                            self.elements.pop(); // Remove the 
String element
+                                            self.bytes.truncate(start); // 
Remove the field name bytes
+                                            self.offsets.pop(); // Remove the 
offset entry
+
+                                            // Replace Value state with 
SkipValue
+                                            if let Some(last) = 
self.stack.last_mut() {

Review Comment:
   is it actually possible for the stack to be empty here?



##########
arrow-json/src/reader/tape.rs:
##########
@@ -449,7 +481,52 @@ impl TapeDecoder {
                 DecoderState::Colon => {
                     iter.skip_whitespace();
                     match next!(iter) {
-                        b':' => self.stack.pop(),
+                        b':' => {
+                            self.stack.pop();
+
+                            // Check projection: if the field is not in the 
projection set,
+                            // replace the Value state with SkipValue
+                            // NOTE: Only apply projection at the top level 
(nesting_depth == 1)
+                            // This means direct fields of the root object, 
not nested objects
+                            if self.current_nesting_depth == 1 {
+                                if let Some(ref projection) = self.projection {
+                                    // Get the field name from the last String 
element
+                                    if let 
Some(TapeElement::String(string_idx)) =
+                                        self.elements.last()

Review Comment:
   I think if-let chaining is available now?
   
   ```suggestion
                               if self.current_nesting_depth == 1
                                   && let Some(ref projection) = self.projection
                                   && let Some(TapeElement::String(string_idx)) 
= self.elements.last()
                               {
   ```
   (this is so deeply nested that line lengths are a problem, tho... rule of 30 
suggests it's anyway worth factoring out a helper method?)
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to