alamb commented on code in PR #7263:
URL: https://github.com/apache/arrow-rs/pull/7263#discussion_r1989799617


##########
arrow-json/src/reader/mod.rs:
##########
@@ -902,6 +905,43 @@ mod tests {
         assert_eq!(col2.value(4), "");
     }
 
+    #[test]
+    fn test_string_with_uft8view() {
+        let buf = r#"
+        {"a": "1", "b": "2"}
+        {"a": "hello", "b": "shoo"}
+        {"b": "\t😁foo", "a": 
"\nfoobar\ud83d\ude00\u0061\u0073\u0066\u0067\u00FF"}
+
+        {"b": null}
+        {"b": "", "a": null}
+
+        "#;
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8View, true),
+            Field::new("b", DataType::LargeUtf8, true),
+        ]));
+
+        let batches = do_read(buf, 1024, false, false, schema);
+        assert_eq!(batches.len(), 1);
+
+        let col1 = batches[0].column(0).as_string_view();
+        assert_eq!(col1.null_count(), 2);
+        assert_eq!(col1.value(0), "1");
+        assert_eq!(col1.value(1), "hello");
+        assert_eq!(col1.value(2), "\nfoobar😀asfgÿ");

Review Comment:
   this value has more than 12 bytes so it will exercise the longer string view 
path



##########
arrow-json/src/reader/string_view_array.rs:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::builder::GenericByteViewBuilder;
+use arrow_array::types::StringViewType;
+use arrow_array::Array;
+use arrow_data::ArrayData;
+use arrow_schema::ArrowError;
+
+use crate::reader::tape::{Tape, TapeElement};
+use crate::reader::ArrayDecoder;
+
+const TRUE: &str = "true";
+const FALSE: &str = "false";
+
+pub struct StringViewArrayDecoder {
+    coerce_primitive: bool,
+}
+
+impl StringViewArrayDecoder {
+    pub fn new(coerce_primitive: bool) -> Self {
+        Self { coerce_primitive }
+    }
+}
+
+impl ArrayDecoder for StringViewArrayDecoder {
+    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, 
ArrowError> {
+        let coerce = self.coerce_primitive;
+        let mut data_capacity = 0;
+        for &p in pos {
+            match tape.get(p) {
+                TapeElement::String(idx) => {
+                    data_capacity += tape.get_string(idx).len();
+                }
+                TapeElement::Null => { /* 不增加容量 */ }

Review Comment:
   would it be possible to use english in the comments?



##########
arrow-json/src/reader/string_view_array.rs:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::builder::GenericByteViewBuilder;
+use arrow_array::types::StringViewType;
+use arrow_array::Array;
+use arrow_data::ArrayData;
+use arrow_schema::ArrowError;
+
+use crate::reader::tape::{Tape, TapeElement};
+use crate::reader::ArrayDecoder;
+
+const TRUE: &str = "true";
+const FALSE: &str = "false";
+
+pub struct StringViewArrayDecoder {
+    coerce_primitive: bool,
+}
+
+impl StringViewArrayDecoder {
+    pub fn new(coerce_primitive: bool) -> Self {
+        Self { coerce_primitive }
+    }
+}
+
+impl ArrayDecoder for StringViewArrayDecoder {
+    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, 
ArrowError> {
+        let coerce = self.coerce_primitive;
+        let mut data_capacity = 0;
+        for &p in pos {
+            match tape.get(p) {
+                TapeElement::String(idx) => {
+                    data_capacity += tape.get_string(idx).len();
+                }
+                TapeElement::Null => { /* 不增加容量 */ }
+                TapeElement::True if coerce => {
+                    data_capacity += TRUE.len();
+                }
+                TapeElement::False if coerce => {
+                    data_capacity += FALSE.len();
+                }
+                TapeElement::Number(idx) if coerce => {
+                    data_capacity += tape.get_string(idx).len();
+                }
+                TapeElement::I64(_) if coerce => {
+                    data_capacity += 10;
+                }
+                TapeElement::I32(_) if coerce => {
+                    data_capacity += 10;
+                }
+                TapeElement::F32(_) if coerce => {
+                    data_capacity += 10;
+                }
+                TapeElement::F64(_) if coerce => {
+                    data_capacity += 10;
+                }
+                _ => {
+                    return Err(tape.error(p, "string"));
+                }
+            }
+        }
+
+        let mut builder = 
GenericByteViewBuilder::<StringViewType>::with_capacity(data_capacity);
+
+        for &p in pos {
+            match tape.get(p) {
+                TapeElement::String(idx) => {
+                    builder.append_value(tape.get_string(idx));
+                }
+                TapeElement::Null => {
+                    builder.append_null();
+                }
+                TapeElement::True if coerce => {
+                    builder.append_value(TRUE);
+                }
+                TapeElement::False if coerce => {
+                    builder.append_value(FALSE);
+                }
+                TapeElement::Number(idx) if coerce => {
+                    builder.append_value(tape.get_string(idx));
+                }
+                TapeElement::I64(high) if coerce => match tape.get(p + 1) {
+                    TapeElement::I32(low) => {
+                        let val = ((high as i64) << 32) | (low as u32) as i64;
+                        builder.append_value(val.to_string());
+                    }
+                    _ => unreachable!(),
+                },
+                TapeElement::I32(n) if coerce => {
+                    builder.append_value(n.to_string());

Review Comment:
   This allocation is quite unfortunate (`to_string()` allocates a string) but 
I see this is consistent with the other JSON string implementation:
   
   
https://github.com/apache/arrow-rs/blob/a0c3186c55ac8ed3f6b8a15d1305548fd6305ebb/arrow-json/src/reader/string_array.rs#L111-L110



##########
arrow-json/src/reader/string_view_array.rs:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::builder::GenericByteViewBuilder;
+use arrow_array::types::StringViewType;
+use arrow_array::Array;
+use arrow_data::ArrayData;
+use arrow_schema::ArrowError;
+
+use crate::reader::tape::{Tape, TapeElement};
+use crate::reader::ArrayDecoder;
+
+const TRUE: &str = "true";
+const FALSE: &str = "false";
+
+pub struct StringViewArrayDecoder {
+    coerce_primitive: bool,
+}
+
+impl StringViewArrayDecoder {
+    pub fn new(coerce_primitive: bool) -> Self {
+        Self { coerce_primitive }
+    }
+}
+
+impl ArrayDecoder for StringViewArrayDecoder {
+    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, 
ArrowError> {
+        let coerce = self.coerce_primitive;
+        let mut data_capacity = 0;
+        for &p in pos {

Review Comment:
   StringView is different that StringArray in that only "long" strings (longer 
than 12 bytes) contributed to the data
   
   Thus I think these calculations should be adjusted:
   1. only increase capacity for strings if the data is over 12 bytes
   2. don't increase for boolean
   3. For I32 probably we can use zero as well (as the longest such value is 
`-2147483647` whcih is less than 12 bytes)
   4. For I64 maybe we could be  more sophisticated and only add data length if 
the value is over `999999999999` etc.
   5. For F32 and F64 I am not sure what hte maximum length of a string 
representation is so we should probably keep the existing estimate
   
   More details on the layout are here: 
https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to