This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new c214c3c6f5 Add benchmark for `infer_json_schema` (#9546)
c214c3c6f5 is described below
commit c214c3c6f539c50ff644a3d92571375c57ffe11b
Author: Alexander Rafferty <[email protected]>
AuthorDate: Fri Mar 13 20:54:04 2026 +1100
Add benchmark for `infer_json_schema` (#9546)
# Which issue does this PR close?
Split out from #9494 to make review easier. It simply adds a benchmark
for JSON schema inference.
# Rationale for this change
I have an open PR that significantly refactors the JSON schema inference
code, so I want confidence that the new code is not only correct but
also performs better than the existing code.
# What changes are included in this PR?
Adds a benchmark.
# Are these changes tested?
N/A
# Are there any user-facing changes?
No
---
arrow-json/Cargo.toml | 1 +
arrow-json/benches/json_reader.rs | 76 +++++++++++++++++++++++++++++++++++++--
2 files changed, 75 insertions(+), 2 deletions(-)
diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml
index be1f8d0ccd..851f0a244f 100644
--- a/arrow-json/Cargo.toml
+++ b/arrow-json/Cargo.toml
@@ -61,6 +61,7 @@ tokio = { version = "1.27", default-features = false, features = ["io-util"] }
bytes = "1.4"
criterion = { workspace = true, default-features = false }
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
+arbitrary = { version = "1.4.2", features = ["derive"] }
[[bench]]
name = "serde"
diff --git a/arrow-json/benches/json_reader.rs b/arrow-json/benches/json_reader.rs
index f87ba695eb..fccac68d9b 100644
--- a/arrow-json/benches/json_reader.rs
+++ b/arrow-json/benches/json_reader.rs
@@ -15,12 +15,14 @@
// specific language governing permissions and limitations
// under the License.
+use arbitrary::{Arbitrary, Unstructured};
use arrow_json::ReaderBuilder;
-use arrow_json::reader::Decoder;
+use arrow_json::reader::{Decoder, infer_json_schema};
use arrow_schema::{DataType, Field, Schema};
use criterion::{
BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main,
};
+use serde::Serialize;
use serde_json::{Map, Number, Value};
use std::fmt::Write;
use std::hint::black_box;
@@ -323,6 +325,75 @@ fn bench_serialize_list(c: &mut Criterion) {
});
}
+fn bench_schema_inference(c: &mut Criterion) {
+ const ROWS: usize = 1000;
+
+ #[derive(Serialize, Arbitrary, Debug)]
+ struct Row {
+ a: Option<i16>,
+ b: Option<String>,
+ c: Option<[i16; 8]>,
+ d: Option<[bool; 8]>,
+ e: Option<Inner>,
+ f: f64,
+ }
+
+ #[derive(Serialize, Arbitrary, Debug)]
+ struct Inner {
+ a: Option<i16>,
+ b: Option<String>,
+ c: Option<bool>,
+ }
+
+ let mut data = vec![];
+ for row in pseudorandom_sequence::<Row>(ROWS) {
+ serde_json::to_writer(&mut data, &row).unwrap();
+ data.push(b'\n');
+ }
+
+ let mut group = c.benchmark_group("infer_json_schema");
+ group.throughput(Throughput::Bytes(data.len() as u64));
+ group.sample_size(50);
+ group.measurement_time(std::time::Duration::from_secs(5));
+ group.warm_up_time(std::time::Duration::from_secs(2));
+ group.sampling_mode(SamplingMode::Flat);
+ group.bench_function(BenchmarkId::from_parameter(ROWS), |b| {
+ b.iter(|| infer_json_schema(black_box(&data[..]), None).unwrap())
+ });
+ group.finish();
+}
+
+fn pseudorandom_sequence<T: for<'a> Arbitrary<'a>>(len: usize) -> Vec<T> {
+ static RAND_BYTES: &[u8; 255] = &[
+ 12, 135, 254, 243, 18, 5, 38, 175, 60, 58, 204, 103, 15, 88, 201, 199, 57, 63, 56, 234,
+ 106, 111, 238, 119, 214, 50, 110, 89, 129, 185, 112, 115, 35, 239, 188, 189, 49, 184, 194,
+ 146, 108, 131, 213, 43, 236, 81, 61, 20, 21, 52, 223, 220, 215, 74, 210, 27, 190, 107, 174,
+ 142, 237, 66, 75, 1, 53, 181, 82, 158, 68, 134, 176, 229, 157, 116, 233, 153, 84, 139, 151,
+ 8, 171, 59, 105, 242, 40, 69, 94, 170, 4, 187, 212, 156, 65, 90, 192, 216, 29, 222, 122,
+ 230, 198, 154, 155, 245, 45, 178, 123, 23, 117, 168, 149, 17, 177, 48, 54, 241, 202, 44,
+ 232, 64, 221, 252, 161, 91, 93, 143, 240, 102, 172, 209, 224, 186, 197, 219, 247, 71, 36,
+ 101, 133, 113, 6, 137, 231, 162, 31, 7, 22, 138, 47, 136, 2, 244, 141, 173, 99, 25, 95, 96,
+ 85, 249, 42, 251, 217, 16, 205, 98, 203, 92, 114, 14, 163, 150, 144, 10, 125, 13, 195, 72,
+ 41, 67, 246, 11, 77, 132, 83, 37, 24, 183, 226, 250, 109, 248, 33, 76, 9, 55, 159, 34, 62,
+ 196, 87, 3, 39, 28, 166, 167, 255, 206, 79, 191, 228, 193, 179, 97, 182, 148, 73, 120, 211,
+ 253, 70, 227, 51, 169, 130, 145, 218, 78, 180, 165, 46, 127, 152, 26, 140, 207, 19, 100,
+ 104, 80, 164, 126, 118, 200, 128, 86, 160, 32, 30, 225, 147, 124, 121, 235, 208,
+ ];
+
+ let bytes: Vec<u8> = RAND_BYTES
+ .iter()
+ .flat_map(|i| RAND_BYTES.map(|j| i.wrapping_add(j)))
+ .take(1000 * len)
+ .collect();
+
+ let mut u = Unstructured::new(&bytes);
+
+ (0..len)
+ .map(|_| u.arbitrary::<T>().unwrap())
+ .take(len)
+ .collect()
+}
+
criterion_group!(
benches,
bench_decode_wide_object,
@@ -330,6 +401,7 @@ criterion_group!(
bench_binary_hex,
bench_wide_projection,
bench_decode_list,
- bench_serialize_list
+ bench_serialize_list,
+ bench_schema_inference
);
criterion_main!(benches);