Copilot commented on code in PR #101: URL: https://github.com/apache/datasketches-rust/pull/101#discussion_r2828819866
########## CHANGELOG.md: ########## @@ -13,6 +13,7 @@ All significant changes to this project will be documented in this file. * `CountMinSketch` with unsigned values now supports `halve` and `decay` operations. * `CpcSketch` and `CpcUnion` are now available for cardinality estimation. +* `CpcWrapper` is now available for read `CpcSketch`'s estimation from a serialized sketch without full deserialization. Review Comment: The phrase "for read `CpcSketch`'s estimation" is grammatically incorrect. It should be "for reading `CpcSketch`'s estimation" or "to read `CpcSketch` estimation". ```suggestion * `CpcWrapper` is now available for reading `CpcSketch`'s estimation from a serialized sketch without full deserialization. ``` ########## datasketches/src/cpc/wrapper.rs: ########## @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::codec::SketchSlice; +use crate::codec::family::Family; +use crate::codec::utility::ensure_preamble_longs_in; +use crate::codec::utility::ensure_serial_version_is; +use crate::common::NumStdDev; +use crate::cpc::MAX_LG_K; +use crate::cpc::MIN_LG_K; +use crate::cpc::estimator::estimate; +use crate::cpc::estimator::lower_bound; +use crate::cpc::estimator::upper_bound; +use crate::cpc::serialization::FLAG_COMPRESSED; +use crate::cpc::serialization::FLAG_HAS_HIP; +use crate::cpc::serialization::FLAG_HAS_TABLE; +use crate::cpc::serialization::FLAG_HAS_WINDOW; +use crate::cpc::serialization::SERIAL_VERSION; +use crate::cpc::serialization::make_preamble_ints; +use crate::error::Error; +use crate::error::ErrorKind; + +/// A read-only view of a serialized image of a CpcSketch. +#[derive(Debug, Clone)] +pub struct CpcWrapper { + lg_k: u8, + merge_flag: bool, + num_coupons: u32, + hip_est_accum: f64, +} + +impl CpcWrapper { + /// Creates a new `CpcWrapper` from the given byte slice without copying. + pub fn new(bytes: &[u8]) -> Result<Self, Error> { + fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> Error { + move |_| Error::insufficient_data(tag) + } + + let mut cursor = SketchSlice::new(bytes); + let preamble_ints = cursor.read_u8().map_err(make_error("preamble_ints"))?; + let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; + let family_id = cursor.read_u8().map_err(make_error("family_id"))?; + Family::CPC.validate_id(family_id)?; + ensure_serial_version_is(SERIAL_VERSION, serial_version)?; + + let lg_k = cursor.read_u8().map_err(make_error("lg_k"))?; + let first_interesting_column = cursor + .read_u8() + .map_err(make_error("first_interesting_column"))?; + if !(MIN_LG_K..=MAX_LG_K).contains(&lg_k) { + return Err(Error::invalid_argument(format!( + "lg_k out of range; got {}", + lg_k + ))); + } + if first_interesting_column > 63 { + return Err(Error::invalid_argument(format!( + "first_interesting_column out of range; got {}", + first_interesting_column + ))); + } + + let flags = cursor.read_u8().map_err(make_error("flags"))?; + let is_compressed = flags & (1 << FLAG_COMPRESSED) != 0; + if !is_compressed { + return Err(Error::new( + ErrorKind::InvalidData, + "only compressed sketches are supported", + )); + } + let has_hip = flags & (1 << FLAG_HAS_HIP) != 0; + let has_table = flags & (1 << FLAG_HAS_TABLE) != 0; + let has_window = flags & (1 << FLAG_HAS_WINDOW) != 0; + + cursor.read_u16_le().map_err(make_error("seed_hash"))?; + + let mut num_coupons = 0; + let mut hip_est_accum = 0.0; + + if has_table || has_window { + num_coupons = cursor.read_u32_le().map_err(make_error("num_coupons"))?; + if has_table && has_window { + cursor + .read_u32_le() + .map_err(make_error("table_num_entries"))?; + if has_hip { + cursor.read_f64_le().map_err(make_error("kxp"))?; + hip_est_accum = cursor.read_f64_le().map_err(make_error("hip_est_accum"))?; + } + } + if has_table { + cursor + .read_u32_le() + .map_err(make_error("table_data_words"))? as usize; + } + if has_window { + cursor + .read_u32_le() + .map_err(make_error("window_data_words"))? as usize; Review Comment: The cast to `usize` is unnecessary here since the result is immediately discarded. The field is intentionally read to advance the cursor position in the byte stream but its value is not used. Consider removing the cast for clarity. ```suggestion .map_err(make_error("table_data_words"))?; } if has_window { cursor .read_u32_le() .map_err(make_error("window_data_words"))?; ``` ########## datasketches/tests/cpc_wrapper_test.rs: ########## @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datasketches::common::NumStdDev; +use datasketches::cpc::CpcSketch; +use datasketches::cpc::CpcUnion; +use datasketches::cpc::CpcWrapper; +use googletest::assert_that; +use googletest::prelude::contains_substring; +use googletest::prelude::eq; + +#[test] +fn test_cpc_wrapper() { + let lg_k = 10; + let mut sk1 = CpcSketch::new(lg_k); + let mut sk2 = CpcSketch::new(lg_k); + let mut sk_dst = CpcSketch::new(lg_k); + + let n = 100000; + for i in 0..n { + sk1.update(i); + sk2.update(i + n); + sk_dst.update(i); + sk_dst.update(i + n); + } + + let dst_est = sk_dst.estimate(); + let dst_lb = sk_dst.lower_bound(NumStdDev::Two); + let dst_ub = sk_dst.upper_bound(NumStdDev::Two); + + let concat_bytes = sk_dst.serialize(); + let concat_wrapper = CpcWrapper::new(&concat_bytes).unwrap(); + assert_that!(concat_wrapper.lg_k(), eq(lg_k)); + assert_that!(concat_wrapper.estimate(), eq(dst_est)); + assert_that!(concat_wrapper.lower_bound(NumStdDev::Two), eq(dst_lb)); + assert_that!(concat_wrapper.upper_bound(NumStdDev::Two), eq(dst_ub)); + + let mut union = CpcUnion::new(lg_k); + union.update(&sk1); + union.update(&sk2); + let merged = union.to_sketch(); + let merged_est = merged.estimate(); + let merged_lb = merged.lower_bound(NumStdDev::Two); + let merged_ub = merged.upper_bound(NumStdDev::Two); + + let merged_bytes = merged.serialize(); + let merged_wrapper = CpcWrapper::new(&merged_bytes).unwrap(); + assert_that!(merged_wrapper.lg_k(), eq(lg_k)); + assert_that!(merged_wrapper.estimate(), eq(merged_est)); + assert_that!(merged_wrapper.lower_bound(NumStdDev::Two), eq(merged_lb)); + assert_that!(merged_wrapper.upper_bound(NumStdDev::Two), eq(merged_ub)); +} + +#[test] +fn test_is_compressed() { + let sketch = CpcSketch::new(10); + let mut bytes = sketch.serialize(); + bytes[5] &= (-3i8) as u8; // clear compressed flag + let err = CpcWrapper::new(&bytes).unwrap_err(); + assert_that!( + err.message(), + contains_substring("only compressed sketches are supported") + ); +} Review Comment: Consider adding a test case for the `is_empty()` method. This would be consistent with other sketch tests in the codebase (e.g., cpc_update_test.rs:30, bloom_serialization_test.rs:40) and would verify that an empty serialized sketch correctly returns true for `is_empty()` and a non-empty sketch returns false. ```suggestion } #[test] fn test_is_empty() { // Empty sketch should be reported as empty after serialization and wrapping let empty_sketch = CpcSketch::new(10); let empty_bytes = empty_sketch.serialize(); let empty_wrapper = CpcWrapper::new(&empty_bytes).unwrap(); assert_that!(empty_wrapper.is_empty(), eq(true)); // Non-empty sketch should be reported as non-empty let mut non_empty_sketch = CpcSketch::new(10); non_empty_sketch.update(1u64); let non_empty_bytes = non_empty_sketch.serialize(); let non_empty_wrapper = CpcWrapper::new(&non_empty_bytes).unwrap(); assert_that!(non_empty_wrapper.is_empty(), eq(false)); } ``` ########## datasketches/src/cpc/wrapper.rs: ########## @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::codec::SketchSlice; +use crate::codec::family::Family; +use crate::codec::utility::ensure_preamble_longs_in; +use crate::codec::utility::ensure_serial_version_is; +use crate::common::NumStdDev; +use crate::cpc::MAX_LG_K; +use crate::cpc::MIN_LG_K; +use crate::cpc::estimator::estimate; +use crate::cpc::estimator::lower_bound; +use crate::cpc::estimator::upper_bound; +use crate::cpc::serialization::FLAG_COMPRESSED; +use crate::cpc::serialization::FLAG_HAS_HIP; +use crate::cpc::serialization::FLAG_HAS_TABLE; +use crate::cpc::serialization::FLAG_HAS_WINDOW; +use crate::cpc::serialization::SERIAL_VERSION; +use crate::cpc::serialization::make_preamble_ints; +use crate::error::Error; +use crate::error::ErrorKind; + +/// A read-only view of a serialized image of a CpcSketch. +#[derive(Debug, Clone)] +pub struct CpcWrapper { + lg_k: u8, + merge_flag: bool, + num_coupons: u32, + hip_est_accum: f64, +} + +impl CpcWrapper { + /// Creates a new `CpcWrapper` from the given byte slice without copying. + pub fn new(bytes: &[u8]) -> Result<Self, Error> { + fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> Error { + move |_| Error::insufficient_data(tag) + } + + let mut cursor = SketchSlice::new(bytes); + let preamble_ints = cursor.read_u8().map_err(make_error("preamble_ints"))?; + let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; + let family_id = cursor.read_u8().map_err(make_error("family_id"))?; + Family::CPC.validate_id(family_id)?; + ensure_serial_version_is(SERIAL_VERSION, serial_version)?; + + let lg_k = cursor.read_u8().map_err(make_error("lg_k"))?; + let first_interesting_column = cursor + .read_u8() + .map_err(make_error("first_interesting_column"))?; + if !(MIN_LG_K..=MAX_LG_K).contains(&lg_k) { + return Err(Error::invalid_argument(format!( + "lg_k out of range; got {}", + lg_k + ))); + } + if first_interesting_column > 63 { + return Err(Error::invalid_argument(format!( + "first_interesting_column out of range; got {}", + first_interesting_column + ))); + } + + let flags = cursor.read_u8().map_err(make_error("flags"))?; + let is_compressed = flags & (1 << FLAG_COMPRESSED) != 0; + if !is_compressed { + return Err(Error::new( + ErrorKind::InvalidData, + "only compressed sketches are supported", + )); + } + let has_hip = flags & (1 << FLAG_HAS_HIP) != 0; + let has_table = flags & (1 << FLAG_HAS_TABLE) != 0; + let has_window = flags & (1 << FLAG_HAS_WINDOW) != 0; + + cursor.read_u16_le().map_err(make_error("seed_hash"))?; + + let mut num_coupons = 0; + let mut hip_est_accum = 0.0; + + if has_table || has_window { + num_coupons = cursor.read_u32_le().map_err(make_error("num_coupons"))?; + if has_table && has_window { + cursor + .read_u32_le() + .map_err(make_error("table_num_entries"))?; + if has_hip { + cursor.read_f64_le().map_err(make_error("kxp"))?; + hip_est_accum = cursor.read_f64_le().map_err(make_error("hip_est_accum"))?; + } + } + if has_table { + cursor + .read_u32_le() + .map_err(make_error("table_data_words"))? as usize; + } + if has_window { + cursor + .read_u32_le() + .map_err(make_error("window_data_words"))? as usize; Review Comment: The cast to `usize` is unnecessary here since the result is immediately discarded. The field is intentionally read to advance the cursor position in the byte stream but its value is not used. Consider removing the cast for clarity. ```suggestion .map_err(make_error("table_data_words"))?; } if has_window { cursor .read_u32_le() .map_err(make_error("window_data_words"))?; ``` ########## datasketches/src/cpc/wrapper.rs: ########## @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::codec::SketchSlice; +use crate::codec::family::Family; +use crate::codec::utility::ensure_preamble_longs_in; +use crate::codec::utility::ensure_serial_version_is; +use crate::common::NumStdDev; +use crate::cpc::MAX_LG_K; +use crate::cpc::MIN_LG_K; +use crate::cpc::estimator::estimate; +use crate::cpc::estimator::lower_bound; +use crate::cpc::estimator::upper_bound; +use crate::cpc::serialization::FLAG_COMPRESSED; +use crate::cpc::serialization::FLAG_HAS_HIP; +use crate::cpc::serialization::FLAG_HAS_TABLE; +use crate::cpc::serialization::FLAG_HAS_WINDOW; +use crate::cpc::serialization::SERIAL_VERSION; +use crate::cpc::serialization::make_preamble_ints; +use crate::error::Error; +use crate::error::ErrorKind; + +/// A read-only view of a serialized image of a CpcSketch. +#[derive(Debug, Clone)] +pub struct CpcWrapper { + lg_k: u8, + merge_flag: bool, + num_coupons: u32, + hip_est_accum: f64, +} + +impl CpcWrapper { + /// Creates a new `CpcWrapper` from the given byte slice without copying. Review Comment: The phrase "without copying" in the doc comment might be slightly misleading. While the wrapper doesn't retain a reference to the input bytes, it does extract and store values from them (lg_k, merge_flag, num_coupons, hip_est_accum) which involves copying those values. Consider either removing this phrase or clarifying it to "without retaining a reference to the byte slice" to be more precise about what is meant. ```suggestion /// Creates a new `CpcWrapper` from the given byte slice without retaining a reference to it. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
