Copilot commented on code in PR #87:
URL: https://github.com/apache/datasketches-rust/pull/87#discussion_r2780424101


##########
datasketches/src/theta/sketch.rs:
##########
@@ -248,6 +263,199 @@ impl ThetaSketch {
         )
         .expect("theta should always be valid")
     }
+
+    /// Serialize the sketch to bytes in compact format.
+    ///
+    /// The serialized format is compatible with Java and C++ DataSketches
+    /// implementations.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::theta::ThetaSketch;
+    /// let mut sketch = ThetaSketch::builder().build();
+    /// sketch.update("apple");
+    /// let bytes = sketch.serialize();
+    /// let restored = ThetaSketch::deserialize(&bytes).unwrap();
+    /// assert_eq!(sketch.estimate(), restored.estimate());
+    /// ```
+    pub fn serialize(&self) -> Vec<u8> {
+        // Determine preamble size based on state
+        let is_empty = self.is_empty();
+        let is_estimation_mode = self.is_estimation_mode();
+
+        let preamble_longs = if is_empty {
+            PREAMBLE_LONGS_EMPTY
+        } else if is_estimation_mode {
+            PREAMBLE_LONGS_ESTIMATION
+        } else {
+            PREAMBLE_LONGS_EXACT
+        };
+
+        let num_entries = self.num_retained();
+        let preamble_bytes = (preamble_longs as usize) * 8;
+        let data_bytes = num_entries * HASH_SIZE_BYTES;
+        let total_bytes = preamble_bytes + data_bytes;
+
+        let mut bytes = SketchBytes::with_capacity(total_bytes);
+
+        // Build flags byte
+        let mut flags: u8 = FLAG_IS_COMPACT | FLAG_IS_READ_ONLY | 
FLAG_IS_ORDERED;
+        if is_empty {
+            flags |= FLAG_IS_EMPTY;
+        }
+
+        // Write preamble (first 8 bytes always present)
+        bytes.write_u8(preamble_longs);
+        bytes.write_u8(SERIAL_VERSION);
+        bytes.write_u8(THETA_FAMILY_ID);
+        bytes.write_u8(self.lg_k());
+        bytes.write_u8(self.lg_k()); // lgArr = lgK for compact
+        bytes.write_u8(flags);
+        bytes.write_u16_le(compute_seed_hash(self.table.seed()));
+
+        // Write second 8 bytes if not empty (retained count + padding)
+        if !is_empty {
+            bytes.write_u32_le(num_entries as u32);
+            bytes.write_u32_le(0); // padding (p field, unused in compact)
+        }
+
+        // Write theta if in estimation mode
+        if is_estimation_mode {
+            bytes.write_u64_le(self.table.theta());
+        }
+
+        // Write sorted hash values
+        let mut entries: Vec<u64> = self.iter().collect();
+        entries.sort_unstable();
+        for entry in entries {
+            bytes.write_u64_le(entry);
+        }
+
+        bytes.into_bytes()
+    }
+
+    /// Deserialize a sketch from bytes.
+    ///
+    /// Uses the default seed (9001). For sketches created with a different 
seed,
+    /// use [`deserialize_with_seed`](Self::deserialize_with_seed).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the bytes are invalid or corrupted.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::theta::ThetaSketch;
+    /// let mut sketch = ThetaSketch::builder().build();
+    /// sketch.update("apple");
+    /// let bytes = sketch.serialize();
+    /// let restored = ThetaSketch::deserialize(&bytes).unwrap();
+    /// assert_eq!(sketch.estimate(), restored.estimate());
+    /// ```
+    pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+        Self::deserialize_with_seed(bytes, DEFAULT_UPDATE_SEED)
+    }
+
+    /// Deserialize a sketch from bytes with a specific seed.
+    ///
+    /// # Arguments
+    ///
+    /// * `bytes` - The serialized sketch bytes
+    /// * `seed` - The seed used during sketch creation
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The bytes are too short
+    /// - The format is invalid (wrong family ID, unsupported version)
+    /// - The seed hash doesn't match
+    pub fn deserialize_with_seed(bytes: &[u8], seed: u64) -> Result<Self, 
Error> {
+        fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> 
Error {
+            move |_| Error::insufficient_data(tag)
+        }
+
+        if bytes.len() < 8 {
+            return Err(Error::insufficient_data("preamble"));
+        }
+
+        let mut cursor = SketchSlice::new(bytes);
+
+        // Read first 8 bytes (always present)
+        let preamble_longs = 
cursor.read_u8().map_err(make_error("preamble_longs"))?;
+        let serial_version = 
cursor.read_u8().map_err(make_error("serial_version"))?;
+        let family_id = cursor.read_u8().map_err(make_error("family_id"))?;

Review Comment:
   `deserialize_with_seed` reads `preamble_longs` but never validates it 
against the supported compact Theta formats. As written, an input with 
`preamble_longs=1` but `FLAG_IS_EMPTY` unset will be parsed as non-empty, and 
the payload can be misinterpreted. Please validate `preamble_longs` (e.g., 
require 1 for empty, 2 for exact, 3 for estimation) and return 
`Error::invalid_preamble_longs(...)` (or equivalent) on unexpected values.



##########
datasketches/src/theta/hash_table.rs:
##########
@@ -296,6 +296,44 @@ impl ThetaHashTable {
         self.lg_nom_size
     }
 
+    /// Get the hash seed
+    pub fn seed(&self) -> u64 {
+        self.hash_seed
+    }
+
+    /// Create a hash table from deserialized entries
+    ///
+    /// This is used during deserialization to reconstruct the hash table.
+    pub fn from_entries(lg_nom_size: u8, seed: u64, theta: u64, entries: 
Vec<u64>) -> Self {
+        let lg_max_size = lg_nom_size + 1;
+        let lg_cur_size = lg_max_size; // Use max size for deserialized tables
+        let num_entries = entries.len();
+
+        // Rebuild hash table from compact entries
+        let table_size = 1usize << lg_cur_size;
+        let mut table_entries = vec![0u64; table_size];
+
+        for entry in &entries {
+            if *entry != 0 {
+                if let Some(idx) = Self::find_in_entries(&table_entries, 
*entry, lg_cur_size) {
+                    table_entries[idx] = *entry;
+                }
+            }
+        }
+
+        Self {
+            lg_cur_size,
+            lg_nom_size,
+            lg_max_size,
+            resize_factor: ResizeFactor::X8, // Default for deserialized
+            sampling_probability: 1.0,       // Unknown, assume 1.0
+            theta,
+            hash_seed: seed,
+            entries: table_entries,
+            num_entries,
+        }
+    }

Review Comment:
   `from_entries` sets `num_entries` to `entries.len()` even if some values are 
skipped (0s), fail insertion (e.g., table full), or are duplicates. This can 
make the table internally inconsistent and lead to incorrect 
`estimate()`/iteration behavior. Consider validating the input (no 0s, no 
duplicates, len <= capacity) and computing `num_entries` from the actual number 
of inserted uniques; if insertion fails, return an error to the caller instead 
of silently dropping entries.



##########
datasketches/src/theta/sketch.rs:
##########
@@ -248,6 +263,199 @@ impl ThetaSketch {
         )
         .expect("theta should always be valid")
     }
+
+    /// Serialize the sketch to bytes in compact format.
+    ///
+    /// The serialized format is compatible with Java and C++ DataSketches
+    /// implementations.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::theta::ThetaSketch;
+    /// let mut sketch = ThetaSketch::builder().build();
+    /// sketch.update("apple");
+    /// let bytes = sketch.serialize();
+    /// let restored = ThetaSketch::deserialize(&bytes).unwrap();
+    /// assert_eq!(sketch.estimate(), restored.estimate());
+    /// ```
+    pub fn serialize(&self) -> Vec<u8> {
+        // Determine preamble size based on state
+        let is_empty = self.is_empty();
+        let is_estimation_mode = self.is_estimation_mode();
+
+        let preamble_longs = if is_empty {
+            PREAMBLE_LONGS_EMPTY
+        } else if is_estimation_mode {
+            PREAMBLE_LONGS_ESTIMATION
+        } else {
+            PREAMBLE_LONGS_EXACT
+        };
+
+        let num_entries = self.num_retained();
+        let preamble_bytes = (preamble_longs as usize) * 8;
+        let data_bytes = num_entries * HASH_SIZE_BYTES;
+        let total_bytes = preamble_bytes + data_bytes;
+
+        let mut bytes = SketchBytes::with_capacity(total_bytes);
+
+        // Build flags byte
+        let mut flags: u8 = FLAG_IS_COMPACT | FLAG_IS_READ_ONLY | 
FLAG_IS_ORDERED;
+        if is_empty {
+            flags |= FLAG_IS_EMPTY;
+        }
+
+        // Write preamble (first 8 bytes always present)
+        bytes.write_u8(preamble_longs);
+        bytes.write_u8(SERIAL_VERSION);
+        bytes.write_u8(THETA_FAMILY_ID);
+        bytes.write_u8(self.lg_k());
+        bytes.write_u8(self.lg_k()); // lgArr = lgK for compact
+        bytes.write_u8(flags);
+        bytes.write_u16_le(compute_seed_hash(self.table.seed()));
+
+        // Write second 8 bytes if not empty (retained count + padding)
+        if !is_empty {
+            bytes.write_u32_le(num_entries as u32);
+            bytes.write_u32_le(0); // padding (p field, unused in compact)
+        }
+
+        // Write theta if in estimation mode
+        if is_estimation_mode {
+            bytes.write_u64_le(self.table.theta());
+        }
+
+        // Write sorted hash values
+        let mut entries: Vec<u64> = self.iter().collect();
+        entries.sort_unstable();
+        for entry in entries {
+            bytes.write_u64_le(entry);
+        }
+
+        bytes.into_bytes()
+    }
+
+    /// Deserialize a sketch from bytes.
+    ///
+    /// Uses the default seed (9001). For sketches created with a different 
seed,
+    /// use [`deserialize_with_seed`](Self::deserialize_with_seed).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the bytes are invalid or corrupted.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::theta::ThetaSketch;
+    /// let mut sketch = ThetaSketch::builder().build();
+    /// sketch.update("apple");
+    /// let bytes = sketch.serialize();
+    /// let restored = ThetaSketch::deserialize(&bytes).unwrap();
+    /// assert_eq!(sketch.estimate(), restored.estimate());
+    /// ```
+    pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+        Self::deserialize_with_seed(bytes, DEFAULT_UPDATE_SEED)
+    }
+
+    /// Deserialize a sketch from bytes with a specific seed.
+    ///
+    /// # Arguments
+    ///
+    /// * `bytes` - The serialized sketch bytes
+    /// * `seed` - The seed used during sketch creation
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The bytes are too short
+    /// - The format is invalid (wrong family ID, unsupported version)
+    /// - The seed hash doesn't match
+    pub fn deserialize_with_seed(bytes: &[u8], seed: u64) -> Result<Self, 
Error> {
+        fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> 
Error {
+            move |_| Error::insufficient_data(tag)
+        }
+
+        if bytes.len() < 8 {
+            return Err(Error::insufficient_data("preamble"));
+        }
+
+        let mut cursor = SketchSlice::new(bytes);
+
+        // Read first 8 bytes (always present)
+        let preamble_longs = 
cursor.read_u8().map_err(make_error("preamble_longs"))?;
+        let serial_version = 
cursor.read_u8().map_err(make_error("serial_version"))?;
+        let family_id = cursor.read_u8().map_err(make_error("family_id"))?;
+        let lg_k = cursor.read_u8().map_err(make_error("lg_k"))?;
+        let _lg_arr = cursor.read_u8().map_err(make_error("lg_arr"))?;
+        let flags = cursor.read_u8().map_err(make_error("flags"))?;

Review Comment:
   The `flags` byte is read but endianness is not enforced. If 
`FLAG_IS_BIG_ENDIAN` is set, the current code will still parse all multi-byte 
fields as little-endian and produce incorrect results. Either reject sketches 
with the big-endian flag set (return an InvalidData error) or branch reads to 
use big-endian decoding when the flag is present.
   ```suggestion
           let flags = cursor.read_u8().map_err(make_error("flags"))?;
   
           // Enforce endianness: this implementation only supports 
little-endian sketches.
           if (flags & FLAG_IS_BIG_ENDIAN) != 0 {
               return Err(Error::new(
                   ErrorKind::InvalidData,
                   "big-endian ThetaSketch serialization is not supported",
               ));
           }
   ```



##########
datasketches/src/theta/sketch.rs:
##########
@@ -248,6 +263,199 @@ impl ThetaSketch {
         )
         .expect("theta should always be valid")
     }
+
+    /// Serialize the sketch to bytes in compact format.
+    ///
+    /// The serialized format is compatible with Java and C++ DataSketches
+    /// implementations.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::theta::ThetaSketch;
+    /// let mut sketch = ThetaSketch::builder().build();
+    /// sketch.update("apple");
+    /// let bytes = sketch.serialize();
+    /// let restored = ThetaSketch::deserialize(&bytes).unwrap();
+    /// assert_eq!(sketch.estimate(), restored.estimate());
+    /// ```
+    pub fn serialize(&self) -> Vec<u8> {
+        // Determine preamble size based on state
+        let is_empty = self.is_empty();
+        let is_estimation_mode = self.is_estimation_mode();
+
+        let preamble_longs = if is_empty {
+            PREAMBLE_LONGS_EMPTY
+        } else if is_estimation_mode {
+            PREAMBLE_LONGS_ESTIMATION
+        } else {
+            PREAMBLE_LONGS_EXACT
+        };
+
+        let num_entries = self.num_retained();
+        let preamble_bytes = (preamble_longs as usize) * 8;
+        let data_bytes = num_entries * HASH_SIZE_BYTES;
+        let total_bytes = preamble_bytes + data_bytes;
+
+        let mut bytes = SketchBytes::with_capacity(total_bytes);
+
+        // Build flags byte
+        let mut flags: u8 = FLAG_IS_COMPACT | FLAG_IS_READ_ONLY | 
FLAG_IS_ORDERED;
+        if is_empty {
+            flags |= FLAG_IS_EMPTY;
+        }
+
+        // Write preamble (first 8 bytes always present)
+        bytes.write_u8(preamble_longs);
+        bytes.write_u8(SERIAL_VERSION);
+        bytes.write_u8(THETA_FAMILY_ID);
+        bytes.write_u8(self.lg_k());
+        bytes.write_u8(self.lg_k()); // lgArr = lgK for compact
+        bytes.write_u8(flags);
+        bytes.write_u16_le(compute_seed_hash(self.table.seed()));
+
+        // Write second 8 bytes if not empty (retained count + padding)
+        if !is_empty {
+            bytes.write_u32_le(num_entries as u32);
+            bytes.write_u32_le(0); // padding (p field, unused in compact)
+        }
+
+        // Write theta if in estimation mode
+        if is_estimation_mode {
+            bytes.write_u64_le(self.table.theta());
+        }
+
+        // Write sorted hash values
+        let mut entries: Vec<u64> = self.iter().collect();
+        entries.sort_unstable();
+        for entry in entries {
+            bytes.write_u64_le(entry);
+        }
+
+        bytes.into_bytes()
+    }
+
+    /// Deserialize a sketch from bytes.
+    ///
+    /// Uses the default seed (9001). For sketches created with a different 
seed,
+    /// use [`deserialize_with_seed`](Self::deserialize_with_seed).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the bytes are invalid or corrupted.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::theta::ThetaSketch;
+    /// let mut sketch = ThetaSketch::builder().build();
+    /// sketch.update("apple");
+    /// let bytes = sketch.serialize();
+    /// let restored = ThetaSketch::deserialize(&bytes).unwrap();
+    /// assert_eq!(sketch.estimate(), restored.estimate());
+    /// ```
+    pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+        Self::deserialize_with_seed(bytes, DEFAULT_UPDATE_SEED)
+    }
+
+    /// Deserialize a sketch from bytes with a specific seed.
+    ///
+    /// # Arguments
+    ///
+    /// * `bytes` - The serialized sketch bytes
+    /// * `seed` - The seed used during sketch creation
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The bytes are too short
+    /// - The format is invalid (wrong family ID, unsupported version)
+    /// - The seed hash doesn't match
+    pub fn deserialize_with_seed(bytes: &[u8], seed: u64) -> Result<Self, 
Error> {
+        fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> 
Error {
+            move |_| Error::insufficient_data(tag)
+        }
+
+        if bytes.len() < 8 {
+            return Err(Error::insufficient_data("preamble"));
+        }
+
+        let mut cursor = SketchSlice::new(bytes);
+
+        // Read first 8 bytes (always present)
+        let preamble_longs = 
cursor.read_u8().map_err(make_error("preamble_longs"))?;
+        let serial_version = 
cursor.read_u8().map_err(make_error("serial_version"))?;
+        let family_id = cursor.read_u8().map_err(make_error("family_id"))?;
+        let lg_k = cursor.read_u8().map_err(make_error("lg_k"))?;
+        let _lg_arr = cursor.read_u8().map_err(make_error("lg_arr"))?;
+        let flags = cursor.read_u8().map_err(make_error("flags"))?;
+        let stored_seed_hash = 
cursor.read_u16_le().map_err(make_error("seed_hash"))?;
+
+        // Validate format
+        if family_id != THETA_FAMILY_ID {
+            return Err(Error::invalid_family(
+                THETA_FAMILY_ID,
+                family_id,
+                "ThetaSketch",
+            ));
+        }
+        if serial_version != SERIAL_VERSION && serial_version != 1 && 
serial_version != 2 {
+            return Err(Error::unsupported_serial_version(
+                SERIAL_VERSION,
+                serial_version,
+            ));
+        }
+        if !(MIN_LG_K..=MAX_LG_K).contains(&lg_k) {
+            return Err(Error::new(
+                ErrorKind::InvalidData,
+                format!("lg_k {} is out of range [{}, {}]", lg_k, MIN_LG_K, 
MAX_LG_K),
+            ));
+        }
+
+        // Validate seed hash
+        let expected_seed_hash = compute_seed_hash(seed);
+        if stored_seed_hash != expected_seed_hash {
+            return Err(Error::new(
+                ErrorKind::InvalidData,
+                format!(
+                    "seed hash mismatch: expected 0x{:04X}, got 0x{:04X}",
+                    expected_seed_hash, stored_seed_hash
+                ),
+            ));
+        }
+
+        // Parse flags
+        let is_empty = (flags & FLAG_IS_EMPTY) != 0;
+        let _is_compact = (flags & FLAG_IS_COMPACT) != 0;
+
+        // Handle empty sketch
+        if is_empty {
+            return Ok(ThetaSketch::builder().lg_k(lg_k).seed(seed).build());
+        }
+
+        // Read retained count (bytes 8-11)
+        let num_entries = 
cursor.read_u32_le().map_err(make_error("num_entries"))? as usize;
+        let _padding = cursor.read_u32_le().map_err(make_error("padding"))?;
+
+        // Read theta if in estimation mode (preamble_longs >= 3)
+        let theta = if preamble_longs >= PREAMBLE_LONGS_ESTIMATION {
+            cursor.read_u64_le().map_err(make_error("theta"))?
+        } else {
+            MAX_THETA
+        };
+

Review Comment:
   `theta` is accepted without validation. If it is 0 (or > MAX_THETA), 
`estimate()` can divide by zero (or compute nonsensical results). When 
`preamble_longs` indicates an estimation sketch, validate that 
`0 < theta && theta <= MAX_THETA` and return an InvalidData error if the check 
fails.
   ```suggestion
   
           // Validate theta for estimation sketches to avoid division by zero 
or invalid estimates
           if preamble_longs >= PREAMBLE_LONGS_ESTIMATION && (theta == 0 || 
theta > MAX_THETA) {
               return Err(Error::new(
                   ErrorKind::InvalidData,
                   format!("theta {} is out of range (0, {}]", theta, 
MAX_THETA),
               ));
           }
   ```



##########
datasketches/src/theta/sketch.rs:
##########
@@ -248,6 +263,199 @@ impl ThetaSketch {
         )
         .expect("theta should always be valid")
     }
+
+    /// Serialize the sketch to bytes in compact format.
+    ///
+    /// The serialized format is compatible with Java and C++ DataSketches
+    /// implementations.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::theta::ThetaSketch;
+    /// let mut sketch = ThetaSketch::builder().build();
+    /// sketch.update("apple");
+    /// let bytes = sketch.serialize();
+    /// let restored = ThetaSketch::deserialize(&bytes).unwrap();
+    /// assert_eq!(sketch.estimate(), restored.estimate());
+    /// ```
+    pub fn serialize(&self) -> Vec<u8> {
+        // Determine preamble size based on state
+        let is_empty = self.is_empty();
+        let is_estimation_mode = self.is_estimation_mode();
+
+        let preamble_longs = if is_empty {
+            PREAMBLE_LONGS_EMPTY
+        } else if is_estimation_mode {
+            PREAMBLE_LONGS_ESTIMATION
+        } else {
+            PREAMBLE_LONGS_EXACT
+        };
+
+        let num_entries = self.num_retained();
+        let preamble_bytes = (preamble_longs as usize) * 8;
+        let data_bytes = num_entries * HASH_SIZE_BYTES;
+        let total_bytes = preamble_bytes + data_bytes;
+
+        let mut bytes = SketchBytes::with_capacity(total_bytes);
+
+        // Build flags byte
+        let mut flags: u8 = FLAG_IS_COMPACT | FLAG_IS_READ_ONLY | 
FLAG_IS_ORDERED;
+        if is_empty {
+            flags |= FLAG_IS_EMPTY;
+        }
+
+        // Write preamble (first 8 bytes always present)
+        bytes.write_u8(preamble_longs);
+        bytes.write_u8(SERIAL_VERSION);
+        bytes.write_u8(THETA_FAMILY_ID);
+        bytes.write_u8(self.lg_k());
+        bytes.write_u8(self.lg_k()); // lgArr = lgK for compact
+        bytes.write_u8(flags);
+        bytes.write_u16_le(compute_seed_hash(self.table.seed()));
+
+        // Write second 8 bytes if not empty (retained count + padding)
+        if !is_empty {
+            bytes.write_u32_le(num_entries as u32);
+            bytes.write_u32_le(0); // padding (p field, unused in compact)
+        }
+
+        // Write theta if in estimation mode
+        if is_estimation_mode {
+            bytes.write_u64_le(self.table.theta());
+        }
+
+        // Write sorted hash values
+        let mut entries: Vec<u64> = self.iter().collect();
+        entries.sort_unstable();
+        for entry in entries {
+            bytes.write_u64_le(entry);
+        }
+
+        bytes.into_bytes()
+    }
+
+    /// Deserialize a sketch from bytes.
+    ///
+    /// Uses the default seed (9001). For sketches created with a different 
seed,
+    /// use [`deserialize_with_seed`](Self::deserialize_with_seed).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the bytes are invalid or corrupted.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::theta::ThetaSketch;
+    /// let mut sketch = ThetaSketch::builder().build();
+    /// sketch.update("apple");
+    /// let bytes = sketch.serialize();
+    /// let restored = ThetaSketch::deserialize(&bytes).unwrap();
+    /// assert_eq!(sketch.estimate(), restored.estimate());
+    /// ```
+    pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+        Self::deserialize_with_seed(bytes, DEFAULT_UPDATE_SEED)
+    }
+
+    /// Deserialize a sketch from bytes with a specific seed.
+    ///
+    /// # Arguments
+    ///
+    /// * `bytes` - The serialized sketch bytes
+    /// * `seed` - The seed used during sketch creation
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The bytes are too short
+    /// - The format is invalid (wrong family ID, unsupported version)
+    /// - The seed hash doesn't match
+    pub fn deserialize_with_seed(bytes: &[u8], seed: u64) -> Result<Self, 
Error> {
+        fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> 
Error {
+            move |_| Error::insufficient_data(tag)
+        }
+
+        if bytes.len() < 8 {
+            return Err(Error::insufficient_data("preamble"));
+        }
+
+        let mut cursor = SketchSlice::new(bytes);
+
+        // Read first 8 bytes (always present)
+        let preamble_longs = 
cursor.read_u8().map_err(make_error("preamble_longs"))?;
+        let serial_version = 
cursor.read_u8().map_err(make_error("serial_version"))?;
+        let family_id = cursor.read_u8().map_err(make_error("family_id"))?;
+        let lg_k = cursor.read_u8().map_err(make_error("lg_k"))?;
+        let _lg_arr = cursor.read_u8().map_err(make_error("lg_arr"))?;
+        let flags = cursor.read_u8().map_err(make_error("flags"))?;
+        let stored_seed_hash = 
cursor.read_u16_le().map_err(make_error("seed_hash"))?;
+
+        // Validate format
+        if family_id != THETA_FAMILY_ID {
+            return Err(Error::invalid_family(
+                THETA_FAMILY_ID,
+                family_id,
+                "ThetaSketch",
+            ));
+        }
+        if serial_version != SERIAL_VERSION && serial_version != 1 && 
serial_version != 2 {
+            return Err(Error::unsupported_serial_version(
+                SERIAL_VERSION,
+                serial_version,
+            ));
+        }
+        if !(MIN_LG_K..=MAX_LG_K).contains(&lg_k) {
+            return Err(Error::new(
+                ErrorKind::InvalidData,
+                format!("lg_k {} is out of range [{}, {}]", lg_k, MIN_LG_K, 
MAX_LG_K),
+            ));
+        }
+
+        // Validate seed hash
+        let expected_seed_hash = compute_seed_hash(seed);
+        if stored_seed_hash != expected_seed_hash {
+            return Err(Error::new(
+                ErrorKind::InvalidData,
+                format!(
+                    "seed hash mismatch: expected 0x{:04X}, got 0x{:04X}",
+                    expected_seed_hash, stored_seed_hash
+                ),
+            ));
+        }
+
+        // Parse flags
+        let is_empty = (flags & FLAG_IS_EMPTY) != 0;
+        let _is_compact = (flags & FLAG_IS_COMPACT) != 0;
+
+        // Handle empty sketch
+        if is_empty {
+            return Ok(ThetaSketch::builder().lg_k(lg_k).seed(seed).build());
+        }
+
+        // Read retained count (bytes 8-11)
+        let num_entries = 
cursor.read_u32_le().map_err(make_error("num_entries"))? as usize;
+        let _padding = cursor.read_u32_le().map_err(make_error("padding"))?;
+
+        // Read theta if in estimation mode (preamble_longs >= 3)
+        let theta = if preamble_longs >= PREAMBLE_LONGS_ESTIMATION {
+            cursor.read_u64_le().map_err(make_error("theta"))?
+        } else {
+            MAX_THETA
+        };
+
+        // Read hash entries
+        let mut entries = Vec::with_capacity(num_entries);
+        for _ in 0..num_entries {
+            let hash = cursor.read_u64_le().map_err(make_error("hash_entry"))?;
+            entries.push(hash);
+        }

Review Comment:
   `num_entries` is taken directly from the input and used to pre-allocate 
(`Vec::with_capacity`) and to drive reads, without any upper bound derived from 
`lg_k`/format. A malformed input can request an extremely large allocation and 
trigger OOM. Add sanity checks (e.g., `num_entries <= 1<<lg_k` and 
`bytes.len()` is at least `preamble_longs*8 + num_entries*8`) before 
allocating/looping.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to