jecsand838 commented on code in PR #8006: URL: https://github.com/apache/arrow-rs/pull/8006#discussion_r2234573374
########## arrow-avro/src/schema.rs: ########## @@ -260,13 +274,304 @@ pub struct Fixed<'a> { pub attributes: Attributes<'a>, } +/// Supported fingerprint algorithms for Avro schema identification. +/// Currently only `Rabin` is supported, `SHA256` and `MD5` support will come in a future update +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum FingerprintAlgorithm { + /// 64‑bit CRC‑64‑AVRO Rabin fingerprint. + Rabin, +} + +/// A schema fingerprint in one of the supported formats. +/// +/// This is used as the key inside `SchemaStore` `HashMap`. Each `SchemaStore` +/// instance always stores only one variant, matching its configured +/// `FingerprintAlgorithm`, but the enum makes the API uniform. +/// Currently only `Rabin` is supported +/// +/// <https://avro.apache.org/docs/1.11.1/specification/#schema-fingerprints> +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Fingerprint { + /// A 64-bit Rabin fingerprint. + Rabin(u64), +} + +/// Allow easy extraction of the algorithm used to create a fingerprint. +impl From<&Fingerprint> for FingerprintAlgorithm { + #[inline] + fn from(fp: &Fingerprint) -> Self { + match fp { + Fingerprint::Rabin(_) => FingerprintAlgorithm::Rabin, + } + } +} + +/// Generates a fingerprint for the given `Schema` using the specified `FingerprintAlgorithm`. +#[inline] +pub(crate) fn generate_fingerprint( + schema: &Schema, + hash_type: FingerprintAlgorithm, +) -> Fingerprint { + let canonical = generate_canonical_form(schema); + match hash_type { + FingerprintAlgorithm::Rabin => Fingerprint::Rabin(compute_fingerprint_rabin(&canonical)), + } +} + +/// Generates the 64-bit Rabin fingerprint for the given `Schema`. +/// +/// The fingerprint is computed from the canonical form of the schema. +/// This is also known as `CRC-64-AVRO`. +/// +/// # Returns +/// A `Fingerprint::Rabin` variant containing the 64-bit fingerprint. +#[inline] +pub fn generate_fingerprint_rabin(schema: &Schema) -> Fingerprint { + generate_fingerprint(schema, FingerprintAlgorithm::Rabin) +} + +/// Generates the Parsed Canonical Form for the given [`Schema`]. +/// +/// The canonical form is a standardized JSON representation of the schema, +/// primarily used for generating a schema fingerprint for equality checking. +/// +/// This form strips attributes that do not affect the schema's identity, +/// such as `doc` fields, `aliases`, and any properties not defined in the +/// Avro specification. +/// +/// <https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas> +#[inline] +pub fn generate_canonical_form(schema: &Schema) -> String { + serde_json::to_string(&parse_canonical_json(schema)).unwrap() +} + +/// An in-memory cache of Avro schemas, indexed by their fingerprint. +/// +/// `SchemaStore` provides a mechanism to store and retrieve Avro schemas efficiently. +/// Each schema is associated with a unique [`Fingerprint`], which is generated based +/// on the schema's canonical form and a specific hashing algorithm. +/// +/// A `SchemaStore` instance is configured to use a single [`FingerprintAlgorithm`] such as Rabin, +/// MD5 (not yet supported), or SHA256 (not yet supported) for all its operations. +/// This ensures consistency when generating fingerprints and looking up schemas. +/// All schemas registered will have their fingerprint computed with this algorithm, and +/// lookups must use a matching fingerprint. +/// +/// The lifetime parameter `'a` corresponds to the lifetime of the string slices +/// contained within the stored [`Schema`] objects. This means the `SchemaStore` +/// cannot outlive the data referenced by the schemas it contains. +/// +/// # Examples +/// +/// ```no_run +/// // Create a new store with the default Rabin fingerprinting. +/// use arrow_avro::schema::{PrimitiveType, Schema, SchemaStore, TypeName}; +/// +/// let mut store = SchemaStore::new(); +/// let schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::String)); +/// // Register the schema to get its fingerprint. +/// let fingerprint = store.register(schema.clone()).unwrap(); +/// // Use the fingerprint to look up the schema. +/// let retrieved_schema = store.lookup(&fingerprint); +/// assert_eq!(retrieved_schema, Some(schema)); +/// ``` +#[derive(Debug, Clone)] +pub struct SchemaStore<'a> { + /// The hashing algorithm used for generating fingerprints. + fingerprint_algorithm: FingerprintAlgorithm, + /// A map from a schema's fingerprint to the schema itself. + schemas: HashMap<Fingerprint, Schema<'a>>, +} + +impl<'a> TryFrom<&'a [Schema<'a>]> for SchemaStore<'a> { + type Error = ArrowError; + + /// Creates a `SchemaStore` from a slice of schemas. + /// Each schema in the slice is registered with the new store. + fn try_from(schemas: &'a [Schema<'a>]) -> Result<Self, Self::Error> { + let mut store = SchemaStore::new(); + for schema in schemas { + store.register(schema.clone())?; + } + Ok(store) + } +} + +impl<'a> Default for SchemaStore<'a> { + fn default() -> Self { + Self { + fingerprint_algorithm: FingerprintAlgorithm::Rabin, + schemas: HashMap::new(), + } + } +} + +impl<'a> SchemaStore<'a> { + /// Creates an empty `SchemaStore` using the default fingerprinting algorithm (64-bit Rabin). + pub fn new() -> Self { + Self::default() + } + + /// Registers a schema with the store and returns its fingerprint. + /// + /// A fingerprint is calculated for the given schema using the store's configured + /// hash type. If a schema with the same fingerprint does not already exist in the + /// store, the new schema is inserted. If the fingerprint already exists, the + /// existing schema is not overwritten. + /// + /// # Arguments + /// + /// * `schema` - The schema to register. + /// + /// # Returns + /// + /// A `Result` containing the `Fingerprint` of the schema if successful, + /// or an `ArrowError` on failure. + pub fn register(&mut self, schema: Schema<'a>) -> Result<Fingerprint, ArrowError> { + let fp = generate_fingerprint(&schema, self.fingerprint_algorithm); + self.schemas.entry(fp).or_insert(schema); + Ok(fp) + } + + /// Looks up a schema by its `Fingerprint`. + /// + /// # Arguments + /// + /// * `fp` - A reference to the `Fingerprint` of the schema to look up. + /// + /// # Returns + /// + /// An `Option` containing a clone of the `Schema` if found, otherwise `None`. + pub fn lookup(&self, fp: &Fingerprint) -> Option<Schema<'a>> { + self.schemas.get(fp).cloned() Review Comment: This was a great suggestion, ty for making it. This was included in my last commit as well. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org