jecsand838 commented on code in PR #8006: URL: https://github.com/apache/arrow-rs/pull/8006#discussion_r2234570731
########## arrow-avro/src/schema.rs: ########## @@ -260,13 +274,304 @@ pub struct Fixed<'a> { pub attributes: Attributes<'a>, } +/// Supported fingerprint algorithms for Avro schema identification. +/// Currently only `Rabin` is supported, `SHA256` and `MD5` support will come in a future update +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum FingerprintAlgorithm { + /// 64‑bit CRC‑64‑AVRO Rabin fingerprint. + Rabin, +} + +/// A schema fingerprint in one of the supported formats. +/// +/// This is used as the key inside `SchemaStore` `HashMap`. Each `SchemaStore` +/// instance always stores only one variant, matching its configured +/// `FingerprintAlgorithm`, but the enum makes the API uniform. +/// Currently only `Rabin` is supported +/// +/// <https://avro.apache.org/docs/1.11.1/specification/#schema-fingerprints> +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Fingerprint { + /// A 64-bit Rabin fingerprint. + Rabin(u64), +} + +/// Allow easy extraction of the algorithm used to create a fingerprint. +impl From<&Fingerprint> for FingerprintAlgorithm { + #[inline] + fn from(fp: &Fingerprint) -> Self { + match fp { + Fingerprint::Rabin(_) => FingerprintAlgorithm::Rabin, + } + } +} + +/// Generates a fingerprint for the given `Schema` using the specified `FingerprintAlgorithm`. +#[inline] +pub(crate) fn generate_fingerprint( + schema: &Schema, + hash_type: FingerprintAlgorithm, +) -> Fingerprint { + let canonical = generate_canonical_form(schema); + match hash_type { + FingerprintAlgorithm::Rabin => Fingerprint::Rabin(compute_fingerprint_rabin(&canonical)), + } +} + +/// Generates the 64-bit Rabin fingerprint for the given `Schema`. +/// +/// The fingerprint is computed from the canonical form of the schema. +/// This is also known as `CRC-64-AVRO`. +/// +/// # Returns +/// A `Fingerprint::Rabin` variant containing the 64-bit fingerprint. +#[inline] +pub fn generate_fingerprint_rabin(schema: &Schema) -> Fingerprint { + generate_fingerprint(schema, FingerprintAlgorithm::Rabin) +} + +/// Generates the Parsed Canonical Form for the given [`Schema`]. +/// +/// The canonical form is a standardized JSON representation of the schema, +/// primarily used for generating a schema fingerprint for equality checking. +/// +/// This form strips attributes that do not affect the schema's identity, +/// such as `doc` fields, `aliases`, and any properties not defined in the +/// Avro specification. +/// +/// <https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas> +#[inline] +pub fn generate_canonical_form(schema: &Schema) -> String { + serde_json::to_string(&parse_canonical_json(schema)).unwrap() Review Comment: I cleaned this up in my last commit. Ty for catching this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org