On Wed, Jun 11, 2025 at 04:48:53PM +0200, Filip Schauer wrote: > This crate can parse and extract an OCI image bundled as a tar archive. > > Signed-off-by: Filip Schauer <f.scha...@proxmox.com> > --- > Cargo.toml | 1 + > proxmox-oci/Cargo.toml | 22 ++++ > proxmox-oci/debian/changelog | 5 + > proxmox-oci/debian/control | 47 ++++++++ > proxmox-oci/debian/debcargo.toml | 7 ++ > proxmox-oci/src/lib.rs | 196 +++++++++++++++++++++++++++++++ > proxmox-oci/src/oci_tar_image.rs | 167 ++++++++++++++++++++++++++ > 7 files changed, 445 insertions(+) > create mode 100644 proxmox-oci/Cargo.toml > create mode 100644 proxmox-oci/debian/changelog > create mode 100644 proxmox-oci/debian/control > create mode 100644 proxmox-oci/debian/debcargo.toml > create mode 100644 proxmox-oci/src/lib.rs > create mode 100644 proxmox-oci/src/oci_tar_image.rs > > diff --git a/Cargo.toml b/Cargo.toml > index bf9e83d7..8365b18a 100644 > --- a/Cargo.toml > +++ b/Cargo.toml > @@ -26,6 +26,7 @@ members = [ > "proxmox-metrics", > "proxmox-network-api", > "proxmox-notify", > + "proxmox-oci", > "proxmox-openid", > "proxmox-product-config", > "proxmox-rest-server", > diff --git a/proxmox-oci/Cargo.toml b/proxmox-oci/Cargo.toml > new file mode 100644 > index 00000000..4daff6ab > --- /dev/null > +++ b/proxmox-oci/Cargo.toml > @@ -0,0 +1,22 @@ > +[package] > +name = "proxmox-oci" > +description = "OCI image parsing and extraction" > +version = "0.1.0" > + > +authors.workspace = true > +edition.workspace = true > +exclude.workspace = true > +homepage.workspace = true > +license.workspace = true > +repository.workspace = true > +rust-version.workspace = true > + > +[dependencies] > +flate2.workspace = true > +oci-spec = "0.8.1" > +sha2 = "0.10" > +tar.workspace = true > +thiserror = "1" > +zstd.workspace = true > + > +proxmox-io.workspace = true > diff --git a/proxmox-oci/debian/changelog b/proxmox-oci/debian/changelog > new file mode 100644 > index 00000000..754d06c1 > --- /dev/null > +++ b/proxmox-oci/debian/changelog > @@ -0,0 +1,5 @@ > +rust-proxmox-oci (0.1.0-1) bookworm; urgency=medium > + > + * Initial release. > + > + -- Proxmox Support Team <supp...@proxmox.com> Mon, 28 Apr 2025 12:34:56 > +0200 > diff --git a/proxmox-oci/debian/control b/proxmox-oci/debian/control > new file mode 100644 > index 00000000..3974cf48 > --- /dev/null > +++ b/proxmox-oci/debian/control > @@ -0,0 +1,47 @@ > +Source: rust-proxmox-oci > +Section: rust > +Priority: optional > +Build-Depends: debhelper-compat (= 13), > + dh-sequence-cargo > +Build-Depends-Arch: cargo:native <!nocheck>, > + rustc:native (>= 1.82) <!nocheck>, > + libstd-rust-dev <!nocheck>, > + librust-flate2-1+default-dev <!nocheck>, > + librust-oci-spec-0.8+default-dev (>= 0.8.1-~~) <!nocheck>, > + librust-proxmox-io-1+default-dev (>= 1.1.0-~~) <!nocheck>, > + librust-sha2-0.10+default-dev <!nocheck>, > + librust-tar-0.4+default-dev <!nocheck>, > + librust-thiserror-1+default-dev <!nocheck>, > + librust-zstd-0.12+bindgen-dev <!nocheck>, > + librust-zstd-0.12+default-dev <!nocheck> > +Maintainer: Proxmox Support Team <supp...@proxmox.com> > +Standards-Version: 4.7.0 > +Vcs-Git: git://git.proxmox.com/git/proxmox.git > +Vcs-Browser: https://git.proxmox.com/?p=proxmox.git > +Homepage: https://proxmox.com > +X-Cargo-Crate: proxmox-oci > +Rules-Requires-Root: no > + > +Package: librust-proxmox-oci-dev > +Architecture: any > +Multi-Arch: same > +Depends: > + ${misc:Depends}, > + librust-flate2-1+default-dev, > + librust-oci-spec-0.8+default-dev (>= 0.8.1-~~), > + librust-proxmox-io-1+default-dev (>= 1.1.0-~~), > + librust-sha2-0.10+default-dev, > + librust-tar-0.4+default-dev, > + librust-thiserror-1+default-dev, > + librust-zstd-0.12+bindgen-dev, > + librust-zstd-0.12+default-dev > +Provides: > + librust-proxmox-oci+default-dev (= ${binary:Version}), > + librust-proxmox-oci-0-dev (= ${binary:Version}), > + librust-proxmox-oci-0+default-dev (= ${binary:Version}), > + librust-proxmox-oci-0.1-dev (= ${binary:Version}), > + librust-proxmox-oci-0.1+default-dev (= ${binary:Version}), > + librust-proxmox-oci-0.1.0-dev (= ${binary:Version}), > + librust-proxmox-oci-0.1.0+default-dev (= ${binary:Version}) > +Description: OCI image parsing and extraction - Rust source code > + Source code for Debianized Rust crate "proxmox-oci" > diff --git a/proxmox-oci/debian/debcargo.toml > b/proxmox-oci/debian/debcargo.toml > new file mode 100644 > index 00000000..b7864cdb > --- /dev/null > +++ b/proxmox-oci/debian/debcargo.toml > @@ -0,0 +1,7 @@ > +overlay = "." > +crate_src_path = ".." > +maintainer = "Proxmox Support Team <supp...@proxmox.com>" > + > +[source] > +vcs_git = "git://git.proxmox.com/git/proxmox.git" > +vcs_browser = "https://git.proxmox.com/?p=proxmox.git" > diff --git a/proxmox-oci/src/lib.rs b/proxmox-oci/src/lib.rs > new file mode 100644 > index 00000000..cc5a1d46 > --- /dev/null > +++ b/proxmox-oci/src/lib.rs > @@ -0,0 +1,196 @@ > +use flate2::read::GzDecoder; > +use oci_spec::image::{Arch, Config, ImageConfiguration, ImageManifest, > MediaType}; > +use oci_spec::OciSpecError; > +use oci_tar_image::OciTarImage;
^ Please group this import with its `mod`. > +use sha2::digest::generic_array::GenericArray; > +use sha2::{Digest, Sha256}; > +use std::collections::HashMap; > +use std::fs::File; > +use std::io::{Read, Seek}; > +use std::path::PathBuf; > +use std::str::FromStr; > +use tar::Archive; > +use thiserror::Error; ^ Please group imports. `std` first, the rest by "distance": - std - external - ours - `crate::...` > + > +pub mod oci_tar_image; ^ Does it make sense to make this public? (Generally: avoid using `pub` and only add it where needed.) > + > +fn compute_digest<R: Read, H: Digest>( > + mut reader: R, > + mut hasher: H, > +) -> GenericArray<u8, H::OutputSize> { > + let mut buf = proxmox_io::boxed::zeroed(4096); > + > + loop { > + let bytes_read = reader.read(&mut buf).unwrap(); ^ This `unwrap()` is reachable through regular file I/O and potentially compression layers AFAICT... We definitely need to propagate errors here. > + if bytes_read == 0 { > + break hasher.finalize(); > + } > + > + hasher.update(&buf[..bytes_read]); > + } > +} > + > +fn compute_sha256<R: Read>(reader: R) -> oci_spec::image::Sha256Digest { > + let digest = compute_digest(reader, Sha256::new()); > + oci_spec::image::Sha256Digest::from_str(&format!("{:x}", > digest)).unwrap() > +} > + > +/// Build a mapping from uncompressed layer digests (as found in the image > config's `rootfs.diff_ids`) > +/// to their corresponding compressed-layer digests (i.e. the filenames > under `blobs/<algorithm>/<digest>`) > +fn build_layer_map<R: Read + Seek>( > + oci_tar_image: &mut OciTarImage<R>, > + image_manifest: &ImageManifest, > +) -> HashMap<oci_spec::image::Digest, oci_spec::image::Descriptor> { > + let mut layer_mapping = HashMap::new(); > + > + for layer in image_manifest.layers() { > + let digest = match layer.media_type() { > + MediaType::ImageLayer | MediaType::ImageLayerNonDistributable => > { > + Some(layer.digest().clone()) > + } > + MediaType::ImageLayerGzip | > MediaType::ImageLayerNonDistributableGzip => { > + let compressed_blob = > oci_tar_image.open_blob(layer.digest()).unwrap(); > + let decoder = GzDecoder::new(compressed_blob); > + Some(compute_sha256(decoder).into()) > + } > + MediaType::ImageLayerZstd | > MediaType::ImageLayerNonDistributableZstd => { > + let compressed_blob = > oci_tar_image.open_blob(layer.digest()).unwrap(); > + let decoder = zstd::Decoder::new(compressed_blob).unwrap(); > + Some(compute_sha256(decoder).into()) > + } > + _ => None, This could just `continue` and the match can then drop the `Option` and we can skip the `if let Some()` below. Also: Are other types not important? If so, why? (This should be documented.) And do we know that this stays the case, or should we list the rest explicitly, so that `oci-spec` crate updates force use to look at that? > + }; > + > + if let Some(digest) = digest { > + layer_mapping.insert(digest, layer.clone()); > + } > + } > + > + layer_mapping > +} > + > +#[derive(Debug, Error)] > +pub enum ProxmoxOciError { > + #[error("Error while parsing OCI image: {0}")] > + ParseError(#[from] ParseError), > + #[error("Error while extracting OCI image: {0}")] > + ExtractError(#[from] ExtractError), > +} > + > +pub fn parse_and_extract_image( > + oci_tar_path: &str, > + rootfs_path: &str, Use `&Path` or `P: AsRef<Path>` for these parameters. > +) -> Result<Option<Config>, ProxmoxOciError> { > + let (mut oci_tar_image, image_manifest, image_config) = > parse_image(oci_tar_path)?; > + > + extract_image_rootfs( > + &mut oci_tar_image, > + &image_manifest, > + &image_config, > + rootfs_path.into(), > + )?; > + > + Ok(image_config.config().clone()) > +} > + > +#[derive(Debug, Error)] > +pub enum ParseError { > + #[error("Not an OCI image: {0}")] > + NotAnOciImage(OciSpecError), ^ I'm not convinced this is the right name, as in I don't think mapping all `OciSpecError::SerDe` and `...::Builder` errors to "this is not an OCI image" makes sense. Just use `OciSpec(OciSpecError)`. > + #[error("Wrong media type")] > + WrongMediaType, > + #[error("IO error: {0}")] > + IoError(#[from] std::io::Error), ^ I'd drop the `Error` suffix since we're already in the `Parse*Error*` type anyway ;-) (The other variants don't have an Error suffix either, after all). > + #[error("Unsupported CPU architecture")] > + UnsupportedArchitecture, > + #[error("Missing image config")] > + MissingImageConfig, > +} > + > +impl From<OciSpecError> for ParseError { > + fn from(oci_spec_err: OciSpecError) -> Self { > + match oci_spec_err { > + OciSpecError::Io(ioerr) => Self::IoError(ioerr), > + ocierr => Self::NotAnOciImage(ocierr), ^ If we decide that `NotAnOciImage` is not the correct name, I'd also drop this `From` impl and just use `#[from]`, since then it would be nice to have the IO error in the place where it happens I think... > + } > + } > +} > + > +fn parse_image( > + oci_tar_path: &str, > +) -> Result<(OciTarImage<File>, ImageManifest, ImageConfiguration), > ParseError> { > + let oci_tar_file = File::open(oci_tar_path)?; > + let mut oci_tar_image = OciTarImage::new(oci_tar_file)?; > + > + let image_manifest = oci_tar_image > + .image_manifest(&Arch::Amd64) > + .ok_or(ParseError::UnsupportedArchitecture)??; > + > + let image_config_descriptor = image_manifest.config(); > + > + if image_config_descriptor.media_type() != &MediaType::ImageConfig { > + return Err(ParseError::WrongMediaType); > + } > + > + let image_config_file = oci_tar_image > + .open_blob(image_config_descriptor.digest()) > + .ok_or(ParseError::MissingImageConfig)?; > + let image_config = ImageConfiguration::from_reader(image_config_file)?; > + > + Ok((oci_tar_image, image_manifest, image_config)) > +} > + > +#[derive(Debug, Error)] > +pub enum ExtractError { > + #[error("Rootfs destination path not found")] > + RootfsDestinationNotFound, ^ Is it really that helpful to have this case at all? If the only entry point into this crate is via the `parse_and_extract_image()` function, we can just document that the path should be pre-created and not check this. > + #[error("Unknown layer digest found in rootfs.diff_ids")] > + UnknownLayerDigest, > + #[error("Layer file mentioned in image manifest is missing")] > + MissingLayerFile, ^ Could consider including the digest in the above 2 cases. > + #[error("IO error: {0}")] > + IoError(#[from] std::io::Error), ^ I'd drop the `Error` suffix since we're already in the `Extract*Error*` type anyway ;-) > + #[error("Layer has wrong media type")] > + WrongMediaType, ^ Could consider including the textual version of the type. > +} > + > +fn extract_image_rootfs<R: Read + Seek>( > + oci_tar_image: &mut OciTarImage<R>, > + image_manifest: &ImageManifest, > + image_config: &ImageConfiguration, > + target_path: PathBuf, Make this &Path and drop the & in the unpack() call. > +) -> Result<(), ExtractError> { > + if !target_path.exists() { > + return Err(ExtractError::RootfsDestinationNotFound); > + } ^ As mentioned - not sure this is too helpful. > + > + let layer_map = build_layer_map(oci_tar_image, image_manifest); > + > + for layer in image_config.rootfs().diff_ids() { > + let layer_digest = oci_spec::image::Digest::from_str(layer) > + .map_err(|_| ExtractError::UnknownLayerDigest)?; > + let layer_descriptor = layer_map > + .get(&layer_digest) > + .ok_or(ExtractError::UnknownLayerDigest)?; > + let layer_file = oci_tar_image > + .open_blob(layer_descriptor.digest()) > + .ok_or(ExtractError::MissingLayerFile)?; > + > + let tar_file: Box<dyn Read> = match layer_descriptor.media_type() { > + MediaType::ImageLayer | MediaType::ImageLayerNonDistributable => > Box::new(layer_file), > + MediaType::ImageLayerGzip | > MediaType::ImageLayerNonDistributableGzip => { > + Box::new(GzDecoder::new(layer_file)) > + } > + MediaType::ImageLayerZstd | > MediaType::ImageLayerNonDistributableZstd => { > + Box::new(zstd::Decoder::new(layer_file)?) > + } > + _ => return Err(ExtractError::WrongMediaType), > + }; > + > + let mut archive = Archive::new(tar_file); > + archive.set_preserve_ownerships(true); What about `.set_preserve_permissions()`, `.set_preserve_xattrs()`, > + archive.unpack(&target_path)?; > + } > + > + Ok(()) > +} > diff --git a/proxmox-oci/src/oci_tar_image.rs > b/proxmox-oci/src/oci_tar_image.rs > new file mode 100644 > index 00000000..555558f5 > --- /dev/null > +++ b/proxmox-oci/src/oci_tar_image.rs > @@ -0,0 +1,167 @@ > +use oci_spec::image::{Arch, Digest, ImageIndex, ImageManifest, MediaType}; > +use oci_spec::OciSpecError; > +use std::cmp::min; > +use std::collections::HashMap; > +use std::io::{Read, Seek, SeekFrom}; > +use std::path::PathBuf; > +use tar::Archive; ^ Group these as well. > + > +#[derive(Clone)] ^ Could also be Copy > +pub struct TarEntry { > + offset: u64, > + size: usize, > +} > + > +pub struct TarEntryReader<'a, R: Read + Seek> { ^ This is quite generic. We could put this in `proxmox-io`: But then I'd argue to drop the lifetime and take `R` directly, since both `Read` and `Seek` are implemented for mutable references: struct RangeReader<R: Read + Seek> { reader: R, /// Range inside `R`. range: Range<u64>, /// Relative position inside `range`. position: u64, /// True once we performed the initial seek. ready: bool, } impl<R: Read + Seek> RangeReader<R> { pub fn new(reader: R, range: Range<u64>) -> Self; pub fn into_inner(self) -> R; pub fn size(&self) -> usize; pub fn remaining(&self) -> usize; } > + tar_entry: TarEntry, > + reader: &'a mut R, > + position: u64, > +} > + > +impl<'a, R: Read + Seek> TarEntryReader<'a, R> { > + pub fn new(tar_entry: TarEntry, reader: &'a mut R) -> Self { ^ As an *internal* helper (if not moved to proxmox-io), this `pub` should be dropped. > + Self { > + tar_entry, > + reader, > + position: 0, > + } > + } > +} > + > +impl<R: Read + Seek> Read for TarEntryReader<'_, R> { > + fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { > + let max_read = min(buf.len(), self.tar_entry.size - self.position as > usize); ^ For maintainability/readability, I'd prefer the `size-position` to be factored out into an `fn remaining(&self) -> usize`. > + let limited_buf = &mut buf[..max_read]; > + self.reader > + .seek(SeekFrom::Start(self.tar_entry.offset + self.position))?; ^ We can reduce syscalls by doing this only once: As long as this instance of `TarEntryReader` (or `RangeReader`) exist we have exclusive access to `R`, since we own it (or it is a mutable reference). > + let bytes_read = self.reader.read(limited_buf)?; > + self.position += bytes_read as u64; ^ If made generic in `proxmox-io`, this should use `bytes_read.min(self.size())`, just in case `R` has a broken `read()` implementation and returns a value larger than the buffer's size... > + > + Ok(bytes_read) > + } > +} > + > +impl<R: Read + Seek> Seek for TarEntryReader<'_, R> { > + fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> { > + self.position = match pos { > + SeekFrom::Start(position) => min(position, self.tar_entry.size > as u64), > + SeekFrom::End(offset) => { > + if offset > self.tar_entry.size as i64 { > + return Err(std::io::Error::new( > + std::io::ErrorKind::NotSeekable, ^ NotSeekable -> InvalidInput, see `lseek(2)`/`fseek(3)`. > + "Tried to seek before the beginning of the file", > + )); > + } > + > + (if offset <= 0 { > + self.tar_entry.size > + } else { > + self.tar_entry.size - offset as usize > + }) as u64 > + } > + SeekFrom::Current(offset) => { > + if (self.position as i64 + offset) < 0 { ^ Wraparound checks are better done with `self.position.checked_add(offset)`, which will give you an `Err` if it wraps. But to be honest... I think we may as well disregard this and just use `position.saturating_add(offset).min(size)`... For "real" files, seeking past the end is not an error after all, and it would simply put us at EOF. > + return Err(std::io::Error::new( > + std::io::ErrorKind::NotSeekable, > + "Tried to seek before the beginning of the file", > + )); > + } > + > + min(self.position + offset as u64, self.tar_entry.size as > u64) > + } > + }; > + > + Ok(self.position) > + } > +} > + > +struct TarArchive<R: Read + Seek> { > + reader: R, > + entries: HashMap<PathBuf, TarEntry>, > +} > + > +impl<R: Read + Seek> TarArchive<R> { ^ The struct is private, so all fns here can drop the `pub`. > + pub fn new(reader: R) -> std::io::Result<Self> { > + let mut archive = Archive::new(reader); > + let entries = archive.entries_with_seek()?; > + let mut entries_index = HashMap::new(); > + > + for entry in entries { > + let entry = entry?; > + let offset = entry.raw_file_position(); > + let size = entry.size() as usize; > + let path = entry.path()?.into_owned(); > + let tar_entry = TarEntry { offset, size }; > + entries_index.insert(path, tar_entry); > + } > + > + Ok(Self { > + reader: archive.into_inner(), > + entries: entries_index, > + }) > + } > + > + pub fn get_inner_file(&mut self, inner_file_path: PathBuf) -> > Option<TarEntryReader<R>> { ^ We don't need ownership here, just use `&Path`. Or, if you don't want to have to call `.as_ref()` at the call site, use `P where P: AsRef<Path>`. > + match self.entries.get(&inner_file_path) { > + Some(tar_entry) => Some(TarEntryReader::new(tar_entry.clone(), > &mut self.reader)), > + None => None, > + } > + } > +} > + > +pub struct OciTarImage<R: Read + Seek> { > + archive: TarArchive<R>, > + image_index: ImageIndex, > +} > + > +impl<R: Read + Seek> OciTarImage<R> { > + pub fn new(reader: R) -> oci_spec::Result<Self> { > + let mut archive = TarArchive::new(reader)?; > + let index_file = archive > + .get_inner_file("index.json".into()) > + .ok_or(OciSpecError::Other("Missing index.json file".into()))?; > + let image_index = ImageIndex::from_reader(index_file)?; > + > + Ok(Self { > + archive, > + image_index, > + }) > + } > + > + pub fn image_index(&self) -> &ImageIndex { > + &self.image_index > + } > + > + pub fn open_blob(&mut self, digest: &Digest) -> > Option<TarEntryReader<R>> { > + self.archive.get_inner_file(get_blob_path(digest)) > + } > + > + pub fn image_manifest( > + &mut self, > + architecture: &Arch, > + ) -> Option<oci_spec::Result<ImageManifest>> { > + match self.image_index.manifests().iter().find(|&x| { > + x.media_type() == &MediaType::ImageManifest > + && x.platform() > + .as_ref() > + .is_none_or(|platform| platform.architecture() == > architecture) > + }) { > + Some(descriptor) => match > self.open_blob(&descriptor.digest().clone()) { > + Some(image_manifest_file) => > Some(ImageManifest::from_reader(image_manifest_file)), > + None => Some(Err(OciSpecError::Other( > + "Image manifest mentioned in image index is > missing".into(), Would be good to include the digest in the error. > + ))), > + }, > + None => None, > + } > + } > +} > + > +fn get_blob_path(digest: &Digest) -> PathBuf { Given that all elements here are regular strings, this could just do: format!("blobs/{algorithm}/{digest}", algorithm = digest.algorithm()).into() And when using `<P: AsRef<Path>>` above, you could return `String` and drop the `.into()` at the end... > + let algorithm = digest.algorithm().as_ref(); > + let digest = digest.digest(); > + let mut path = PathBuf::from("blobs"); > + path.push(algorithm); > + path.push(digest); > + path > +} > -- > 2.39.5 _______________________________________________ pve-devel mailing list pve-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel