I don't really like that this significantly rewrites the implementation from 2 patches prior; it makes review more difficult. Is there a reason you did it that way? If not, I'd suggest squashing the two patches together
On Fri, Mar 6, 2026 at 7:00 AM Stefano Tondo via lists.openembedded.org <[email protected]> wrote: > > Enrich source download packages in SPDX SBOMs with comprehensive > source tracking metadata: > > External references: > - VCS references for Git repositories (ExternalRefType.vcs) > - Distribution references for HTTP/HTTPS/FTP archive downloads > - Homepage references from HOMEPAGE variable > > Source PURL qualifiers: > - Add ?type=source qualifier for recipe source tarballs to > distinguish them from built runtime packages > - Only applied to pkg:yocto or pkg:generic PURLs (ecosystem-specific > PURLs like pkg:npm already have their own semantics) > > Version extraction with priority chain: > - Priority 1: ;tag= parameter from SRC_URI (preferred, provides > meaningful versions like '1.2.3') > - Priority 2: fd.revision (resolved Git commit hash) > - Priority 3: SRCREV variable > - Priority 4: PV from recipe metadata > > PURL generation: > - Generate pkg:github PURLs for GitHub-hosted repositories > - Extensible via SPDX_GIT_PURL_MAPPINGS for other hosting services > - Ecosystem-specific version and PURL integration for Rust crates, > Go modules, PyPI, NPM packages > > Also add defensive error handling for download_location retrieval > and wire up extract_dependency_metadata() for non-Git sources. > > Signed-off-by: Stefano Tondo <[email protected]> > --- > meta/lib/oe/spdx30_tasks.py | 178 +++++++++++++++++++++++++----------- > 1 file changed, 126 insertions(+), 52 deletions(-) > > diff --git a/meta/lib/oe/spdx30_tasks.py b/meta/lib/oe/spdx30_tasks.py > index 78d1dfd250..b82015341b 100644 > --- a/meta/lib/oe/spdx30_tasks.py > +++ b/meta/lib/oe/spdx30_tasks.py > @@ -20,7 +20,6 @@ from datetime import datetime, timezone > from pathlib import Path > > > - > def extract_dependency_metadata(d, file_name): > """Extract ecosystem-specific PURL for dependency packages. > > @@ -573,15 +572,29 @@ def add_download_files(d, objset): > dep_version = None > dep_purl = None > add_download_files is getting pretty long, can this new functionality be split into a new function? > - # For Git repositories, extract version from SRCREV > + # Get download location for external references > + download_location = None > + try: > + download_location = oe.spdx_common.fetch_data_to_uri(fd, > fd.name) > + except Exception as e: > + bb.debug(1, f"Could not get download location for > {file_name}: {e}") > + > + # For Git repositories, extract version from SRCREV or tag > if fd.type == "git": > srcrev = None > > - # Try to get SRCREV for this specific source URL > + # Prefer ;tag= parameter from SRC_URI > + if hasattr(fd, 'parm') and fd.parm and 'tag' in fd.parm: > + tag = fd.parm['tag'] > + if tag and tag not in ['${AUTOREV}', 'AUTOINC', > 'INVALID']: > + dep_version = tag[1:] if tag.startswith('v') else tag > + version_source = "tag" This feels like a little too much of a heuristic to me (mapping a tag to a version); Do we need to do this? The SRCREV seems more precise > + # Try fd.revision for resolved SRCREV > # Note: fd.revision (not fd.revisions) contains the resolved > revision > - if hasattr(fd, 'revision') and fd.revision: > + if not dep_version and hasattr(fd, 'revision') and > fd.revision: > srcrev = fd.revision > bb.debug(1, f"SPDX: Found fd.revision for {file_name}: > {srcrev}") > + version_source = "fd.revision" > > # Note: We intentionally do NOT fall back to > d.getVar('SRCREV') > # because referencing SRCREV in BBIMPORTS-registered module > code > @@ -590,65 +603,127 @@ def add_download_files(d, objset): > # "AUTOREV/SRCPV set too late" errors for non-git temp > recipes > # used by recipetool/devtool with HTTP sources. > # fd.revision is always available for git sources after > fetch. > - if srcrev and srcrev not in ['${AUTOREV}', 'AUTOINC', > 'INVALID']: > - # Use first 12 characters of Git commit as version > (standard Git short hash) > + if not dep_version and srcrev and srcrev not in > ['${AUTOREV}', 'AUTOINC', 'INVALID']: > dep_version = srcrev[:12] if len(srcrev) >= 12 else > srcrev > - bb.debug(1, f"SPDX: Extracted Git version for > {file_name}: {dep_version}") > - > - # Generate PURL for Git hosting services > - # Reference: > https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst > - download_location = oe.spdx_common.fetch_data_to_uri(fd, > fd.name) > - if download_location and > download_location.startswith('git+'): > - git_url = download_location[4:] # Remove 'git+' > prefix > - > - # Build Git PURL handlers from default + custom > mappings > - # Format: 'domain': ('purl_type', lambda to extract > path) > - # Can be extended in meta-siemens or other layers > via SPDX_GIT_PURL_MAPPINGS > - git_purl_handlers = { > - 'github.com': ('pkg:github', lambda parts: > f"{parts[0]}/{parts[1].replace('.git', '')}" if len(parts) >= 2 else None), > - # Note: pkg:gitlab is NOT in official PURL spec, > so we omit it by default > - # Other Git hosts can be added via > SPDX_GIT_PURL_MAPPINGS > - } > - > - # Allow layers to extend PURL mappings via > SPDX_GIT_PURL_MAPPINGS variable > - # Format: "domain1:purl_type1 domain2:purl_type2" > - # Example: SPDX_GIT_PURL_MAPPINGS = > "gitlab.com:pkg:gitlab git.example.com:pkg:generic" > - custom_mappings = d.getVar('SPDX_GIT_PURL_MAPPINGS') > - if custom_mappings: > - for mapping in custom_mappings.split(): > - try: > - domain, purl_type = mapping.split(':') > - # Use simple path handler for custom > domains > - git_purl_handlers[domain] = (purl_type, > lambda parts: f"{parts[0]}/{parts[1].replace('.git', '')}" if len(parts) >= 2 > else None) > - bb.debug(2, f"SPDX: Added custom Git > PURL mapping: {domain} -> {purl_type}") > - except ValueError: > - bb.warn(f"SPDX: Invalid > SPDX_GIT_PURL_MAPPINGS entry: {mapping} (expected format: domain:purl_type)") > - > - for domain, (purl_type, path_handler) in > git_purl_handlers.items(): > - if f'://{domain}/' in git_url or f'//{domain}/' > in git_url: > - # Extract path after domain > - path_start = git_url.find(f'{domain}/') + > len(f'{domain}/') > - path = git_url[path_start:].split('/') > - purl_path = path_handler(path) > - if purl_path: > - dep_purl = > f"{purl_type}/{purl_path}@{srcrev}" > - bb.debug(1, f"SPDX: Generated > {purl_type} PURL: {dep_purl}") > - break > - > - # Fallback: use parent package version if no other version found > + bb.debug(1, f"Extracted Git version for {file_name}: > {dep_version} (from {version_source})") > + > + # Generate PURL for Git hosting services > + # Reference: > https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst > + if dep_version and download_location and > isinstance(download_location, str) and download_location.startswith('git+'): > + git_url = download_location[4:] # Remove 'git+' prefix > + > + # Default Git PURL handler (github.com) > + git_purl_handlers = { > + 'github.com': ('pkg:github', lambda parts: > f"{parts[0]}/{parts[1].replace('.git', '')}" if len(parts) >= 2 else None), > + # Note: pkg:gitlab is NOT in official PURL spec, so > we omit it by default > + } > + > + # Custom PURL mappings from SPDX_GIT_PURL_MAPPINGS > + # Format: "domain1:purl_type1 domain2:purl_type2" > + # Example: SPDX_GIT_PURL_MAPPINGS = > "gitlab.com:pkg:gitlab git.example.com:pkg:generic" > + custom_mappings = d.getVar('SPDX_GIT_PURL_MAPPINGS') > + if custom_mappings: > + for mapping in custom_mappings.split(): > + try: > + domain, purl_type = mapping.split(':') > + git_purl_handlers[domain] = (purl_type, > lambda parts: f"{parts[0]}/{parts[1].replace('.git', '')}" if len(parts) >= 2 > else None) > + bb.debug(2, f"Added custom Git PURL mapping: > {domain} -> {purl_type}") > + except ValueError: > + bb.warn(f"Invalid SPDX_GIT_PURL_MAPPINGS > entry: {mapping} (expected format: domain:purl_type)") > + > + for domain, (purl_type, path_handler) in > git_purl_handlers.items(): > + if f'://{domain}/' in git_url or f'//{domain}/' in > git_url: > + path_start = git_url.find(f'{domain}/') + > len(f'{domain}/') > + path = git_url[path_start:].split('/') > + purl_path = path_handler(path) > + if purl_path: > + purl_version = dep_version if version_source > == "tag" else (srcrev if srcrev else dep_version) > + dep_purl = > f"{purl_type}/{purl_path}@{purl_version}" > + bb.debug(1, f"Generated {purl_type} PURL: > {dep_purl}") > + break > + > + # Fallback to recipe PV > if not dep_version: > pv = d.getVar('PV') > if pv and pv not in ['git', 'AUTOINC', 'INVALID', '${PV}']: > dep_version = pv > - bb.debug(1, f"SPDX: Using parent PV for {file_name}: > {dep_version}") > + # Non-Git: try ecosystem-specific PURL > + if fd.type != "git": > + ecosystem_version, ecosystem_purl = > extract_dependency_metadata(d, file_name) > + > + if ecosystem_version and not dep_version: > + dep_version = ecosystem_version > + if ecosystem_purl and not dep_purl: > + dep_purl = ecosystem_purl > + bb.debug(1, f"Generated ecosystem PURL for {file_name}: > {dep_purl}") > > - # Set version and PURL if extracted > if dep_version: > dl.software_packageVersion = dep_version > > if dep_purl: > dl.software_packageUrl = dep_purl > > + # Add ?type=source qualifier for source tarballs > + if (primary_purpose == oe.spdx30.software_SoftwarePurpose.source > and > + fd.type != "git" and > + file_name.endswith(('.tar.gz', '.tar.bz2', '.tar.xz', > '.zip', '.tgz'))): Why does it have to be one of these archive formats? > + > + current_purl = dl.software_packageUrl Isn't this `dep_purl`? Why can't this code live before dl.software_packageUrl is assigned. > + if current_purl: > + purl_type = current_purl.split('/')[0] if '/' in > current_purl else '' > + if purl_type in ['pkg:yocto', 'pkg:generic']: > + source_purl = f"{current_purl}?type=source" > + dl.software_packageUrl = source_purl > + else: > + recipe_purl = oe.purl.get_base_purl(d) > + if recipe_purl: > + base_purl = recipe_purl > + source_purl = f"{base_purl}?type=source" > + dl.software_packageUrl = source_purl > + # Add external references > + > + # VCS reference for Git repositories > + if fd.type == "git" and download_location and > isinstance(download_location, str) and download_location.startswith('git+'): > + git_url = download_location[4:] # Remove 'git+' prefix > + # Clean up URL (remove commit hash if present) > + if '@' in git_url: > + git_url = git_url.split('@')[0] > + > + dl.externalRef = dl.externalRef or [] > + dl.externalRef.append( > + oe.spdx30.ExternalRef( > + externalRefType=oe.spdx30.ExternalRefType.vcs, > + locator=[git_url], > + ) > + ) > + > + # Distribution reference for tarball/archive downloads > + elif download_location and isinstance(download_location, str) > and ( > + download_location.startswith('http://') or > + download_location.startswith('https://') or > + download_location.startswith('ftp://')): > + dl.externalRef = dl.externalRef or [] > + dl.externalRef.append( > + oe.spdx30.ExternalRef( > + > externalRefType=oe.spdx30.ExternalRefType.altDownloadLocation, > + locator=[download_location], > + ) > + ) > + > + # Homepage reference if available > + homepage = d.getVar('HOMEPAGE') Does HOMEPAGE apply to all downloads? I'm not sure. > + if homepage: > + homepage = homepage.strip() > + dl.externalRef = dl.externalRef or [] > + # Only add if not already added as distribution reference > + if not any(homepage in ref.locator for ref in > dl.externalRef): > + dl.externalRef.append( > + oe.spdx30.ExternalRef( > + > externalRefType=oe.spdx30.ExternalRefType.altWebPage, > + locator=[homepage], > + ) > + ) > + > if fd.method.supports_checksum(fd): > # TODO Need something better than hard coding this > for checksum_id in ["sha256", "sha1"]: > @@ -665,7 +740,6 @@ def add_download_files(d, objset): > ) > ) > > - inputs.add(dl) > > return inputs > > -- > 2.53.0 > > > >
-=-=-=-=-=-=-=-=-=-=-=- Links: You receive all messages sent to this group. View/Reply Online (#232622): https://lists.openembedded.org/g/openembedded-core/message/232622 Mute This Topic: https://lists.openembedded.org/mt/118170498/21656 Group Owner: [email protected] Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [[email protected]] -=-=-=-=-=-=-=-=-=-=-=-
