diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py index 97c18e6f9..982b4bbd8 100644 --- a/vulnerabilities/improvers/__init__.py +++ b/vulnerabilities/improvers/__init__.py @@ -20,6 +20,7 @@ from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline from vulnerabilities.pipelines import remove_duplicate_advisories from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees +from vulnerabilities.pipelines.v2_improvers import compute_advisory_content_hash from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2 from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2 from vulnerabilities.pipelines.v2_improvers import ( @@ -74,5 +75,6 @@ compute_advisory_todo.ComputeToDo, collect_ssvc_trees.CollectSSVCPipeline, relate_severities.RelateSeveritiesPipeline, + compute_advisory_content_hash.ComputeAdvisoryContentHash, ] ) diff --git a/vulnerabilities/migrations/0116_advisoryv2_advisory_content_hash.py b/vulnerabilities/migrations/0116_advisoryv2_advisory_content_hash.py new file mode 100644 index 000000000..2844f034d --- /dev/null +++ b/vulnerabilities/migrations/0116_advisoryv2_advisory_content_hash.py @@ -0,0 +1,23 @@ +# Generated by Django 5.2.11 on 2026-03-11 08:46 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("vulnerabilities", "0115_impactedpackageaffecting_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="advisoryv2", + name="advisory_content_hash", + field=models.CharField( + blank=True, + help_text="A unique hash computed from the content of the advisory used to identify advisories with the same content.", + max_length=64, + null=True, + ), + ), + ] diff --git a/vulnerabilities/models.py b/vulnerabilities/models.py index 1981a2861..d1c88f285 100644 --- a/vulnerabilities/models.py +++ b/vulnerabilities/models.py @@ -3010,6 
+3010,13 @@ class AdvisoryV2(models.Model): help_text="Related advisories that are used to calculate the severity of this advisory.", ) + advisory_content_hash = models.CharField( + max_length=64, + blank=True, + null=True, + help_text="A unique hash computed from the content of the advisory used to identify advisories with the same content.", + ) + @property def risk_score(self): """ @@ -3078,35 +3085,6 @@ def get_aliases(self): """ return self.aliases.all() - def compute_advisory_content(self): - """ - Compute a unique content hash for an advisory by normalizing its data and hashing it. - - :param advisory: An Advisory object - :return: SHA-256 hash digest as content hash - """ - normalized_data = { - "summary": normalize_text(self.summary), - "impacted_packages": sorted( - [impact.to_dict() for impact in self.impacted_packages.all()], - key=lambda x: json.dumps(x, sort_keys=True), - ), - "patches": sorted( - [patch.to_patch_data().to_dict() for patch in self.patches.all()], - key=lambda x: json.dumps(x, sort_keys=True), - ), - "severities": sorted( - [sev.to_vulnerability_severity_data().to_dict() for sev in self.severities.all()], - key=lambda x: (x.get("system"), x.get("value")), - ), - "weaknesses": normalize_list([weakness.cwe_id for weakness in self.weaknesses.all()]), - } - - normalized_json = json.dumps(normalized_data, separators=(",", ":"), sort_keys=True) - content_hash = hashlib.sha256(normalized_json.encode("utf-8")).hexdigest() - - return content_hash - alias = get_aliases diff --git a/vulnerabilities/pipelines/v2_importers/alpine_linux_importer.py b/vulnerabilities/pipelines/v2_importers/alpine_linux_importer.py index 336afa298..4c176732e 100644 --- a/vulnerabilities/pipelines/v2_importers/alpine_linux_importer.py +++ b/vulnerabilities/pipelines/v2_importers/alpine_linux_importer.py @@ -193,7 +193,8 @@ def load_advisories( fixed_version_range = None try: - fixed_version_range = AlpineLinuxVersionRange.from_versions([version]) + if version: + 
fixed_version_range = AlpineLinuxVersionRange.from_versions([version]) except InvalidVersion as e: logger( f"{version!r} is not a valid AlpineVersion {e!r}", diff --git a/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py b/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py index 51e4b5e77..3987eea73 100644 --- a/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py +++ b/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py @@ -330,7 +330,7 @@ def to_version_ranges(self, versions_data, fixed_versions): "=": "=", } comparator = comparator_by_range_expression.get(range_expression) - if comparator: + if comparator and version_value and version_value not in self.ignorable_versions: constraints.append( VersionConstraint(comparator=comparator, version=SemverVersion(version_value)) ) @@ -338,11 +338,12 @@ def to_version_ranges(self, versions_data, fixed_versions): for fixed_version in fixed_versions: # The VersionConstraint method `invert()` inverts the fixed_version's comparator, # enabling inclusion of multiple fixed versions with the `affected_version_range` values. 
- constraints.append( - VersionConstraint( - comparator="=", - version=SemverVersion(fixed_version), - ).invert() - ) + if fixed_version and fixed_version not in self.ignorable_versions: + constraints.append( + VersionConstraint( + comparator="=", + version=SemverVersion(fixed_version), + ).invert() + ) return ApacheVersionRange(constraints=constraints) diff --git a/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py b/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py index 8d7cdfc9e..3b9f86d8e 100644 --- a/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py +++ b/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py @@ -35,6 +35,7 @@ class ElixirSecurityImporterPipeline(VulnerableCodeBaseImporterPipelineV2): spdx_license_expression = "CC0-1.0" license_url = "https://github.com/dependabot/elixir-security-advisories/blob/master/LICENSE.txt" repo_url = "git+https://github.com/dependabot/elixir-security-advisories" + run_once = True precedence = 200 diff --git a/vulnerabilities/pipelines/v2_importers/gitlab_importer.py b/vulnerabilities/pipelines/v2_importers/gitlab_importer.py index 7c164b1c0..2c12f3a1a 100644 --- a/vulnerabilities/pipelines/v2_importers/gitlab_importer.py +++ b/vulnerabilities/pipelines/v2_importers/gitlab_importer.py @@ -252,6 +252,7 @@ def parse_gitlab_advisory( original_advisory_text=json.dumps(gitlab_advisory, indent=2, ensure_ascii=False), ) affected_version_range = None + fixed_version_range = None fixed_versions = gitlab_advisory.get("fixed_versions") or [] affected_range = gitlab_advisory.get("affected_range") gitlab_native_schemes = set(["pypi", "gem", "npm", "go", "packagist", "conan"]) @@ -285,7 +286,8 @@ def parse_gitlab_advisory( if affected_version_range: vrc = affected_version_range.__class__ - fixed_version_range = vrc.from_versions(parsed_fixed_versions) + if parsed_fixed_versions: + fixed_version_range = vrc.from_versions(parsed_fixed_versions) if not fixed_version_range 
and not affected_version_range: return diff --git a/vulnerabilities/pipelines/v2_importers/ruby_importer.py b/vulnerabilities/pipelines/v2_importers/ruby_importer.py index 455a0a491..fad09a1b5 100644 --- a/vulnerabilities/pipelines/v2_importers/ruby_importer.py +++ b/vulnerabilities/pipelines/v2_importers/ruby_importer.py @@ -162,7 +162,9 @@ def get_affected_packages(record, purl): affected_packages = [] for unaffected_version in record.get("unaffected_versions", []): try: - affected_version_range = GemVersionRange.from_native(unaffected_version).invert() + if unaffected_version: + unaffected_version = unaffected_version.strip() + affected_version_range = GemVersionRange.from_native(unaffected_version).invert() validate_comparators(affected_version_range.constraints) affected_packages.append( AffectedPackageV2( diff --git a/vulnerabilities/pipelines/v2_improvers/compute_advisory_content_hash.py b/vulnerabilities/pipelines/v2_improvers/compute_advisory_content_hash.py new file mode 100644 index 000000000..cd91db771 --- /dev/null +++ b/vulnerabilities/pipelines/v2_improvers/compute_advisory_content_hash.py @@ -0,0 +1,65 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+#
+
+
+from aboutcode.pipeline import LoopProgress
+
+from vulnerabilities.models import AdvisoryV2
+from vulnerabilities.pipelines import VulnerableCodePipeline
+from vulnerabilities.utils import compute_advisory_content
+
+
+class ComputeAdvisoryContentHash(VulnerableCodePipeline):
+    """Backfill advisory_content_hash on AdvisoryV2 rows that do not have one yet."""
+
+    pipeline_id = "compute_advisory_content_hash_v2"
+
+    @classmethod
+    def steps(cls):
+        return (cls.compute_advisory_content_hash,)
+
+    def compute_advisory_content_hash(self):
+        """Compute and store the content hash of every AdvisoryV2 whose hash is NULL."""
+
+        advisories = AdvisoryV2.objects.filter(advisory_content_hash__isnull=True)
+
+        advisories_count = advisories.count()
+
+        self.log(
+            f"Computing advisory_content_hash for {advisories_count} Advisories",
+        )
+        progress = LoopProgress(
+            total_iterations=advisories_count,
+            logger=self.log,
+            progress_step=1,
+        )
+
+        to_update = []
+        batch_size = 5000  # shared by the iterator chunks and the bulk_update flushes
+
+        for advisory in progress.iter(advisories.iterator(chunk_size=batch_size)):
+            advisory.advisory_content_hash = compute_advisory_content(advisory)
+            to_update.append(advisory)
+
+            if len(to_update) >= batch_size:
+                AdvisoryV2.objects.bulk_update(
+                    to_update,
+                    ["advisory_content_hash"],
+                    batch_size=batch_size,
+                )
+                to_update.clear()
+
+        if to_update:  # flush the final, partially filled batch
+            AdvisoryV2.objects.bulk_update(
+                to_update,
+                ["advisory_content_hash"],
+                batch_size=batch_size,
+            )
+
+        self.log("Finished computing advisory_content_hash")
diff --git a/vulnerabilities/pipes/advisory.py b/vulnerabilities/pipes/advisory.py
index c812fdf86..00414a259 100644
--- a/vulnerabilities/pipes/advisory.py
+++ b/vulnerabilities/pipes/advisory.py
@@ -48,6 +48,7 @@
 from vulnerabilities.models import VulnerabilitySeverity
 from vulnerabilities.models import Weakness
 from vulnerabilities.pipes.univers_utils import get_exact_purls_v2
+from vulnerabilities.utils import compute_advisory_content
 
 
 def get_or_create_aliases(aliases: List) -> QuerySet:
@@ -301,6 +302,7
@@ def insert_advisory_v2(
     advisory_obj = None
     created = False
     content_id = compute_content_id_v2(advisory_data=advisory)
+    advisory_content_hash = compute_advisory_content(advisory_data=advisory)
     try:
         default_data = {
             "datasource_id": pipeline_id,
@@ -311,6 +313,7 @@ def insert_advisory_v2(
             "original_advisory_text": advisory.original_advisory_text,
             "url": advisory.url,
             "precedence": precedence,
+            "advisory_content_hash": advisory_content_hash,
         }
 
         advisory_obj, created = AdvisoryV2.objects.get_or_create(
diff --git a/vulnerabilities/utils.py b/vulnerabilities/utils.py
index 0eb1d1258..f90d42401 100644
--- a/vulnerabilities/utils.py
+++ b/vulnerabilities/utils.py
@@ -848,7 +848,11 @@ def group_advisories_by_content(advisories):
     grouped = {}
 
     for advisory in advisories:
-        content_hash = advisory.compute_advisory_content()
+        content_hash = (
+            advisory.advisory_content_hash
+            if advisory.advisory_content_hash
+            else compute_advisory_content(advisory)
+        )
 
         entry = grouped.setdefault(
             content_hash,
@@ -867,3 +871,30 @@ def group_advisories_by_content(advisories):
             entry["secondary"].add(advisory)
 
     return grouped
+
+
+def compute_advisory_content(advisory_data):
+    """
+    Compute a unique content hash for an advisory by normalizing its data and hashing it.
+
+    :param advisory_data: An AdvisoryData, or an AdvisoryV2 (converted via to_advisory_data())
+    :return: SHA-256 hex digest of the normalized, JSON-serialized advisory content
+    """
+    from vulnerabilities.models import AdvisoryV2
+
+    if isinstance(advisory_data, AdvisoryV2):
+        advisory_data = advisory_data.to_advisory_data()
+    normalized_data = {
+        "summary": normalize_text(advisory_data.summary),
+        "affected_packages": [
+            pkg.to_dict() for pkg in normalize_list(advisory_data.affected_packages) if pkg
+        ],
+        "severities": [sev.to_dict() for sev in normalize_list(advisory_data.severities) if sev],
+        "weaknesses": normalize_list(advisory_data.weaknesses),
+        "patches": [patch.to_dict() for patch in normalize_list(advisory_data.patches)],
+    }
+
+    normalized_json = json.dumps(normalized_data, separators=(",", ":"), sort_keys=True)
+    content_hash = hashlib.sha256(normalized_json.encode("utf-8")).hexdigest()
+
+    return content_hash