File: spdx_writer.py

package info (click to toggle)
chromium 120.0.6099.224-1~deb11u1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 6,112,112 kB
  • sloc: cpp: 32,907,025; ansic: 8,148,123; javascript: 3,679,536; python: 2,031,248; asm: 959,718; java: 804,675; xml: 617,256; sh: 111,417; objc: 100,835; perl: 88,443; cs: 53,032; makefile: 29,579; fortran: 24,137; php: 21,162; tcl: 21,147; sql: 20,809; ruby: 17,735; pascal: 12,864; yacc: 8,045; lisp: 3,388; lex: 1,323; ada: 727; awk: 329; jsp: 267; csh: 117; exp: 43; sed: 37
file content (213 lines) | stat: -rw-r--r-- 7,053 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# Copyright 2022 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import collections
import dataclasses
import json
import os
import pathlib
import re
from typing import Callable, DefaultDict, Tuple


class SpdxWriter:
  """Collects package license metadata and renders it as an SPDX document."""

  def __init__(self,
               root: str,
               root_package_name: str,
               root_license: str,
               link_prefix: str,
               doc_name: str = None,
               doc_namespace: str = None,
               read_file=lambda x: pathlib.Path(x).read_text(encoding='utf-8')):
    """Stores the document settings and creates the root package.

    Args:
      root: directory that license file paths are made relative to.
      root_package_name: name of the package the document describes.
      root_license: path to the root package's license file.
      link_prefix: prefix prepended to relative license paths to form URLs.
      doc_name: document name; falls back to root_package_name when falsy.
      doc_namespace: value emitted as the document namespace.
      read_file: callable returning the text of a file given its path.
    """
    self.root = root
    self.link_prefix = link_prefix
    self.doc_namespace = doc_namespace
    self.read_file = read_file
    self.doc_name = doc_name if doc_name else root_package_name

    self.root_package = _Package(root_package_name, root_license)
    # Packages are stored as dict keys (values unused): identical packages
    # collapse into one entry while insertion order is kept (py3.7+).
    self.packages = {}

  def add_package(self, name: str, license_file: str):
    """Registers a package and its license file for the SPDX output."""
    self.packages.setdefault(_Package(name, license_file))

  def write_to_file(self, file_path: str):
    """Serializes the document and stores it at file_path as UTF-8."""
    with open(file_path, 'w', encoding='utf-8') as out_file:
      out_file.write(self.write())

  def write(self) -> str:
    """Returns the SPDX document as a JSON string."""
    json_writer = _SPDXJSONWriter(root=self.root,
                                  root_package=self.root_package,
                                  link_prefix=self.link_prefix,
                                  doc_name=self.doc_name,
                                  doc_namespace=self.doc_namespace,
                                  read_file=self.read_file)

    for package in self.packages:
      json_writer.add_package(package)

    return json_writer.write()


@dataclasses.dataclass(frozen=True)
class _Package:
  """Immutable (name, license file) pair for one package in the SPDX output."""
  name: str
  file: str

  def _escape_id(self, spdx_id: str) -> str:
    """Replaces each char other than ASCII letters, digits, '-', '.' by '-'."""
    disallowed_chars = r'[^a-zA-Z0-9-\.]'
    return re.sub(disallowed_chars, '-', spdx_id)

  @property
  def license_spdx_id(self) -> str:
    """Escaped license reference id derived from the package name."""
    raw_id = f'LicenseRef-{self.name}'
    return self._escape_id(raw_id)

  @property
  def package_spdx_id(self) -> str:
    """Escaped package element id derived from the package name."""
    raw_id = f'SPDXRef-Package-{self.name}'
    return self._escape_id(raw_id)


def _get_spdx_path(root: str, license_file_path: str) -> str:
  """Get the path of a license file relative to the spdx root.

  Both paths are made absolute (which also collapses '..' and '.' segments)
  before being compared.

  Args:
    root: the spdx root directory.
    license_file_path: path to a license file expected to be under root.

  Returns:
    The part of the absolute license path that follows the absolute root.
    For any root other than the filesystem root this starts with a path
    separator.

  Raises:
    ValueError: if license_file_path is not located under root.
  """
  abs_path = os.path.abspath(license_file_path)
  abs_root = os.path.abspath(root)
  try:
    # Compare whole path components. A plain string prefix test would accept
    # '/src/foobar/LICENSE' as being under '/src/foo'.
    is_under_root = os.path.commonpath([abs_root, abs_path]) == abs_root
  except ValueError:
    # commonpath() refuses paths that can't share a prefix at all, e.g. paths
    # on different Windows drives.
    is_under_root = False
  if not is_under_root:
    raise ValueError(f'spdx root not valid. {abs_path} is not under {abs_root}')
  return abs_path[len(abs_root):]


class _SPDXJSONWriter():
  """Writes SPDX data in JSON format.

  Produce SPDX JSON output adhering to this schema:
    https://github.com/spdx/spdx-spec/blob/development/v2.2.2/schemas/spdx-schema.json
    See example:
    https://github.com/spdx/spdx-spec/blob/development/v2.2.2/examples/SPDXJSONExample-v2.2.spdx.json
  """

  def __init__(self, root: str, root_package: _Package, link_prefix: str,
               doc_name: str, doc_namespace: str,
               read_file: Callable[[str], str]):
    """Creates the document skeleton and registers the root package.

    Args:
      root: directory that license file paths are made relative to.
      root_package: the package the document describes.
      link_prefix: prefix prepended to relative license paths to form URLs.
      doc_name: value of the document's 'name' field.
      doc_namespace: value of the document's 'documentNamespace' field.
      read_file: callable returning the text of a file given its path.
    """
    self.root = root
    self.root_package_id = root_package.package_spdx_id
    self.link_prefix = link_prefix

    self.read_file = read_file

    self.content = {
        # Actually 2.2.2, but only SPDX-N.M is needed.
        'spdxVersion': 'SPDX-2.2',
        'SPDXID': 'SPDXRef-DOCUMENT',
        'name': doc_name,
        'documentNamespace': doc_namespace,
        'creationInfo': {
            'creators': [f'Tool: {os.path.basename(__file__)}'],
        },
        'dataLicense': 'CC0-1.0',
        'documentDescribes': [self.root_package_id],
        'packages': [],
        'hasExtractedLicensingInfos': [],
        'relationships': [],
    }

    # Used to dedup license files based on file path.
    self.existing_license_files = {}  # 'file path': 'licenseId'
    # Used to make sure that there are no duplicate ids. Keys are lowercased
    # ids that are already taken, values are the next numeric suffix to try.
    self.existing_package_ids = collections.defaultdict(int)
    self.existing_license_ids = collections.defaultdict(int)

    # Add the root package first so that it always receives its unsuffixed id
    # (the one already stored in 'documentDescribes').
    self.add_package(root_package)

  def write(self) -> str:
    """Returns a JSON string for the current state of the writer."""
    return json.dumps(self.content, indent=4)

  def _get_dedup_id(self, elem_id: str, id_dict: DefaultDict[str, int]) -> str:
    """Returns a unique id given a dictionary with existing ids.

    Uniqueness ignores casing: ids that differ only by letter case are treated
    as the same id. The returned id keeps the casing of elem_id.

    Args:
      elem_id: the requested id to use for the element.
      id_dict: dictionary holding already used ids. It is updated in place:
        every id returned by this method is recorded under its lowercase form.

    Returns:
      When the elem_id is already unique, return elem_id.
      When the elem_id has been used, return elem_id + '-[next num]', skipping
      any number whose resulting id has itself already been handed out.
    """
    base_key = elem_id.lower()
    if not id_dict.get(base_key):
      # First request for this id: hand it out unchanged and mark it as taken.
      id_dict[base_key] = 1
      return elem_id

    while True:
      suffix = id_dict[base_key]
      id_dict[base_key] += 1
      candidate = f'{elem_id}-{suffix}'
      candidate_key = candidate.lower()
      # The suffixed id may already belong to an element that was literally
      # named that way (e.g. a package called 'foo-1'); keep counting if so.
      if not id_dict.get(candidate_key):
        # Reserve the generated id so a later literal request for it gets its
        # own suffix instead of colliding.
        id_dict[candidate_key] = 1
        return candidate

  def _get_package_id(self, pkg: _Package) -> str:
    """Makes sure that there are no pkg id duplicates."""
    return self._get_dedup_id(pkg.package_spdx_id, self.existing_package_ids)

  def _get_license_id(self, pkg: _Package) -> Tuple[str, bool]:
    """Handles license deduplication.

    If this pkg.file has already been seen, reuse that same id instead. If
    there are two packages with the same name but different license files,
    handle deduping the names.

    Args:
      pkg: The package to get a license id for.

    Returns:
      First return value is the id, second is whether the license needs to be
        added to the SPDX doc (False if it already exists in the doc).
    """
    existing = self.existing_license_files.get(pkg.file)
    if existing:
      return existing, False

    license_id = self._get_dedup_id(pkg.license_spdx_id,
                                    self.existing_license_ids)
    self.existing_license_files[pkg.file] = license_id
    return license_id, True

  def add_package(self, pkg: _Package):
    """Writes a package to the file (package metadata).

    Appends the package entry, a CONTAINS relationship from the root package
    (for every element other than the root itself) and, when the license file
    has not been seen before, the extracted license text.

    Args:
      pkg: the package to add.
    """
    pkg_id = self._get_package_id(pkg)
    license_id, need_to_add_license = self._get_license_id(pkg)

    self.content['packages'].append({
        'SPDXID': pkg_id,
        'name': pkg.name,
        'licenseConcluded': license_id,
    })

    # Compare the deduplicated id: a package whose requested id collides with
    # the root's is emitted as a separate element (with a suffix) and must
    # still be linked to the root.
    if pkg_id != self.root_package_id:
      self.content['relationships'].append({
          'spdxElementId': self.root_package_id,
          'relationshipType': 'CONTAINS',
          'relatedSpdxElement': pkg_id,
      })

    if need_to_add_license:
      self._add_license_file(pkg, license_id)

  def _add_license_file(self, pkg: _Package, license_id: str):
    """Writes a license to the file (raw license text).

    Args:
      pkg: the package whose license file is read.
      license_id: the id under which the license text is stored.

    Raises:
      ValueError: if pkg.file is not located under the spdx root.
    """
    spdx_path = _get_spdx_path(self.root, pkg.file)
    # URLs always use forward slashes, regardless of the host path separator.
    url = f'{self.link_prefix}{spdx_path.replace(os.sep, "/")}'
    self.content['hasExtractedLicensingInfos'].append({
        'name':
        f'{pkg.name}',
        'licenseId':
        license_id,
        'extractedText':
        self.read_file(pkg.file),
        'crossRefs': [{
            'url': url,
        }],
    })