File: extract_ir_lib.py

package info (click to toggle)
llvm-toolchain-20 1%3A20.1.6-1~exp1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 2,111,304 kB
  • sloc: cpp: 7,438,677; ansic: 1,393,822; asm: 1,012,926; python: 241,650; f90: 86,635; objc: 75,479; lisp: 42,144; pascal: 17,286; sh: 10,027; ml: 5,082; perl: 4,730; awk: 3,523; makefile: 3,349; javascript: 2,251; xml: 892; fortran: 672
file content (417 lines) | stat: -rw-r--r-- 15,446 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Library functions for IR extraction."""

import os
import pathlib
import re
import shutil
import subprocess
import multiprocessing
import functools
import json
import logging

from typing import Dict, List, Optional

# Placeholder value stored under the "global_command_override" key of the
# corpus description when the corpus comes from a local ThinLTO build.
_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]


# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
# a \0-separated list of strings to a \n-separated one.
def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
    """Decide whether a module is kept, based on its command line.

    Args:
      cmdline: the command line as one string whose arguments are separated
        by NUL characters.
      match_regexp: regular expression searched for in each argument, or None
        to accept every module.

    Returns:
      True when no filter is given or when at least one argument contains a
      match for match_regexp; False otherwise.
    """
    if match_regexp is None:
        return True
    for argument in cmdline.split("\0"):
        # A successful search is equivalent to findall returning a non-empty
        # list.
        if re.search(match_regexp, argument) is not None:
            return True
    return False


def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
    """Locate the ThinLTO index file named on a compiler command line.

    Args:
      cmdline: the command line as one string whose arguments are separated
        by NUL characters.
      basedir: directory that the path given to the option is joined onto.

    Returns:
      basedir joined with the value of the first argument that starts with
      "-fthinlto-index" and carries an "=<path>" part, or None if there is no
      such argument.
    """
    for option in cmdline.split("\0"):
        if not option.startswith("-fthinlto-index"):
            continue
        # Split on the first "=" only, so a path that itself contains "=" is
        # kept whole. An option without "=" names no path and is skipped
        # instead of raising IndexError.
        _, separator, index_path = option.partition("=")
        if separator:
            return os.path.join(basedir, index_path)
    return None


class TrainingIRExtractor:
    """IR and command line extraction from an object file.

    Each instance describes one object file, identified by its path relative
    to a base build directory, and knows where the matching outputs belong
    underneath an output directory: `<module>.cmd`, `<module>.bc` and
    `<module>.thinlto.bc`.
    """

    def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
        """Set up a TrainingIRExtractor.

        Args:
          obj_relative_path: relative path to the input object file. It will be also
            used to construct the absolute path of the output IR and cmd files, by
            appending it to output_base_dir.
          output_base_dir: the directory under which the output will be produced.
          obj_base_dir: the base directory for all the input object files. None
            is treated as the empty string.
        """
        self._obj_relative_path = obj_relative_path
        self._output_base_dir = output_base_dir
        self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""

    def obj_base_dir(self):
        """Return the base directory of the input object files."""
        return self._obj_base_dir

    def output_base_dir(self):
        """Return the directory under which outputs are produced."""
        return self._output_base_dir

    def relative_output_path(self):
        """Return the output path relative to the output base directory.

        This is the same string as the relative path of the input object.
        """
        return self._obj_relative_path

    def input_obj(self):
        """Return the path of the input object file."""
        return os.path.join(self.obj_base_dir(), self._obj_relative_path)

    def lld_src_bc(self):
        """Return the path of the post-import bitcode file saved by lld."""
        # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
        # IR bitcode saved by lld. It is hardcoded into lld.
        return os.path.join(
            self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
        )

    def lld_src_thinlto(self):
        """Return the path of the ThinLTO index file next to the object."""
        return os.path.join(self._obj_base_dir, self._obj_relative_path + ".thinlto.bc")

    def dest_dir(self):
        """Return the output directory that receives this module's files."""
        return os.path.join(
            self.output_base_dir(), os.path.dirname(self._obj_relative_path)
        )

    def module_name(self):
        """Return the file name component of the object's relative path."""
        return os.path.basename(self._obj_relative_path)

    def cmd_file(self):
        """Return the output path of the extracted command line file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".cmd")

    def bc_file(self):
        """Return the output path of the extracted bitcode file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".bc")

    def thinlto_index_file(self):
        """Return the output path of the copied ThinLTO index file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")

    def _get_extraction_cmd_command(
        self, llvm_objcopy_path: str, cmd_section_name: str
    ):
        """Build the llvm-objcopy argument list that, when invoked, dumps the
        command line section of the input object into self.cmd_file().
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
            self.input_obj(),
            # Final positional argument; presumably discards the copied object.
            "/dev/null",
        ]

    def _get_extraction_bc_command(
        self, llvm_objcopy_path: str, bitcode_section_name: str
    ):
        """Build the llvm-objcopy argument list that, when invoked, dumps the
        bitcode section of the input object into self.bc_file().
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
            self.input_obj(),
            # Final positional argument; presumably discards the copied object.
            "/dev/null",
        ]

    def _extract_clang_artifacts(
        self,
        llvm_objcopy_path: str,
        cmd_filter: Optional[str],
        is_thinlto: bool,
        cmd_section_name: str,
        bitcode_section_name: str,
    ) -> Optional[str]:
        """Run llvm-objcopy to extract the .bc and command line.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary to invoke.
          cmd_filter: regular expression matched against the extracted command
            line; modules that do not match are skipped. None disables
            filtering.
          is_thinlto: when True, the ThinLTO index file named by the
            -fthinlto-index option of the command line is copied next to the
            other outputs.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The module's relative output path on success. None when the input
          object does not exist, the module is filtered out, the command line
          names no ThinLTO index although one is required, or llvm-objcopy
          fails.
        """
        if not os.path.exists(self.input_obj()):
            logging.info("%s does not exist.", self.input_obj())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)
        try:
            subprocess.check_output(
                self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
            if cmd_filter is not None or is_thinlto:
                with open(self.cmd_file(), encoding="utf-8") as f:
                    lines = f.readlines()
                assert len(lines) == 1
                cmdline = lines[0]
                if not should_include_module(cmdline, cmd_filter):
                    logging.info(
                        "Excluding module %s because it does not match the filter",
                        self.input_obj(),
                    )
                    os.remove(self.cmd_file())
                    return None
                if is_thinlto:
                    index_file = get_thinlto_index(cmdline, self.obj_base_dir())
                    if index_file is None:
                        # Passing None to shutil.copy raises a TypeError that
                        # the handler below does not catch. Skip the module
                        # instead, the same way a filtered-out module is
                        # skipped.
                        logging.warning(
                            "%s was not processed: its command line has no "
                            "-fthinlto-index option",
                            self.input_obj(),
                        )
                        os.remove(self.cmd_file())
                        return None
                    shutil.copy(index_file, self.thinlto_index_file())

            subprocess.check_output(
                self._get_extraction_bc_command(
                    llvm_objcopy_path, bitcode_section_name
                ),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
        except subprocess.CalledProcessError as e:
            # This may happen if the .o file was built from asm (.S source).
            logging.warning("%s was not processed: %s", self.input_obj(), e)
            logging.info(e.output)
            return None
        assert (
            os.path.exists(self.cmd_file())
            and os.path.exists(self.bc_file())
            and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
        )
        return self.relative_output_path()

    def _extract_lld_artifacts(self) -> Optional[str]:
        """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.

        Returns:
          The object's relative path once both files have been copied into
          the output directory, or None if either source file is missing.
        """
        if not os.path.exists(self.lld_src_bc()):
            logging.info("%s does not exist.", self.lld_src_bc())
            return None
        if not os.path.exists(self.lld_src_thinlto()):
            logging.info("%s does not exist.", self.lld_src_thinlto())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)

        # Copy over the files
        shutil.copy(self.lld_src_bc(), self.bc_file())
        shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())

        assert os.path.exists(self.bc_file())
        assert os.path.exists(self.thinlto_index_file())
        return self._obj_relative_path

    def extract(
        self,
        llvm_objcopy_path: Optional[str] = None,
        cmd_filter: Optional[str] = None,
        thinlto_build: Optional[str] = None,
        cmd_section_name: Optional[str] = ".llvmcmd",
        bitcode_section_name: Optional[str] = ".llvmbc",
    ) -> Optional[str]:
        """Extract this module's artifacts into the output directory.

        Args:
          llvm_objcopy_path: path of llvm-objcopy; not used when thinlto_build
            is "local".
          cmd_filter: regular expression used to select modules by command
            line, or None to keep all; not used when thinlto_build is "local".
          thinlto_build: "local" copies the files saved by lld, "distributed"
            additionally copies the ThinLTO index named on the command line,
            and any other value extracts only the command line and bitcode.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The module's relative output path, or None if nothing was extracted.
        """
        if thinlto_build == "local":
            return self._extract_lld_artifacts()
        return self._extract_clang_artifacts(
            llvm_objcopy_path=llvm_objcopy_path,
            cmd_filter=cmd_filter,
            is_thinlto=thinlto_build == "distributed",
            cmd_section_name=cmd_section_name,
            bitcode_section_name=bitcode_section_name,
        )


def convert_compile_command_to_objectfile(
    command: Dict[str, str], output_dir: str
) -> Optional[TrainingIRExtractor]:
    """Create an extractor for one entry of a compile_commands.json array.

    Args:
      command: one compilation database entry. It must have a "directory"
        key and either an "arguments" list or a "command" string.
      output_dir: the directory under which the extracted files are placed.

    Returns:
      A TrainingIRExtractor for the path following "-o" in the command,
      relative to the entry's directory. None when the entry has neither
      "arguments" nor "command", has no "-o" option, or has "-o" as its last
      argument.
    """
    # Imported locally so the function does not rely on the module-level
    # import block.
    import shlex

    obj_base_dir = command["directory"]
    if "arguments" in command:
        cmd_parts = command["arguments"]
    elif "command" in command:
        # "command" is a single shell-escaped string, so honour quoting
        # (e.g. paths with spaces). Fall back to plain whitespace splitting
        # if the string cannot be parsed as shell syntax.
        try:
            cmd_parts = shlex.split(command["command"])
        except ValueError:
            cmd_parts = command["command"].split()
    else:
        logging.info("compile_commands element has no command and arguments")
        return None

    try:
        obj_index = cmd_parts.index("-o") + 1
    except ValueError:
        # This could happen if there are non-clang commands in compile_commands.json
        logging.info("Command has no -o option: %s", " ".join(cmd_parts))
        return None
    if obj_index >= len(cmd_parts):
        # A trailing "-o" names no output file; skip the entry rather than
        # raising IndexError.
        logging.info("Command has -o but no output path: %s", " ".join(cmd_parts))
        return None
    obj_rel_path = cmd_parts[obj_index]
    # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
    return TrainingIRExtractor(
        obj_relative_path=obj_rel_path,
        output_base_dir=output_dir,
        obj_base_dir=obj_base_dir,
    )


def load_from_compile_commands(
    json_array: List[Dict[str, str]], output_dir: str
) -> List[TrainingIRExtractor]:
    """Build extractors from the entries of a compile_commands.json array.

    Args:
      json_array: the compilation database entries.
      output_dir: the directory under which the extracted files are placed.

    Returns:
      One TrainingIRExtractor per entry that could be converted, in input
      order.
    """
    extractors = []
    for entry in json_array:
        extractor = convert_compile_command_to_objectfile(entry, output_dir)
        # Entries that yield None (e.g. non-clang commands in the .json) are
        # dropped.
        if extractor is not None:
            extractors.append(extractor)
    return extractors


def load_from_lld_params(
    params_array: List[str], obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an ObjectFile array based on lld's parameters.

    Args:
      params_array: the linker parameters. The list is not modified.
      obj_base_dir: the base directory that object paths are relative to.
      output_dir: the directory under which the extracted files are placed.

    Returns:
      One TrainingIRExtractor per object file. When "-o" is present, it and
      the argument after it are ignored, and every remaining parameter that
      does not start with "-" and ends in ".o" counts as an object file.
      When "-o" is absent, every parameter is treated as an object file.
    """
    try:
        minus_o_idx = params_array.index("-o")
    except ValueError:
        logging.info("This params file does not have an explicit -o option.")
        just_obj_paths = params_array
    else:
        # Yank out -o and the output by building a new list. Deleting from
        # params_array in place would silently alter the caller's data.
        remaining = params_array[:minus_o_idx] + params_array[minus_o_idx + 2 :]
        # After that, anything not starting with '-', and ending in a '.o',
        # is an object file.
        just_obj_paths = [
            o for o in remaining if not o.startswith("-") and o.endswith(".o")
        ]

    return [
        TrainingIRExtractor(
            obj_relative_path=obj_file,
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )
        for obj_file in just_obj_paths
    ]


def load_from_directory(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array by globbing an entire directory.

    Args:
      obj_base_dir: The base build directory that is searched recursively for
        "*.o" files; each object's path is recorded relative to it.
      output_dir: The output directory where extracted .bc and .cmd files should
        be placed.

    Returns:
      One TrainingIRExtractor per object file found.
    """
    extractors = []
    for found in pathlib.Path(obj_base_dir).glob("**/*.o"):
        relative_obj = os.path.relpath(str(found), start=obj_base_dir)
        extractors.append(
            TrainingIRExtractor(
                obj_relative_path=relative_obj,
                output_base_dir=output_dir,
                obj_base_dir=obj_base_dir,
            )
        )
    return extractors


def load_for_lld_thinlto(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create extractors for the post-import bitcode files saved by lld.

    Args:
      obj_base_dir: The base build directory that is searched recursively for
        "*.3.import.bc" files.
      output_dir: The output directory under which extracted files are placed.

    Returns:
      One TrainingIRExtractor per file found, whose relative path is the
      file's path relative to obj_base_dir with the ".3.import.bc" suffix
      removed.
    """
    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
    # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
    # are also emitted next to the postimport bitcode, with the suffix
    # .thinlto.bc instead
    suffix = ".3.import.bc"
    extractors = []
    for found in pathlib.Path(obj_base_dir).glob("**/*" + suffix):
        relative_bc = os.path.relpath(str(found), start=obj_base_dir)
        extractors.append(
            TrainingIRExtractor(
                # Cut away the suffix to recover the object's relative path.
                obj_relative_path=relative_bc[: -len(suffix)],
                output_base_dir=output_dir,
                obj_base_dir=obj_base_dir,
            )
        )
    return extractors


def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
    """Creates an object file array by looking at the JSON output of bazel aquery.

    Args:
      aquery_json: The JSON-formatted output of the bazel aquery command for
        the target of interest. The bazel aquery JSON should be a JSON
        serialized version of the analysis.ActionGraphContainer proto.
        https://github.com/bazelbuild/bazel/blob/master/src/main/protobuf/analysis_v2.proto
      obj_base_dir: The base build directory that all object files will be
        written out as relative to.
      output_dir: The output directory where extracted .bc and .cmd files should
        be placed.

    Returns:
      The extractors built from the arguments of the last "CppLink" action,
      or from an empty parameter list if there is no such action.
    """
    link_argument_lists = [
        action["arguments"]
        for action in aquery_json["actions"]
        if action["mnemonic"] == "CppLink"
    ]
    # When several link actions exist, the last one wins.
    linker_params = link_argument_lists[-1] if link_argument_lists else []
    return load_from_lld_params(linker_params, obj_base_dir, output_dir)


def run_extraction(
    objs: List[TrainingIRExtractor],
    num_workers: int,
    llvm_objcopy_path: str,
    cmd_filter: str,
    thinlto_build: str,
    cmd_section_name: str,
    bitcode_section_name: str,
):
    """Extracts all specified object files into the corpus directory.

    Args:
      objs: The TrainingIRExtractor objects describing the object files to
        extract bitcode/commands from.
      num_workers: The number of processes in the pool that runs the
        extraction.
      llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
      cmd_filter: A regular expression used to select compilations performed
        with specific flags. Set this to None to include all compilations.
      thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
        Set this to None if the build was not done with ThinLTO.
      cmd_section_name: The name of the command line section created by the
        bitcode embedding.
      bitcode_section_name: The name of the bitcode section created by the
        bitcode embedding.

    Returns:
      A list with one entry per element of objs, in order: the result of that
      object's extract() call.
    """
    # The same options are applied to every object.
    extraction_options = {
        "llvm_objcopy_path": llvm_objcopy_path,
        "cmd_filter": cmd_filter,
        "thinlto_build": thinlto_build,
        "cmd_section_name": cmd_section_name,
        "bitcode_section_name": bitcode_section_name,
    }
    extract_one = functools.partial(
        TrainingIRExtractor.extract, **extraction_options
    )

    with multiprocessing.Pool(num_workers) as worker_pool:
        results = worker_pool.map(extract_one, objs)
        worker_pool.close()
        worker_pool.join()
    return results


def write_corpus_manifest(
    thinlto_build: str, relative_output_paths: List[str], output_dir: str
):
    """Writes corpus_description.json, which records the contents of the corpus.

    Args:
      thinlto_build: Whether or not the build was done with ThinLTO and if so,
        what kind of ThinLTO. Set this to None if the build was not performed
        with ThinLTO.
      relative_output_paths: The relative (to the corpus directory) output paths
        of all the bitcode files that should be listed; None entries are
        skipped.
      output_dir: The corpus directory where the file is written.
    """
    corpus_description = {}
    # Inserted first so global_command_override is at the top of the .json
    # after being written.
    if thinlto_build == "local":
        corpus_description["global_command_override"] = _UNSPECIFIED_OVERRIDE
    corpus_description["has_thinlto"] = thinlto_build is not None
    corpus_description["modules"] = [
        path for path in relative_output_paths if path is not None
    ]

    manifest_path = os.path.join(output_dir, "corpus_description.json")
    with open(manifest_path, "w", encoding="utf-8") as manifest:
        json.dump(corpus_description, manifest, indent=2)