File: collect_warnings.py

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (266 lines) | stat: -rwxr-xr-x 9,676 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/env python3

# Copyright 2024 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# NOTE: the docstring must be the module's first statement; previously it was
# placed after the imports, which left __doc__ as None and so the argparse
# description in parse_args() was empty.
"""
This script parses all the log files in a directory, looking for instances
of a particular warning. It collects all the ones it finds, and writes the
results to an output file, recording which files had warnings, and the
location(s) in each file. It also counts the total number of files/warnings.

It can be configured to either print a (somewhat) human-readable list of files
and locations, or a more structured json for automatic processing.

See README.md in this directory for more details.
"""

import argparse
import collections
import json
import os
import re
import sys


def parse_args(args):
    """
    Parse commandline flags. Possible options:

    Configuration options:
    log_dir :     The directory containing the log files to scrape, or just
                  a single build log.
    output :      Where the collected warning information should go. Either the
                  string "stdout" (case-insensitive) or a path to a file.
    warning_text: The text in the log indicating a warning was raised.
    summarize:    If present, we output a human-readable summary.
                  Otherwise, we output a json with more information.
    print-links:  If present, try to provide a direct link to the first warning
                  in each file on chromium codesearch.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument(
        "-l", "--log-dir", required=True, type=str,
        help="Path to the directory containing the build logs, "
             "or to a single build log.")
    parser.add_argument(
        "-o", "--output", required=True, type=str,
        help="Where the collected warning information should go. "
             "This should be either the string 'stdout', a dash "
             "(also meaning stdout), or a path to a file.\n"
             "ex. -o out.txt, -o stdout, -o -")
    parser.add_argument(
        "-w", "--warning", required=True, type=str,
        help="Text indicating the warning of interest. "
             "Should appear at the end of a line containing the "
             "filename and warning location.\n"
             "ex. -w [-Wthread-safety-reference-return]")
    parser.add_argument(
        "-s", "--summarize", action="store_true",
        help="If present, output a (somewhat) human-readable text file "
             "cataloguing the warnings. Otherwise, output a json file "
             "with more detailed information about each instance.")
    parser.add_argument(
        "-k", "--print_links", action="store_true",
        help="If present, attempt to provide direct links to codesearch for "
             "the first warning in each file. Files which don't directly "
             "correspond to anything, such as generated files, print the "
             "filename instead.")

    # Return the parsed namespace as a plain dict, keyed by option name.
    return vars(parser.parse_args(args))


# Matches the file/line/column prefix of a compiler warning line, e.g.
# |path/file.cc:123:45:| (clang-style) or |path/file.cc(123, 45):| (MSVC).
# A raw string is used so the "\)" is a regex escape, not an invalid Python
# string escape (which raises a SyntaxWarning on modern Python).
_TARGET_RE = re.compile(r'([^:(]+)(?:[:(])([0-9]+)(?::|, ?)([0-9]+)\)?:')


def make_codesearch_link(file, line):
    """
    Construct a codesearch link to the specified position in the file, to
    easily inspect the site of the warning.

    Returns `file` unchanged when it does not start with "../../" (i.e. it
    does not look like a path into the source tree).
    """
    if not file.startswith("../../"):
        # Probably a generated file, can't construct a good link automatically
        return file

    return "https://crsrc.org/{};l={}".format(file.removeprefix('../../'), line)


def extract_warning_location(line):
    """
    Given a line of the build log indicating that a warning has occurred,
    extract the file name and position of the warning (line # + col #).

    Returns a (normalized path, line number, column number) tuple, or None
    if the line does not begin with a recognizable location.
    """
    # Matches:
    # |/path/to/file(123, 45):...|, for Windows
    # |/path/to/file:123:45:...|, elsewhere
    # Captures path, line number, and column number.
    match = _TARGET_RE.match(line)
    if not match:
        return None
    path, line, col = match.groups()
    return os.path.normpath(path), int(line), int(col)


def collect_warning(summarize, print_links, log_name, log_file, collection,
                    warning_info):
    """
    Add information about a warning into our collection, avoiding
    duplicates and merging as necessary.

    `collection` is expected to be a dictionary mapping log file names to the
    warning info generated in the file (the empty list, by default).
    If we're summarizing, we just collect the line and column number of each
    warning.

    If we're not summarizing, we also store the name of the log file (so we know
    which systems the warning occurs on), and the next line of the log file
    (which contains the text of the line, in case line numbers change later.)
    """
    path, line_num, col_num = warning_info

    # If we're collecting a summary, we just need the line and column numbers
    if summarize:
        logged_info = line_num, col_num
        if logged_info not in collection[path]:
            # Haven't seen this particular warning before
            collection[path].append(logged_info)
        return

    # If we're not summarizing, we store extra info:
    # 1. The next (nonempty) line, and
    # 2. the name of the log that the warning occurred in
    # Use next()'s default so a warning at the very end of the log doesn't
    # raise StopIteration (the previous version crashed in that case); an
    # empty string is falsy and terminates the scan.
    next_line = next(log_file, "")
    while next_line and "|" not in next_line:
        next_line = next(log_file, "")
    snippet = next_line.split("|")[1].strip() if "|" in next_line else ""

    log_name = os.path.basename(log_name)
    if print_links:
        logged_info = (line_num, col_num, make_codesearch_link(path, line_num),
                       snippet, [log_name])
    else:
        logged_info = (line_num, col_num, snippet, [log_name])

    # Entries matching this line/column. Should be either a singleton or empty.
    existing_info = [
        x for x in collection[path]
        if x[0] == logged_info[0] and x[1] == logged_info[1]
    ]

    if len(existing_info) == 0:
        # Haven't seen this particular warning before
        collection[path].append(logged_info)
        return

    # If the info's already in the list, then just note the name of the log file
    # It's possible for the same warning to appear multiple times in a file
    if log_name not in existing_info[0][-1]:
        existing_info[0][-1].append(log_name)
    return


def read_file(filename, warning_text, summarize, print_links, collection,
              failures):
    """
    Scan one build log, recording every warning found in `collection`.

    Lines ending with `warning_text` whose location cannot be parsed are
    appended to `failures` instead (this shouldn't happen).
    """
    # Builder name = log filename without its extension; used to label
    # any unparseable lines.
    builder_name = os.path.splitext(os.path.basename(filename))[0]
    with open(filename) as log:
        for line in log:
            if not line.rstrip().endswith(warning_text):
                continue

            location = extract_warning_location(line)
            if location:
                collect_warning(summarize, print_links, filename, log,
                                collection, location)
            else:
                failures.append("{}: {}".format(builder_name, line))


def log_output(summarize, print_links, collection, output):
    """
    Write the results of the collection to the output.
    If a summary was requested, output a text summary.
    Otherwise, dump to json.

    `output` may be "-" or "stdout" (case-insensitive) to write to stdout,
    or a path to a file.
    """

    output_to_stdout = (output == "-" or output.lower() == "stdout")

    if output_to_stdout:
        output_file = sys.stdout
    else:
        output_file = open(output, "w")
        print("Writing output to " + os.path.abspath(output))

    # try/finally guarantees the file handle is closed even on the early
    # json.dump return path (previously that path leaked the open file).
    try:
        if not summarize:
            json.dump(collection, output_file, indent=2, sort_keys=True)
            return

        keys = list(collection.keys())
        hits = 0
        for key in sorted(keys):
            values = collection[key]
            hits += len(values)
            padding = " "
            if print_links:
                # Link to the first warning in the file.
                key = make_codesearch_link(key, values[0][0])
                padding = "\n    "
            output_file.write("{}{}({} hits): {}\n".format(key, padding,
                                                           str(len(values)),
                                                           str(values)))

        output_file.write("\nTotal Files: {}, Total Hits: {}".format(
            len(keys), hits))
    finally:
        if not output_to_stdout:
            output_file.close()


def main(args):
    """Entry point: gather warnings from the logs and report the results."""
    parsed_args = parse_args(args)

    log_dir = parsed_args["log_dir"]
    try:
        log_files = [os.path.join(log_dir, f) for f in os.listdir(log_dir)]
    except NotADirectoryError:
        # The argument named a single log file rather than a directory.
        log_files = [log_dir]

    collection = collections.defaultdict(list)
    failures = []
    for log_path in log_files:
        read_file(log_path, parsed_args["warning"], parsed_args["summarize"],
                  parsed_args["print_links"], collection, failures)

    # Sort each file's warnings by position for stable output.
    for path in list(collection):
        collection[path] = sorted(collection[path])

    log_output(parsed_args["summarize"], parsed_args["print_links"],
               collection, parsed_args["output"])

    if failures:
        sys.stderr.write(
            "\nFound lines with an unexpected format but the right ending:")
        for failure in failures:
            sys.stderr.write("\n" + failure)


if __name__ == "__main__":
    # main() returns None, so the process exits with status 0 on success.
    sys.exit(main(sys.argv[1:]))