1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
|
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
r"""Combine multiple training corpus into a single training corpus.
Currently this only supports the case where the corpora share the same
configurables except for the "modules" field.
Usage: we'd like to combine training corpus corpus1 and corpus2 into
combinedcorpus; we first structure the files as follows:
combinedcorpus
combinedcorpus/corpus1
combinedcorpus/corpus2
Running this script with
python3 \
compiler_opt/tools/combine_training_corpus.py \
--root_dir=$PATH_TO_combinedcorpus
generates combinedcorpus/corpus_description.json file. In this way corpus1
and corpus2 are combined into combinedcorpus.
"""
import argparse
import logging
from mlgo.corpus import combine_training_corpus_lib
from mlgo.corpus import flags
def parse_args_and_run():
    """Parse the command-line flags and pass them to ``main``.

    Registers ``--root_dir`` together with whatever options
    ``flags.add_verbosity_arguments`` adds to the parser, parses the process
    arguments, and calls ``main`` with the resulting namespace.

    If ``--root_dir`` is missing, argparse reports a usage error and exits.
    """
    parser = argparse.ArgumentParser(
        description="A tool for combining multiple training corpora"
    )
    parser.add_argument(
        "--root_dir",
        type=str,
        # The value is handed straight to combine_corpus by main(); without
        # this, omitting the flag would pass None there instead of producing
        # a clear usage error.
        required=True,
        help="The root dir of module paths to combine.",
    )
    flags.add_verbosity_arguments(parser)
    args = parser.parse_args()
    main(args)
def main(args):
    """Configure logging, then combine the corpora under ``args.root_dir``.

    Args:
        args: Parsed command-line namespace. ``args.verbosity`` is used as the
            ``level`` for ``logging.basicConfig`` and ``args.root_dir`` is
            passed to ``combine_training_corpus_lib.combine_corpus``.
    """
    log_level = args.verbosity
    logging.basicConfig(level=log_level)

    corpus_root = args.root_dir
    combine_training_corpus_lib.combine_corpus(corpus_root)
# Only parse arguments and run the tool when this file is executed as a
# script; importing it as a module has no side effects.
if __name__ == "__main__":
    parse_args_and_run()
|