File: run-benchmark

package info (click to toggle)
awscli 2.31.35-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 156,692 kB
sloc: python: 213,816; xml: 14,082; makefile: 189; sh: 178; javascript: 8
file content (224 lines) | stat: -rwxr-xr-x 7,088 bytes
#!/usr/bin/env python
"""Script to benchmark several high level cli commands.

As of now this benchmarks `cp` and `rm` with test cases for multiple 4kb files
(default 10000 files) and a single large file (default 10gb, `cp` only).
"""

import argparse
import inspect
import json
import os
import platform
import random
import shutil
from datetime import datetime
from subprocess import PIPE, Popen, check_call

import s3transfer

import awscli

# Bucket used for the benchmark transfers; None when PERF_TEST_BUCKET is unset.
TEST_BUCKET = os.getenv('PERF_TEST_BUCKET')
# Directory three levels above this script's absolute path.
REPO_ROOT = os.path.abspath(
    os.path.join(__file__, os.pardir, os.pardir, os.pardir)
)
# Scratch area for generated test files and per-run results; can be
# overridden with the PERF_WORKDIR environment variable.
WORKDIR = os.getenv('PERF_WORKDIR', os.path.join(REPO_ROOT, 'workdir'))
# Names used both for the local subdirectories of WORKDIR and for the
# matching prefixes under the test bucket.
MANY_FILES_DIR = 'many'
LARGE_FILE_DIR = 'large'


def run(command):
    """Execute *command* through the shell and return its exit status.

    ``check_call`` raises ``CalledProcessError`` for a non-zero exit status,
    so a normal return always yields ``0``.
    """
    exit_status = check_call(command, shell=True)
    return exit_status


def generate_run_id():
    """Return an identifier for one benchmark run.

    The id is the current local time formatted as ``YYYY-MM-DD-HH-MM-SS-``
    followed by a random integer between 1 and 10000 (inclusive).
    """
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-")
    suffix = random.randint(1, 10000)
    return f'{timestamp}{suffix}'


def initialize_files(num_files, file_size):
    """Generate the local test files under WORKDIR when they are missing.

    Two directories are populated with the ``caf gen`` command:

    * ``WORKDIR/MANY_FILES_DIR`` with up to ``num_files`` files of 4kb each.
    * ``WORKDIR/LARGE_FILE_DIR`` with a single file of size ``file_size``.

    A directory that already exists is left untouched.

    :param num_files: Value passed to ``caf gen --max-files`` for the
        many-files case.
    :param file_size: Value passed to ``caf gen --file-size`` for the
        large-file case (e.g. ``'10gb'``).
    """
    # TODO: We probably need to recreate these files each time.
    # Because you can specify --num-files and --large-file-size
    # those arguments can be potentially ignored if you've run
    # this previously with different values.
    many_files_dir = os.path.join(WORKDIR, MANY_FILES_DIR)
    _generate_files_if_missing(many_files_dir, '4kb', num_files)

    large_file_dir = os.path.join(WORKDIR, LARGE_FILE_DIR)
    _generate_files_if_missing(large_file_dir, file_size, 1)


def _generate_files_if_missing(directory, file_size, max_files):
    """Create *directory* and fill it via ``caf gen`` unless it exists."""
    if os.path.exists(directory):
        return
    os.makedirs(directory)
    try:
        run(
            f'caf gen --file-size {file_size} --max-files {max_files} --directory {directory}'
        )
    except BaseException:
        # The existence of the directory is what marks the files as
        # generated. If generation fails or is interrupted, remove the
        # directory so the next run regenerates the files instead of
        # benchmarking an empty or partially filled directory.
        shutil.rmtree(directory, ignore_errors=True)
        raise


def write_metadata_file(filename):
    """Write the collected run metadata to *filename* as indented JSON."""
    run_metadata = _collect_metadata()
    with open(filename, 'w') as metadata_file:
        serialized = json.dumps(run_metadata, indent=2)
        metadata_file.write(serialized)


def _collect_metadata():
    """Build the dictionary describing the environment of this run.

    Records the Python version and the OS name/release, plus the version
    and git information of the ``awscli`` and ``s3transfer`` packages, so
    that results from different runs can be compared meaningfully.
    """
    info = {'python_version': platform.python_version()}
    info['os'] = f'{platform.system()}/{platform.release()}'
    for package in (awscli, s3transfer):
        _inject_package_info(package, info)
    return info


def _inject_package_info(package, metadata):
    """Add ``<name>_version`` and ``<name>_git_version`` to *metadata*.

    ``<name>`` is ``package.__name__``; *metadata* is modified in place.
    """
    prefix = package.__name__
    metadata[f'{prefix}_version'] = package.__version__
    metadata[f'{prefix}_git_version'] = _get_git_version(package)


def _get_git_version(package):
    """Describe the git state of the directory *package* was loaded from.

    Runs ``git rev-parse`` in the directory containing the file reported by
    ``inspect.getfile(package)``.

    :param package: An imported module or package object.
    :return: A string of the form ``'<sha> (<branch>)'``. A part is empty
        when the corresponding git command writes nothing to stdout.
    """
    dname = os.path.dirname(inspect.getfile(package))

    def rev_parse(arguments):
        # Popen without a text mode yields bytes on Python 3. Decode them so
        # the result is formatted as plain text rather than as "b'...'".
        raw_output = Popen(
            'git rev-parse ' + arguments,
            cwd=dname,
            shell=True,
            stdout=PIPE,
        ).communicate()[0]
        return raw_output.decode('utf-8', 'replace').strip()

    git_sha = rev_parse('HEAD')
    git_branch = rev_parse('--abbrev-ref HEAD')
    return '%s (%s)' % (git_sha, git_branch)


def main(args):
    """Run the whole benchmark suite for the parsed command line *args*.

    Generates the local test files if needed, creates a fresh results
    directory under ``WORKDIR/results/<run id>``, writes ``metadata.json``
    into it and runs the benchmarks. If benchmarking raises an exception,
    the results directory is removed and the exception is re-raised.
    """
    initialize_files(args.num_files, args.large_file_size)

    run_name = generate_run_id()
    output_dir = os.path.join(WORKDIR, 'results', run_name)
    os.makedirs(output_dir)

    metadata_path = os.path.join(output_dir, 'metadata.json')
    write_metadata_file(metadata_path)

    try:
        benchmark(args.bucket, output_dir, args.num_iterations)
        print(f"RUN ID: {run_name}")
    except Exception:
        # Do not leave results of a failed run behind.
        shutil.rmtree(output_dir)
        raise


def benchmark(bucket, results_dir, num_iterations=1):
    """Run the upload/download/delete benchmarks against *bucket*.

    The ``benchmark-cp`` and ``benchmark-rm`` scripts are looked up in the
    ``performance`` directory under the parent of this script's directory
    (symlinks resolved). Each test case gets its own subdirectory of
    *results_dir*, which is passed to the script as ``--result-dir``.

    :param bucket: S3 location prefix; the test case directory name is
        appended to it after a ``/``.
    :param results_dir: Existing directory in which the per-case result
        directories are created.
    :param num_iterations: Value passed as ``--num-iterations`` to every
        benchmark script.
    """
    scripts_root = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__))
    )
    perf_dir = os.path.join(scripts_root, 'performance')
    benchmark_cp = os.path.join(perf_dir, 'benchmark-cp')
    benchmark_rm = os.path.join(perf_dir, 'benchmark-rm')

    def make_case_dir(case_name):
        # Every test case writes into its own, newly created directory.
        case_dir = os.path.join(results_dir, case_name)
        os.makedirs(case_dir)
        return case_dir

    def copy_case(case_name, source, dest, no_cleanup=False):
        case_dir = make_case_dir(case_name)
        command = (
            f'{benchmark_cp} --recursive --num-iterations {num_iterations} '
            f'--source {source} --dest {dest} --result-dir {case_dir}'
        )
        if no_cleanup:
            command += ' --no-cleanup'
        run(command)

    def remove_case(case_name, target):
        case_dir = make_case_dir(case_name)
        run(
            f'{benchmark_rm} --recursive --num-iterations {num_iterations} '
            f'--target {target} --result-dir {case_dir}'
        )

    # Many small files: upload, download, then delete.
    many_remote = bucket + '/' + MANY_FILES_DIR
    many_local = os.path.join(WORKDIR, MANY_FILES_DIR)
    try:
        copy_case('upload-10k-small', many_local, many_remote, no_cleanup=True)
        copy_case('download-10k-small', many_remote, many_local)
        remove_case('delete-10k-small', many_remote)
    finally:
        # Note that the delete-10k-small benchmark restores
        # the files it's deleted once the script is finished.
        # Therefore we need to explicitly cleanup any files
        # we've created.
        run('aws s3 rm --recursive ' + many_remote)

    # Single large file: upload and download only.
    large_remote = bucket + '/' + LARGE_FILE_DIR
    large_local = os.path.join(WORKDIR, LARGE_FILE_DIR)
    try:
        copy_case('upload-10gb', large_local, large_remote, no_cleanup=True)
        copy_case('download-10gb', large_remote, large_local)
    finally:
        # Not benchmarking a single rm call since it's just a single call
        run('aws s3 rm --recursive ' + large_remote)


def s3_uri(value):
    """Return *value* as an S3 URI, adding the ``s3://`` scheme if absent."""
    scheme = 's3://'
    if value.startswith(scheme):
        return value
    return scheme + value


if __name__ == "__main__":
    # Command line entry point: declare the options, then hand the parsed
    # namespace to main().
    parser = argparse.ArgumentParser()

    # How often each benchmark case is repeated.
    parser.add_argument(
        '-n', '--num-iterations',
        default=10,
        type=int,
        help='The number of times to run each test.',
    )

    # The bucket only has to be given explicitly when the environment
    # variable did not provide one.
    parser.add_argument(
        '-b', '--bucket',
        type=s3_uri,
        default=TEST_BUCKET,
        required=TEST_BUCKET is None,
        help=(
            'The bucket to use for testing as an s3 uri. '
            'This can also be set by the environment variable '
            'PERF_TEST_BUCKET. '
            'If the environment variable is not set, '
            'then this argument is required.'
        ),
    )

    # Sizing of the generated test files.
    parser.add_argument(
        '--num-files',
        type=int,
        default=10000,
        help='The number of files to use for the multiple file case.',
    )
    parser.add_argument(
        '--large-file-size',
        default='10gb',
        help=(
            'The file size for the large file case. '
            'This can be in the form 10gb, 4kb, etc.'
        ),
    )

    main(parser.parse_args())