File: cload.py

package info (click to toggle)
python-cooler 0.9.1-1
links: PTS, VCS
area: main
in suites: bookworm
size: 32,596 kB
sloc: python: 10,555; makefile: 198; sh: 31
file content (604 lines) | stat: -rw-r--r-- 17,110 bytes
import sys

import click
import h5py
import numpy as np
import pandas as pd
import simplejson as json
from cytoolz import compose
from multiprocess import Pool

from ..create import (
    HDF5Aggregator,
    PairixAggregator,
    TabixAggregator,
    aggregate_records,
    create_cooler,
    sanitize_records,
)
from . import cli, get_logger
from ._util import parse_bins, parse_field_param, parse_kv_list_param

_pandas_version = pd.__version__.split('.')
if int(_pandas_version[0]) > 0:
    from pandas.io.common import get_handle


# Copied from pairtools._headerops
def get_header(instream, comment_char='#'):
    '''Returns a header from the stream and an the reaminder of the stream
    with the actual data.
    Parameters
    ----------
    instream : a file object
        An input stream.
    comment_char : str
        The character prepended to header lines (use '@' when parsing sams,
        '#' when parsing pairsams).
    Returns
    -------
    header : list
        The header lines, stripped of terminal spaces and newline characters.
    remainder_stream : stream/file-like object
        Stream with the remaining lines.

    '''
    header = []
    if not comment_char:
        raise ValueError('Please, provide a comment char!')
    comment_byte = comment_char.encode()
    # get peekable buffer for the instream
    read_f, peek_f = None, None
    if hasattr(instream, 'buffer'):
        peek_f = instream.buffer.peek
        readline_f = instream.buffer.readline
    elif hasattr(instream, 'peek'):
        peek_f = instream.peek
        readline_f = instream.readline
    else:
        raise ValueError('Cannot find the peek() function of the provided stream!')

    current_peek = peek_f(1)
    while current_peek.startswith(comment_byte):
        # consuming a line from buffer guarantees
        # that the remainder of the buffer starts
        # with the beginning of the line.
        line = readline_f()
        if isinstance(line, bytes):
            line = line.decode()
        # append line to header, since it does start with header
        header.append(line.strip())
        # peek into the remainder of the instream
        current_peek = peek_f(1)
    # apparently, next line does not start with the comment
    # return header and the instream, advanced to the beginning of the data
    return header, instream


@cli.group()
def cload():
    """
    Create a cooler from genomic pairs and bins.

    Choose a subcommand based on the format of the input contact list.

    """
    pass


# flake8: noqa
def register_subcommand(func):
    return (
        cload.command()(
            click.argument(
                "bins",
                type=str,
                metavar="BINS")(
            click.argument(
                "pairs_path",
                type=click.Path(exists=True, allow_dash=True),
                metavar="PAIRS_PATH")(
            click.argument(
                "cool_path",
                metavar="COOL_PATH")(
            click.option(
                "--metadata",
                help="Path to JSON file containing user metadata.")(
            click.option(
                "--assembly",
                help="Name of genome assembly (e.g. hg19, mm10)")(
            func))))))
    )


def add_arg_help(func):
    func.__doc__ = func.__doc__.format("""
    BINS : One of the following

        <TEXT:INTEGER> : 1. Path to a chromsizes file, 2. Bin size in bp

        <TEXT> : Path to BED file defining the genomic bin segmentation.

    PAIRS_PATH : Path to contacts (i.e. read pairs) file.

    COOL_PATH : Output COOL file path or URI.""")
    return func


@register_subcommand
@add_arg_help
@click.option(
    "--chunksize", "-c",
    help="Control the number of pixels handled by each worker process at a time.",
    type=int,
    default=int(100e6),
    show_default=True
)
def hiclib(bins, pairs_path, cool_path, metadata, assembly, chunksize):
    """
    Bin a hiclib HDF5 contact list (frag) file.

    {}

    hiclib on BitBucket: <https://github.com/mirnylab/hiclib-legacy>.

    """

    chromsizes, bins = parse_bins(bins)

    if metadata is not None:
        with open(metadata) as f:
            metadata = json.load(f)

    with h5py.File(pairs_path, 'r') as h5pairs:
        iterator = HDF5Aggregator(h5pairs, chromsizes, bins, chunksize)
        create_cooler(
            cool_path, bins, iterator,
            metadata=metadata,
            assembly=assembly,
            ordered=True)


@register_subcommand
@add_arg_help
@click.option(
    "--nproc", "-p",
    help="Number of processes to split the work between.",
    type=int,
    default=8,
    show_default=True
)
@click.option(
    "--chrom2", "-c2",
    help="chrom2 field number (one-based)",
    type=int,
    default=4
)
@click.option(
    "--pos2", "-p2",
    help="pos2 field number (one-based)",
    type=int,
    default=5
)
@click.option(
    "--zero-based", "-0",
    help="Positions are zero-based",
    is_flag=True,
    default=False,
    show_default=True
)
@click.option(
    "--max-split", "-s",
    help="Divide the pairs from each chromosome into at most this many chunks. "
    "Smaller chromosomes will be split less frequently or not at all. "
    "Increase ths value if large chromosomes dominate the workload on "
    "multiple processors.",
    type=int,
    default=2,
    show_default=True
)
def tabix(bins, pairs_path, cool_path, metadata, assembly, nproc, zero_based, max_split, **kwargs):
    """
    Bin a tabix-indexed contact list file.

    {}

    See also: 'cooler csort' to sort and index a contact list file

    Tabix manpage: <http://www.htslib.org/doc/tabix.html>.

    """
    logger = get_logger(__name__)
    chromsizes, bins = parse_bins(bins)

    if metadata is not None:
        with open(metadata) as f:
            metadata = json.load(f)

    try:
        if nproc > 1:
            pool = Pool(nproc)
            logger.info(f"Using {nproc} cores")
            map = pool.imap

        opts = {}
        if 'chrom2' in kwargs:
            opts['C2'] = kwargs['chrom2'] - 1
        if 'pos2' in kwargs:
            opts['P2'] = kwargs['pos2'] - 1

        iterator = TabixAggregator(
            pairs_path,
            chromsizes,
            bins,
            map=map,
            is_one_based=(not zero_based),
            n_chunks=max_split,
            **opts
        )

        create_cooler(
            cool_path, bins, iterator,
            metadata=metadata,
            assembly=assembly,
            ordered=True)
    finally:
        if nproc > 1:
            pool.close()


@register_subcommand
@add_arg_help
@click.option(
    "--nproc", "-p",
    help="Number of processes to split the work between.",
    type=int,
    default=8,
    show_default=True
)
@click.option(
    "--zero-based", "-0",
    help="Positions are zero-based",
    is_flag=True,
    default=False,
    show_default=True
)
@click.option(
    "--max-split", "-s",
    help="Divide the pairs from each chromosome into at most this many chunks. "
    "Smaller chromosomes will be split less frequently or not at all. "
    "Increase ths value if large chromosomes dominate the workload on "
    "multiple processors.",
    type=int,
    default=2,
    show_default=True
)
def pairix(bins, pairs_path, cool_path, metadata, assembly, nproc, zero_based, max_split):
    """
    Bin a pairix-indexed contact list file.

    {}

    See also: 'cooler csort' to sort and index a contact list file

    Pairix on GitHub: <https://github.com/4dn-dcic/pairix>.

    """
    logger = get_logger(__name__)
    chromsizes, bins = parse_bins(bins)

    if metadata is not None:
        with open(metadata) as f:
            metadata = json.load(f)

    try:
        if nproc > 1:
            pool = Pool(nproc)
            logger.info(f"Using {nproc} cores")
            map = pool.imap

        iterator = PairixAggregator(
            pairs_path,
            chromsizes,
            bins,
            map=map,
            is_one_based=(not zero_based),
            n_chunks=max_split)

        create_cooler(
            cool_path, bins, iterator,
            metadata=metadata,
            assembly=assembly,
            ordered=True)
    finally:
        if nproc > 1:
            pool.close()


@register_subcommand
@add_arg_help
@click.option(
    "--chrom1", "-c1",
    help="chrom1 field number (one-based)",
    type=int,
    required=True
)
@click.option(
    "--pos1", "-p1",
    help="pos1 field number (one-based)",
    type=int,
    required=True
)
@click.option(
    "--chrom2", "-c2",
    help="chrom2 field number (one-based)",
    type=int,
    required=True
)
@click.option(
    "--pos2", "-p2",
    help="pos2 field number (one-based)",
    type=int,
    required=True
)
@click.option(
    "--chunksize",
    help="Number of input lines to load at a time",
    type=int,
    default=int(15e6)
)
@click.option(
    "--zero-based", "-0",
    help="Positions are zero-based",
    is_flag=True,
    default=False,
    show_default=True
)
@click.option(
    "--comment-char",
    type=str,
    default='#',
    show_default=True,
    help="Comment character that indicates lines to ignore."
)
@click.option(
    "--no-symmetric-upper", "-N",
    help="Create a complete square matrix without implicit symmetry. "
    "This allows for distinct upper- and lower-triangle values",
    is_flag=True,
    default=False
)
@click.option(
    "--input-copy-status",
    type=click.Choice(['unique', 'duplex']),
    default='unique',
    help="Copy status of input data when using symmetric-upper storage. | "
    "`unique`: Incoming data comes from a unique half of a symmetric "
    "map, regardless of how the coordinates of a pair are ordered. "
    "`duplex`: Incoming data contains upper- and lower-triangle duplicates. "
    "All input records that map to the lower triangle will be discarded! | "
    "If you wish to treat lower- and upper-triangle input data as "
    "distinct, use the ``--no-symmetric-upper`` option. ",
    show_default=True
)
@click.option(
    "--field",
    help="Specify quantitative input fields to aggregate into value columns "
    "using the syntax ``--field <field-name>=<field-number>``. "
    "Optionally, append ``:`` followed by ``dtype=<dtype>`` to specify "
    "the data type (e.g. float), and/or ``agg=<agg>`` to "
    "specify an aggregation function different from sum (e.g. mean). "
    "Field numbers are 1-based. Passing 'count' as the target name will "
    "override the default behavior of storing pair counts. "
    "Repeat the ``--field`` option for each additional field.",
    type=str,
    multiple=True
)
# @click.option(
#     "--no-count",
#     help="Do not store the pair counts. Use this only if you use `--field` to "
#          "specify at least one input field for aggregation as an alternative.",
#     is_flag=True,
#     default=False)
@click.option(
    "--temp-dir",
    help="Create temporary files in a specified directory. Pass ``-`` to use "
    "the platform default temp dir.",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, allow_dash=True)
)
@click.option(
    "--no-delete-temp",
    help="Do not delete temporary files when finished.",
    is_flag=True,
    default=False
)
@click.option(
    "--max-merge",
    help="Maximum number of chunks to merge before invoking recursive merging",
    type=int,
    default=200,
    show_default=True
)
@click.option(
    "--storage-options",
    help="Options to modify the data filter pipeline. Provide as a "
    "comma-separated list of key-value pairs of the form 'k1=v1,k2=v2,...'. "
    "See http://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline "
    "for more details."
)
@click.option(
    "--append", "-a",
    is_flag=True,
    default=False,
    help="Pass this flag to append the output cooler to an existing file "
         "instead of overwriting the file."
)
# @click.option(
#     "--format", "-f",
#     help="Preset data format.",
#     type=click.Choice(['4DN', 'BEDPE']))
# --sep
def pairs(bins, pairs_path, cool_path, metadata, assembly, chunksize,
          zero_based, comment_char, input_copy_status, no_symmetric_upper,
          field, temp_dir, no_delete_temp, max_merge, storage_options,
          append, **kwargs):
    """
    Bin any text file or stream of pairs.

    Pairs data need not be sorted. Accepts compressed files.
    To pipe input from stdin, set PAIRS_PATH to '-'.

    {}

    """
    chromsizes, bins = parse_bins(bins)

    symmetric_upper = not no_symmetric_upper
    tril_action = None
    if symmetric_upper:
        if input_copy_status == 'unique':
            tril_action = 'reflect'
        elif input_copy_status == 'duplex':
            tril_action = 'drop'

    if metadata is not None:
        with open(metadata) as f:
            metadata = json.load(f)

    input_field_names = [
        'chrom1', 'pos1', 'chrom2', 'pos2',
    ]
    input_field_dtypes = {
        'chrom1': str,
        'pos1': np.int64,
        'chrom2': str,
        'pos2': np.int64,
    }
    input_field_numbers = {}
    for name in ['chrom1', 'pos1', 'chrom2', 'pos2']:
        if kwargs[name] == 0:
            raise click.BadParameter(
                "Field numbers start at 1",
                param_hint=name)
        input_field_numbers[name] = kwargs[name] - 1

    # Include input value columns
    output_field_names = []
    output_field_dtypes = {}
    aggregations = {}
    if len(field):
        for arg in field:
            name, colnum, dtype, agg = parse_field_param(arg)

            # Special cases: these do not have input fields.
            # Omit field number and agg to change standard dtypes.
            if colnum is None:
                if (agg is None and dtype is not None
                        and name in {'bin1_id', 'bin2_id', 'count'}):
                    output_field_dtypes[name] = dtype
                    continue
                else:
                    raise click.BadParameter(
                        "A field number is required.", param_hint=arg)

            if name not in input_field_names:
                input_field_names.append(name)

            if name not in output_field_names:
                output_field_names.append(name)

            input_field_numbers[name] = colnum

            if dtype is not None:
                input_field_dtypes[name] = dtype
                output_field_dtypes[name] = dtype

            if agg is not None:
                aggregations[name] = agg
            else:
                aggregations[name] = 'sum'

    # # Pairs counts are always produced, unless supressed explicitly
    # do_count = not no_count
    # if do_count:
    #     if 'count' not in output_field_names:
    #         output_field_names.append('count')  # default dtype and agg
    # else:
    #     if not len(output_field_names):
    #         click.BadParameter(
    #             "To pass `--no-count`, specify at least one input "
    #             "value-column using `--field`.")
    if 'count' not in output_field_names:
        output_field_names.append('count')

    # Customize the HDF5 filters
    if storage_options is not None:
        h5opts = parse_kv_list_param(storage_options)
        for key in h5opts:
            if isinstance(h5opts[key], list):
                h5opts[key] = tuple(h5opts[key])
    else:
        h5opts = None

    # Initialize the input stream
    # TODO: we could save the header into metadata
    kwargs = {}
    if pairs_path == '-':
        f_in = sys.stdin
        _, f_in = get_header(f_in)
    elif int(_pandas_version[0]) > 0:
        if int(_pandas_version[1]) < 2:
            f_in = get_handle(pairs_path, mode='r', compression='infer')[0]
        else:
            f_in = get_handle(pairs_path, mode='r', compression='infer').handle

        _, f_in = get_header(f_in)
    else:
        f_in = pairs_path
        kwargs['comment'] = '#'


    reader = pd.read_csv(
        f_in,
        sep='\t',
        usecols=[input_field_numbers[name] for name in input_field_names],
        names=input_field_names,
        dtype=input_field_dtypes,
        iterator=True,
        chunksize=chunksize,
        **kwargs
    )

    sanitize = sanitize_records(
        bins,
        schema='pairs',
        decode_chroms=True,
        is_one_based=not zero_based,
        tril_action=tril_action,
        sort=True,
        validate=True
    )
    aggregate = aggregate_records(agg=aggregations, count=True, sort=False)
    pipeline = compose(aggregate, sanitize)

    create_cooler(
        cool_path,
        bins,
        map(pipeline, reader),
        columns=output_field_names,
        dtypes=output_field_dtypes,
        metadata=metadata,
        assembly=assembly,
        mergebuf=chunksize,
        max_merge=max_merge,
        temp_dir=temp_dir,
        delete_temp=not no_delete_temp,
        boundscheck=False,
        triucheck=False,
        dupcheck=False,
        ensure_sorted=False,
        symmetric_upper=symmetric_upper,
        h5opts=h5opts,
        ordered=False,
        mode="a" if append else "w"
    )