File: digest.py

package info (click to toggle)
python-cooler 0.9.1-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 32,596 kB
  • sloc: python: 10,555; makefile: 198; sh: 31
file content (88 lines) | stat: -rw-r--r-- 2,058 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import glob
import os.path as op
import sys

import click

from .. import util
from . import cli
from ._util import exit_on_broken_pipe


@cli.command()
@click.argument(
    "chromsizes",
    metavar="CHROMSIZES_PATH"
)
@click.argument(
    "fasta",
    metavar="FASTA_PATH"
)
@click.argument(
    "enzyme",
    metavar="ENZYME"
)
@click.option(
    "--out", "-o",
    help="Output file (defaults to stdout)"
)
@click.option(
    "--header",
    "-H",
    help="Print the header of column names as the first row.",
    is_flag=True,
    default=False,
    show_default=True,
)
@click.option(
    "--rel-ids",
    "-i",
    type=click.Choice(["0", "1"]),
    help="Include a column of relative bin IDs for each chromosome. "
    "Choose whether to report them as 0- or 1-based.",
)
@exit_on_broken_pipe(1)
def digest(chromsizes, fasta, enzyme, out, header, rel_ids):
    """
    Generate fragment-delimited genomic bins.

    Output a genome segmentation of restriction fragments as a BED file.

    CHROMSIZES_PATH : UCSC-like chromsizes file, with chromosomes in desired
    order.

    FASTA_PATH : Genome assembly FASTA file or folder containing FASTA files
    (uncompressed).

    ENZYME : Name of restriction enzyme

    """
    chromsizes = util.read_chromsizes(chromsizes, all_names=True)
    chroms = list(chromsizes.keys())

    # Load sequences
    if op.isdir(fasta):
        filepaths = glob.glob(op.join(fasta, "*.fa"))
        filepaths.extend(glob.glob(op.join(fasta, "*.fasta")))
    else:
        filepaths = [fasta]
    fasta_records = util.load_fasta(chroms, *filepaths)

    # Digest sequences
    frags = util.digest(fasta_records, enzyme)

    if rel_ids is not None:
        frags["id"] = frags.groupby("chrom").cumcount()
        if int(rel_ids) == 1:
            frags["id"] += 1

    # Write output
    if out is None:
        f = sys.stdout
    else:
        f = open(out, "w")

    if header:
        frags[0:0].to_csv(f, sep="\t", index=False, header=True)
    frags.to_csv(f, sep="\t", index=False, header=False)
    f.flush()