File: concat.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (61 lines) | stat: -rw-r--r-- 2,602 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
Concatenate sets of intervals.

Preserves format of the first input -- it is possible to concat two files that
have different column orders. Of course, the meta-data of the second will be
lost (and filled with a "."). If all of the files (GenomicIntervalReaders) are
the same format, sameformat=True will preserve all columns of the first input,
cuts extra columns on subsequent input, and pads missing columns. If
sameformat=False then extra columns are filled with ".".
"""

from bx.intervals.io import GenomicInterval
from bx.tabular.io import (
    Comment,
    Header,
)


def concat(readers, comments=True, header=True, sameformat=True):
    """Yield the records of several interval readers as a single stream.

    The column layout (chrom/start/end/strand positions) and the number of
    fields are taken from the first reader and the first interval seen; every
    yielded interval is a copy, so the input records are not modified.

    :param readers: indexable sequence of iterables producing
        ``GenomicInterval``, ``Header`` and ``Comment`` records. The first
        element must expose ``chrom_col``, ``start_col``, ``end_col`` and
        ``strand_col``. An empty sequence yields nothing.
    :param comments: when true, ``Comment`` records are passed through.
    :param header: when true, ``Header`` records are passed through.
    :param sameformat: when true, intervals from every reader keep their own
        columns and are only trimmed or padded with ``"."`` to the field count
        of the first interval. When false, intervals from readers after the
        first one that produced output are rebuilt in the first reader's
        layout: chrom, start, end (and strand, if it fits) are placed in the
        first reader's columns and every other field is ``"."``.
    :returns: generator over the selected records; records that are neither
        intervals, headers nor comments are dropped.
    """
    if not readers:
        # Nothing to concatenate; avoids an IndexError on readers[0].
        return

    # Save columns from the first input
    first_reader = readers[0]
    chrom_col = first_reader.chrom_col
    start_col = first_reader.start_col
    end_col = first_reader.end_col
    strand_col = first_reader.strand_col

    nfields = None
    firstdataset = True
    output = False
    for intervals in readers:
        for interval in intervals:
            if isinstance(interval, GenomicInterval):
                if nfields is None:
                    # The first interval seen fixes the output width.
                    nfields = interval.nfields
                out_interval = interval.copy()
                if sameformat or firstdataset:
                    # everything except the first input has to be
                    # trimmed or padded to match the first input
                    current = len(out_interval.fields)
                    if current > nfields:
                        out_interval.fields = out_interval.fields[:nfields]
                    elif current < nfields:
                        # Padding must be independent of trimming: a short
                        # row never satisfies the "too long" test above.
                        out_interval.fields.extend(["."] * (nfields - current))
                    output = True
                    yield out_interval
                else:
                    # Read the coordinates before the fields are replaced.
                    chrom = out_interval.chrom
                    start = out_interval.start
                    end = out_interval.end
                    strand = out_interval.strand
                    out_interval.fields = ["."] * nfields
                    out_interval.fields[chrom_col] = chrom
                    out_interval.fields[start_col] = str(start)
                    out_interval.fields[end_col] = str(end)
                    # Strand is optional, might not exist in output
                    if strand_col is not None and strand_col < nfields:
                        out_interval.fields[strand_col] = strand
                    yield out_interval
            elif isinstance(interval, Header) and header:
                yield interval
            elif isinstance(interval, Comment) and comments:
                yield interval
        # Only stop treating inputs as "first" once one has produced an
        # interval, so leading empty readers do not consume that role.
        if output and firstdataset:
            firstdataset = False