"""
Concatenate sets of intervals.

Preserves the format of the first input -- it is possible to concatenate two
files that have different column orders. Of course, the meta-data of the
second will be lost (and filled with a "."). If all of the files
(GenomicIntervalReaders) are the same format, sameformat=True will preserve
all columns of the first input, cut extra columns on subsequent inputs, and
pad missing columns. If sameformat=False then extra columns are filled with
".".
"""
from bx.intervals.io import GenomicInterval
from bx.tabular.io import (
Comment,
Header,
)
def concat(readers, comments=True, header=True, sameformat=True):
    """
    Yield the records of several interval readers as a single stream.

    The chrom/start/end/strand column positions are taken from the first
    reader, and the output width (``nfields``) is taken from the first
    interval encountered.  Every yielded interval is a copy of the input
    interval, so the objects produced by the readers are not modified.

    :param readers: indexable sequence of iterables.  The first one must
        expose ``chrom_col``, ``start_col``, ``end_col`` and ``strand_col``;
        each is iterated for ``GenomicInterval``, ``Header`` and ``Comment``
        records.  An empty sequence produces an empty stream.
    :param comments: when true, ``Comment`` records are passed through.
    :param header: when true, ``Header`` records are passed through.
    :param sameformat: when true, intervals from every reader keep their own
        columns and are only trimmed or padded (with ``"."``) to the output
        width.  When false, that treatment applies only to the first reader
        that produced intervals; intervals from later readers are rebuilt
        as a row of ``"."`` with only chrom, start, end and (if it fits)
        strand filled in at the first reader's column positions.
    :returns: generator over interval copies, headers and comments.
    """
    # Nothing to concatenate; avoids an IndexError on readers[0].
    if not readers:
        return

    # Save the column layout of the first input.
    first_reader = readers[0]
    chrom_col = first_reader.chrom_col
    start_col = first_reader.start_col
    end_col = first_reader.end_col
    strand_col = first_reader.strand_col

    nfields = None
    firstdataset = True
    output = False
    for intervals in readers:
        for interval in intervals:
            if isinstance(interval, GenomicInterval):
                if nfields is None:
                    # The first interval seen fixes the output width.
                    nfields = interval.nfields
                out_interval = interval.copy()
                if sameformat or firstdataset:
                    # Keep the interval's own columns, but trim or pad the
                    # row so every output line has exactly nfields columns.
                    current = len(out_interval.fields)
                    if current > nfields:
                        out_interval.fields = out_interval.fields[:nfields]
                    elif current < nfields:
                        out_interval.fields.extend(["."] * (nfields - current))
                    output = True
                else:
                    # Different layout: start from an all-"." row and fill in
                    # only the coordinates at the first input's positions.
                    rebuilt = ["."] * nfields
                    rebuilt[chrom_col] = out_interval.chrom
                    rebuilt[start_col] = str(out_interval.start)
                    rebuilt[end_col] = str(out_interval.end)
                    # Strand is optional, might not exist in output.
                    # NOTE(review): the None check is defensive -- confirm
                    # whether readers can actually carry strand_col=None.
                    if strand_col is not None and strand_col < nfields:
                        rebuilt[strand_col] = out_interval.strand
                    out_interval.fields = rebuilt
                yield out_interval
            elif isinstance(interval, Header) and header:
                yield interval
            elif isinstance(interval, Comment) and comments:
                yield interval
        # Once a reader has produced at least one interval, all following
        # readers are treated as "subsequent" inputs.
        if output:
            firstdataset = False
|