1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
#!/usr/bin/python3
"""
Application to convert AXT file to MAF file. Reads an AXT file from standard
input and writes a MAF file to standard out; some statistics are written to
standard error.
axt_to_maf primary:lengths_file secondary:lengths_file < axt_file > maf_file
--silent: prevents stats report
Lengths files provide the length of each chromosome (maf format needs this
information but axt file does not contain it). The format is a series of
lines of the form:
<chromosome name> <length>
The chromosome field in each axt block must match some <chromosome name> in
the lengths file.
"""
__author__ = "Bob Harris (rsharris@bx.psu.edu)"
import copy
import sys
import bx.align.axt
import bx.align.maf
def usage(s=None):
message = __doc__
if s is None:
sys.exit(message)
else:
sys.exit(f"{s}\n{message}")
def main():
global debug
##########
# parse the command line
##########
primary = None
secondary = None
silent = False
# pick off options
args = sys.argv[1:]
while len(args) > 0:
arg = args.pop(0)
val = None
fields = arg.split("=", 1)
if len(fields) == 2:
arg = fields[0]
val = fields[1]
if val == "":
usage(f"missing a value in {arg}=")
if (arg == "--silent") and (val is None):
silent = True
elif (primary is None) and (val is None):
primary = arg
elif (secondary is None) and (val is None):
secondary = arg
else:
usage(f"unknown argument: {arg}")
if primary is None:
usage("missing primary species")
if secondary is None:
usage("missing secondary species")
fields = primary.split(":")
if len(fields) != 2:
usage("bad primary species (must be species:lengths_file")
primary = fields[0]
primaryLengths = fields[1]
fields = secondary.split(":")
if len(fields) != 2:
usage("bad secondary species (must be species:lengths_file")
secondary = fields[0]
secondaryLengths = fields[1]
##########
# read the lengths
##########
speciesToLengths = {}
speciesToLengths[primary] = read_lengths(primaryLengths)
speciesToLengths[secondary] = read_lengths(secondaryLengths)
##########
# read the alignments
##########
out = bx.align.maf.Writer(sys.stdout)
axtsRead = 0
axtsWritten = 0
for axtBlock in bx.align.axt.Reader(
sys.stdin, species_to_lengths=speciesToLengths, species1=primary, species2=secondary
):
axtsRead += 1
p = axtBlock.get_component_by_src_start(primary)
if p is None:
continue
s = axtBlock.get_component_by_src_start(secondary)
if s is None:
continue
mafBlock = bx.align.Alignment(axtBlock.score, axtBlock.attributes)
mafBlock.add_component(clone_component(p))
mafBlock.add_component(clone_component(s))
out.write(mafBlock)
axtsWritten += 1
if not silent:
sys.stderr.write("%d blocks read, %d written\n" % (axtsRead, axtsWritten))
def clone_component(c):
return bx.align.Component(c.src, c.start, c.size, c.strand, c.src_size, copy.copy(c.text))
def read_lengths(fileName):
chromToLength = {}
f = open(fileName)
for lineNumber, line in enumerate(f):
line = line.strip()
if line == "":
continue
if line.startswith("#"):
continue
fields = line.split()
if len(fields) != 2:
raise ValueError("bad lengths line (%s:%d): %s" % (fileName, lineNumber, line))
chrom = fields[0]
try:
length = int(fields[1])
except ValueError:
raise ValueError("bad lengths line (%s:%d): %s" % (fileName, lineNumber, line))
if chrom in chromToLength:
raise ValueError("%s appears more than once (%s:%d): %s" % (chrom, fileName, lineNumber, line))
chromToLength[chrom] = length
f.close()
return chromToLength
if __name__ == "__main__":
main()
|