1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
#!/usr/bin/python3
"""
Application to convert AXT file to LAV file. Reads an AXT file from standard
input and writes a LAV file to standard out; some statistics are written to
standard error.
usage: %prog primary_spec secondary_spec [--silent] < axt_file > lav_file
Each spec is of the form seq_file[:species_name]:lengths_file.
- seq_file should be a format string for the file names for the individual
sequences, with %s to be replaced by the alignment's src field. For
example, "hg18/%s.nib" would prescribe files named "hg18/chr1.nib",
"hg18/chr2.nib", etc.
- species_name is optional. If present, it is prepended to the alignment's
src field.
- Lengths files provide the length of each chromosome (lav format needs this
information but axt file does not contain it). The format is a series of
lines of the form:
<chromosome name> <length>
The chromosome field in each axt block must match some <chromosome name> in
the lengths file.
"""
__author__ = "Bob Harris (rsharris@bx.psu.edu)"
import sys
import bx.align.axt
import bx.align.lav
def usage(s=None):
message = __doc__
if s is None:
sys.exit(message)
else:
sys.exit(f"{s}\n{message}")
def main():
global debug
primary = None
secondary = None
silent = False
# pick off options
args = sys.argv[1:]
while len(args) > 0:
arg = args.pop(0)
val = None
fields = arg.split("=", 1)
if len(fields) == 2:
arg = fields[0]
val = fields[1]
if val == "":
usage(f"missing a value in {arg}=")
if arg == "--silent" and val is None:
silent = True
elif primary is None and val is None:
primary = arg
elif secondary is None and val is None:
secondary = arg
else:
usage(f"unknown argument: {arg}")
if primary is None:
usage("missing primary file name and length")
if secondary is None:
usage("missing secondary file name and length")
try:
(primaryFile, primary, primaryLengths) = parse_spec(primary)
except Exception:
usage("bad primary spec (must be seq_file[:species_name]:lengths_file")
try:
(secondaryFile, secondary, secondaryLengths) = parse_spec(secondary)
except Exception:
usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")
# read the lengths
speciesToLengths = {}
speciesToLengths[primary] = read_lengths(primaryLengths)
speciesToLengths[secondary] = read_lengths(secondaryLengths)
# read the alignments
out = bx.align.lav.Writer(sys.stdout, attributes={"name_format_1": primaryFile, "name_format_2": secondaryFile})
axtsRead = 0
axtsWritten = 0
for axtBlock in bx.align.axt.Reader(
sys.stdin, species_to_lengths=speciesToLengths, species1=primary, species2=secondary, support_ids=True
):
axtsRead += 1
out.write(axtBlock)
axtsWritten += 1
out.close()
if not silent:
sys.stderr.write("%d blocks read, %d written\n" % (axtsRead, axtsWritten))
def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
fields = spec.split(":")
if len(fields) == 2:
return (fields[0], "", fields[1])
elif len(fields) == 3:
return (fields[0], fields[1], fields[2])
else:
raise ValueError
def read_lengths(fileName):
chromToLength = {}
f = open(fileName)
for lineNumber, line in enumerate(f):
line = line.strip()
if line == "":
continue
if line.startswith("#"):
continue
fields = line.split()
if len(fields) != 2:
raise ValueError("bad lengths line (%s:%d): %s" % (fileName, lineNumber, line))
chrom = fields[0]
try:
length = int(fields[1])
except ValueError:
raise ValueError("bad lengths line (%s:%d): %s" % (fileName, lineNumber, line))
if chrom in chromToLength:
raise ValueError("%s appears more than once (%s:%d): %s" % (chrom, fileName, lineNumber, line))
chromToLength[chrom] = length
f.close()
return chromToLength
if __name__ == "__main__":
main()
|