1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
|
#! /usr/bin/env python
# Copyright 2010 Martin C. Frith
# Read MAF-format alignments: write them in other formats.
import sys, os, fileinput, optparse, itertools, signal
##### General-purpose routines: #####
def flatten(listOfLists): return sum(listOfLists, [])
def valueFromMaf(mafLines, keyword):
for word in mafLines[0]:
if word.startswith(keyword + '='):
return word.split('=')[1]
return ''
def scoreFromMaf(mafLines):
score = valueFromMaf(mafLines, "score")
if not score: raise Exception("encountered an alignment without a score")
return score
##### Routines for converting to AXT format: #####
def axtDataFromMafLine(sLine):
chr = sLine[1]
mafBeg = int(sLine[2])
mafLen = int(sLine[3])
axtBeg = str(mafBeg + 1) # convert from 0-based to 1-based coordinate
axtEnd = str(mafBeg + mafLen)
strand = sLine[4]
return [chr, axtBeg, axtEnd, strand]
axtCounter = itertools.count()
def writeAxt(maf):
count = str(axtCounter.next())
score = scoreFromMaf(maf)
sLines = [i for i in maf if i[0] == "s"]
if not sLines: raise Exception("empty alignment")
sequences = [i[6] for i in sLines]
axtData = map(axtDataFromMafLine, sLines)
dataHead, dataTail = axtData[0], axtData[1:]
if dataHead[3] != "+":
raise Exception("for AXT, the 1st strand in each alignment must be +")
print " ".join([count] + dataHead[:3] + flatten(dataTail) + [score])
for i in sequences: print i
print # print a blank line at the end
##### Routines for converting to tabular format: #####
def gapString(gap1, gap2):
return str(gap1) + ":" + str(gap2)
def symbolSize(symbol, letterSize):
if symbol == "-": return 0
elif symbol == "\\": return 1
elif symbol == "/": return -1
else: return letterSize
def matchAndGapSizes(sequence1, sequence2, letterSize1, letterSize2):
if len(sequence1) != len(sequence2):
raise Exception('aligned sequences have different lengths')
matchSize = 0
gap1 = gap2 = 0
for i, j in zip(sequence1, sequence2):
if '-' in (i, j):
if matchSize != 0:
yield str(matchSize)
matchSize = 0
gap1 += symbolSize(i, letterSize1)
gap2 += symbolSize(j, letterSize2)
else:
if gap1 != 0 or gap2 != 0:
yield gapString(gap1, gap2)
gap1 = gap2 = 0
matchSize += 1 # this is correct for translated alignments too
if matchSize != 0:
yield str(matchSize)
elif gap1 != 0 or gap2 != 0:
yield gapString(gap1, gap2)
def gapSizePerLetter(sLine):
sequence = sLine[6]
if "/" in sequence or "\\" in sequence: return 3
gaps = sequence.count("-")
nonGaps = len(sequence) - gaps
mafLen = int(sLine[3])
if mafLen == nonGaps * 3: return 3
return 1
def writeTab(maf):
score = scoreFromMaf(maf)
expect = valueFromMaf(maf, "expect")
sLines = [i for i in maf if i[0] == "s"]
if len(sLines) != 2: raise Exception("pairwise alignments only, please")
outWords = [i[1:6] for i in sLines]
sequences = [i[6] for i in sLines]
sizes = map(gapSizePerLetter, sLines)
gapInfo = matchAndGapSizes(sequences[0], sequences[1], sizes[0], sizes[1])
gapString = ','.join(gapInfo)
fields = [score] + flatten(outWords) + [gapString]
if expect: fields += [expect]
print "\t".join(fields)
##### Routines for reading MAF format: #####
def mafInput(lines):
maf = []
for line in lines:
if line.startswith("#"):
print line,
elif line.isspace():
if maf: yield maf
maf = []
else:
maf.append(line.split())
if maf: yield maf
def formatName(formatString):
s = formatString.lower()
if "axt".startswith(s): return "axt"
elif "tabular".startswith(s): return "tab"
else: raise Exception("unknown format: " + formatString)
def mafConvert(args):
format = formatName(args[0])
for maf in mafInput(fileinput.input(args[1])):
if format == "axt": writeAxt(maf)
else: writeTab(maf)
if __name__ == "__main__":
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
usage = """
%prog axt my-alignments.maf
%prog tab my-alignments.maf"""
op = optparse.OptionParser(usage=usage)
(opts, args) = op.parse_args()
if len(args) != 2: op.error("I need a format-name and a file-name")
try: mafConvert(args)
except KeyboardInterrupt: pass # avoid silly error message
except Exception, e:
prog = os.path.basename(sys.argv[0])
sys.exit(prog + ": error: " + str(e))
|