File: expand_scores_file.py

package info (click to toggle)
lastz 1.04.52-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 17,604 kB
  • sloc: ansic: 39,808; python: 6,073; makefile: 843; sh: 53
file content (181 lines) | stat: -rwxr-xr-x 4,577 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python
"""
Add scoring-related parameters to a lastz scores file
-----------------------------------------------------

:Author: Bob Harris (rsharris@bx.psu.edu)

Typical input scores file:

	# (a LASTZ scoring set, created by "LASTZ --infer")

	bad_score          = X:-1910 # used for sub[X][*] and sub[*][X]
	fill_score         = -191    # used when sub[*][*] not otherwise defined
	gap_open_penalty   = 400
	gap_extend_penalty = 30

	      A     C     G     T
	A    85  -164   -70  -191
	C  -164   100  -151   -70
	G   -70  -151   100  -164
	T  -191   -70  -164    85
"""

import sys

def usage(s=None):
	message = """
expand_scores_file [options]< scores_file > scores_file
  --overridegaps  ignore gap scores already set 
"""

	if (s == None): sys.exit (message)
	else:           sys.exit ("%s\n%s" % (s,message))


def main():

	overrideGaps = False

	for arg in sys.argv[1:]:
		if (arg in ["--help","-h","--h","-help"]):
			usage()
		elif (arg == "--overridegaps"):
				overrideGaps = True
				continue
		usage ("unrecognized argument: %s" % arg)

	# read the scores file

	lines = []
	numValueLines = None
	valuesFinished = False
	nameToVal = {}
	subs = subRows = subColumns = None

	lineNumber = 0
	for line in sys.stdin:
		lineNumber += 1
		line = line.rstrip()
		lines += [line]
		if (line == ""): continue
		if (line.startswith("#")): continue
		if ("#" in line): line = line.split("#",1)[0].strip()

		if ("=" in line):
			if (valuesFinished):
				raise "in scores file, unexpected assignment (line %d): %s" \
					% (lineNumber,line)
			fields = line.split("=",1)
			name = fields[0].strip()
			val  = fields[1].strip()
			if   (name == "gap_open_penalty"):   name = "O"
			elif (name == "gap_extend_penalty"): name = "E"
			if (name in nameToVal):
				raise "in scores file, %s is assigned twice (line %d): %s" \
					% (name,lineNumber,line)
			if (overrideGaps):
				if (name in ["O","E"]):
					lines.pop()
					continue
			try:
				nameToVal[name] = int_or_float(fields[1])
			except:
				if (name in ["O","E"]):
					raise "in scores file, bad assignment value (line %d): %s" \
						% (lineNumber,line)
		elif (not valuesFinished):
			numValueLines = len(lines) - 1
			valuesFinished = True
			subColumns = line.split()
			subRows    = []
			subs       = {}
		else:
			fields  =  line.split()
			rowCh   =  fields.pop(0)
			subRows += [rowCh]
			if (len(fields) != len(subColumns)):
				raise "in scores file, inconsistent matrix (line %d): %s" \
					% (lineNumber,line)
			for ix in range(len(fields)):
				colCh = subColumns[ix]
				subs[rowCh+colCh] = int_or_float(fields[ix])

	if (subs == None):
		raise "scores file is missing a matrix"

	if ("AA" not in subs):
		raise "scores file lacks A-to-A score"

	# compute a few values from the scores matrix

	bestSub  = float(max([subs[digram] for digram in subs]))
	worstSub = float(min([subs[digram] for digram in subs]))
	aaSub    = float(subs["AA"])

	# add expanded values

	knownVals = [name for name in nameToVal]

	if ("O" not in nameToVal):
		nameToVal["O"] = -int(3.25 * worstSub)
		
	if ("E" not in nameToVal):
		nameToVal["E"] = -int(0.25 * worstSub)
		
	if ("X" not in nameToVal):
		nameToVal["X"] = int(10 * aaSub)
		
	if ("Y" not in nameToVal):
		nameToVal["Y"] = int(nameToVal["O"] + 100*nameToVal["E"])
		
	if ("K" not in nameToVal):
		nameToVal["K"] = int(30 * bestSub)
		
	if ("L" not in nameToVal):
		nameToVal["L"] = int(30 * bestSub)

	if ("T" not in nameToVal) and (worstSub/bestSub < -1.5):
		nameToVal["T"] = "2"

	if ("Z" not in nameToVal) and (worstSub/bestSub < -3.0):
		nameToVal["Z"] = "3"

	# figure out what values we've added, and in what order to print them

	addedNames =  [name for name in ["T","Z","O","E","X","Y","K","L"] \
	                 if    (name in nameToVal) \
	                   and (name not in knownVals)]
	addedNames += [name for name in nameToVal \
	                 if    (name not in addedNames) \
	                   and (name not in knownVals)]

	# print the new scores file

	blankLine = False

	for ix in range(numValueLines):
		print (lines[ix])
		blankLine = (lines[ix] == "")

	if (addedNames != []):
		if (not blankLine): print ("")
		print ("# (score parameters added by expand_scores_file)")
		print ("")

		for name in addedNames:
			print ("%s=%s" % (name,nameToVal[name]))

		blankLine = (lines[numValueLines] == "")
		if (not blankLine): print ("")

	for ix in range(numValueLines,len(lines)):
		print (lines[ix])


def int_or_float(s):
	try:    return int(s)
	except: return float(s)


if __name__ == "__main__": main()