File: expand_scores_file.py

package info (click to toggle)
lastz 1.04.52-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 17,604 kB
sloc: ansic: 39,808; python: 6,073; makefile: 843; sh: 53
file content (181 lines) | stat: -rwxr-xr-x 4,577 bytes
parent folder | download | duplicates (2)
#!/usr/bin/env python
"""
Add scoring-related parameters to a lastz scores file
-----------------------------------------------------

:Author: Bob Harris (rsharris@bx.psu.edu)

Typical input scores file:

	# (a LASTZ scoring set, created by "LASTZ --infer")

	bad_score          = X:-1910 # used for sub[X][*] and sub[*][X]
	fill_score         = -191    # used when sub[*][*] not otherwise defined
	gap_open_penalty   = 400
	gap_extend_penalty = 30

	      A     C     G     T
	A    85  -164   -70  -191
	C  -164   100  -151   -70
	G   -70  -151   100  -164
	T  -191   -70  -164    85
"""

import sys

def usage(s=None):
	message = """
expand_scores_file [options]< scores_file > scores_file
  --overridegaps  ignore gap scores already set 
"""

	if (s == None): sys.exit (message)
	else:           sys.exit ("%s\n%s" % (s,message))


def main():

	overrideGaps = False

	for arg in sys.argv[1:]:
		if (arg in ["--help","-h","--h","-help"]):
			usage()
		elif (arg == "--overridegaps"):
				overrideGaps = True
				continue
		usage ("unrecognized argument: %s" % arg)

	# read the scores file

	lines = []
	numValueLines = None
	valuesFinished = False
	nameToVal = {}
	subs = subRows = subColumns = None

	lineNumber = 0
	for line in sys.stdin:
		lineNumber += 1
		line = line.rstrip()
		lines += [line]
		if (line == ""): continue
		if (line.startswith("#")): continue
		if ("#" in line): line = line.split("#",1)[0].strip()

		if ("=" in line):
			if (valuesFinished):
				raise "in scores file, unexpected assignment (line %d): %s" \
					% (lineNumber,line)
			fields = line.split("=",1)
			name = fields[0].strip()
			val  = fields[1].strip()
			if   (name == "gap_open_penalty"):   name = "O"
			elif (name == "gap_extend_penalty"): name = "E"
			if (name in nameToVal):
				raise "in scores file, %s is assigned twice (line %d): %s" \
					% (name,lineNumber,line)
			if (overrideGaps):
				if (name in ["O","E"]):
					lines.pop()
					continue
			try:
				nameToVal[name] = int_or_float(fields[1])
			except:
				if (name in ["O","E"]):
					raise "in scores file, bad assignment value (line %d): %s" \
						% (lineNumber,line)
		elif (not valuesFinished):
			numValueLines = len(lines) - 1
			valuesFinished = True
			subColumns = line.split()
			subRows    = []
			subs       = {}
		else:
			fields  =  line.split()
			rowCh   =  fields.pop(0)
			subRows += [rowCh]
			if (len(fields) != len(subColumns)):
				raise "in scores file, inconsistent matrix (line %d): %s" \
					% (lineNumber,line)
			for ix in range(len(fields)):
				colCh = subColumns[ix]
				subs[rowCh+colCh] = int_or_float(fields[ix])

	if (subs == None):
		raise "scores file is missing a matrix"

	if ("AA" not in subs):
		raise "scores file lacks A-to-A score"

	# compute a few values from the scores matrix

	bestSub  = float(max([subs[digram] for digram in subs]))
	worstSub = float(min([subs[digram] for digram in subs]))
	aaSub    = float(subs["AA"])

	# add expanded values

	knownVals = [name for name in nameToVal]

	if ("O" not in nameToVal):
		nameToVal["O"] = -int(3.25 * worstSub)
		
	if ("E" not in nameToVal):
		nameToVal["E"] = -int(0.25 * worstSub)
		
	if ("X" not in nameToVal):
		nameToVal["X"] = int(10 * aaSub)
		
	if ("Y" not in nameToVal):
		nameToVal["Y"] = int(nameToVal["O"] + 100*nameToVal["E"])
		
	if ("K" not in nameToVal):
		nameToVal["K"] = int(30 * bestSub)
		
	if ("L" not in nameToVal):
		nameToVal["L"] = int(30 * bestSub)

	if ("T" not in nameToVal) and (worstSub/bestSub < -1.5):
		nameToVal["T"] = "2"

	if ("Z" not in nameToVal) and (worstSub/bestSub < -3.0):
		nameToVal["Z"] = "3"

	# figure out what values we've added, and in what order to print them

	addedNames =  [name for name in ["T","Z","O","E","X","Y","K","L"] \
	                 if    (name in nameToVal) \
	                   and (name not in knownVals)]
	addedNames += [name for name in nameToVal \
	                 if    (name not in addedNames) \
	                   and (name not in knownVals)]

	# print the new scores file

	blankLine = False

	for ix in range(numValueLines):
		print (lines[ix])
		blankLine = (lines[ix] == "")

	if (addedNames != []):
		if (not blankLine): print ("")
		print ("# (score parameters added by expand_scores_file)")
		print ("")

		for name in addedNames:
			print ("%s=%s" % (name,nameToVal[name]))

		blankLine = (lines[numValueLines] == "")
		if (not blankLine): print ("")

	for ix in range(numValueLines,len(lines)):
		print (lines[ix])


def int_or_float(s):
	try:    return int(s)
	except: return float(s)


if __name__ == "__main__": main()