File: align.py

package info (click to toggle)
python-sqt 0.8.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 824 kB
  • sloc: python: 5,964; sh: 38; makefile: 10
file content (106 lines) | stat: -rw-r--r-- 3,535 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
"""
Compare two strings
"""
from sqt import HelpfulArgumentParser
import sys
import os
from collections import namedtuple
from cutadapt import align
from sqt import FastaReader
from sqt.align import globalalign, GlobalAlignment
from sqt.dna import reverse_complement

__author__ = "Marcel Martin"


Alignment = namedtuple('Alignment', 'start1 stop1 start2 stop2 matches errors')

def add_arguments(parser):
	arg = parser.add_argument
	arg('--semiglobal', '--overlap', action='store_true', default=False,
		help='Run a semi-global alignment (for detecting overlaps). '
		'Default: global alignment.')
	arg('--max-error-rate', '-e', type=float, default=None,
		help='Switch to cutadapt algorithm (also enables --semiglobal). '
		'No alignment will be printed.')
	arg('--reverse-complement', '--rc', action='store_true',
		default=False,
		help='Run the alignment also with the reverse-complement of the second '
		'sequence')
	arg('--merge', action='store_true', default=False,
		help='Output a merged sequence (also enables --semiglobal)')
	arg('sequence1', help='Sequence or path to FASTA file. If FASTA, only the first sequence is used.')
	arg('sequence2', help='Sequence or path to FASTA file. If FASTA, only the first sequence is used.')


def print_numbers(sequence1, sequence2, alignment, overlap):
	print()
	print('Length of sequence 1:', len(sequence1))
	print('Length of sequence 2:', len(sequence2))
	print('Errors in alignment:', alignment.errors)
	if overlap:
		print('Length of overlap in sequence 1:', alignment.stop1 - alignment.start1)
		print('Length of overlap in sequence 2:', alignment.stop2 - alignment.start2)
	if hasattr(alignment, 'matches'):
		print('Matches:', alignment.matches)


def load_sequence(path_or_sequence):
	if os.path.exists(path_or_sequence):
		with FastaReader(path_or_sequence) as fr:
			sequence = next(iter(fr)).sequence
	else:
		sequence = path_or_sequence
	return sequence


def main(args=None):
	if args is None:
		parser = HelpfulArgumentParser(description=__doc__)
		add_arguments(parser)
		args = parser.parse_args()
	sequence1 = load_sequence(args.sequence1)
	sequence2 = load_sequence(args.sequence2)

	sequences = [ (False, sequence2) ]
	if args.reverse_complement:
		sequences.append((True, reverse_complement(sequence2)))

	if args.merge:
		args.semiglobal = True

	# credit: http://stackoverflow.com/questions/566746/
	rows, columns = os.popen('stty size', 'r').read().split()
	columns = int(columns)

	for revcomp, sequence2 in sequences:
		if revcomp:
			print('Alignment with reverse-complemented sequence 2:')
		else:
			print('Alignment:')
		print()
		if args.max_error_rate is not None:
			flags = align.SEMIGLOBAL
			result = align.locate(sequence1.upper(), sequence2.upper(), max_error_rate=args.max_error_rate, flags=flags)
			if result is None:
				print('No alignment found')
				continue
			alignment = Alignment(*result)
			print_numbers(sequence1, sequence2, alignment, overlap=True)
		else:
			alignment = GlobalAlignment(sequence1.upper(), sequence2.upper(), semiglobal=args.semiglobal)
			alignment.print(width=columns, gap_char='-')
			print_numbers(sequence1, sequence2, alignment, args.semiglobal)

		if args.merge:
			merged = sequence1[0:alignment.start1]
			merged += sequence2[0:alignment.start2]
			merged += sequence1[alignment.start1:alignment.stop1]
			merged += sequence1[alignment.stop1:]
			merged += sequence2[alignment.stop2:]
			print('Merged (length {}):'.format(len(merged)), merged)


if __name__ == '__main__':
	main()