File: dna.py

package info (click to toggle)
python-sqt 0.8.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 824 kB
  • sloc: python: 5,964; sh: 38; makefile: 10
file content (156 lines) | stat: -rw-r--r-- 3,993 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
"""
- Python 2 and 3 compatible fast reverse complement
- definition of genetic code (triplets to amino acids)
- fast translation of nucleotide strings to amino acids
- other misc. functions
"""
import sys
import re
import random
from ._codons import GENETIC_CODE  # for re-export
from sqt._helpers import nt_to_aa


if sys.version < '3':
	from string import maketrans
else:
	maketrans = bytes.maketrans
	_TR_STR = str.maketrans('ACGTUMRWSYKVHDBNacgtumrwsykvhdbn', 'TGCAAKYWSRMBDHVNtgcaakywsrmbdhvn')

_TR = maketrans(b'ACGTUMRWSYKVHDBNacgtumrwsykvhdbn', b'TGCAAKYWSRMBDHVNtgcaakywsrmbdhvn')

if sys.version < '3':
	def reverse_complement(s):
		return s.translate(_TR)[::-1]
else:
	def reverse_complement(s):
		if isinstance(s, str):
			return s.translate(_TR_STR)[::-1]
		else:
			return s.translate(_TR)[::-1]


AMINO_ACID_REGEXES = dict(
	A='GC[ACGT]',
	C='TG[CT]',
	D='GA[CT]',
	E='GA[AG]',
	F='TT[CT]',
	G='GG[ACGT]',
	H='CA[CT]',
	I='AT[ACT]',
	K='AA[AG]',
	L='(CT[ACGT]|TT[AG])',
	M='ATG',
	N='AA[CT]',
	P='CC[ACGT]',
	Q='CA[AG]',
	R='(AG[AG]|CG[ACGT])',
	S='AG[CT]|TC[ACGT]',
	T='AC[ACGT]',
	V='GT[ACGT]',
	W='TGG',
	Y='TA[CT]',
	X='[ACGT][ACGT][ACGT]'
)


def amino_acid_regex(aa_sequence, compile=False):
	"""
	Given an amino acid sequence, return a regular expression that can be used
	to match a nucleotide sequence. If compile is True, the regular expression
	is compiled with re.compile, otherwise the regex is returned as a string.
	"""
	regex = ''.join(AMINO_ACID_REGEXES[aa] for aa in aa_sequence)
	return re.compile(regex) if compile else regex


def n_intervals(sequence, N='N'):
	"""
	Given a sequence, yield all intervals containing only N characters as
	tuples (start, stop). If the sequence is a bytes/bytearray object,
	set N=ord(b'N')

	>>> list(n_intervals('ACGTnNAC'))
	[(4, 6)]
	>>> list(n_intervals(b'ACGTnNAC', N=ord(b'N')))
	[(4, 6)]
	"""
	sequence = sequence.upper()
	start = sequence.find(N)
	while start >= 0:
		stop = start + 1
		while stop < len(sequence) and sequence[stop] == N:
			stop += 1
		yield (start, stop)
		start = sequence.find(N, stop)


def intervals_complement(intervals, length):
	"""
	Given an iterable of sorted, nonoverlapping intervals as (start, stop)
	pairs, yield the complementary intervals. The result is equivalent to
	[(0, length)] minus the given intervals.

	>>> list(intervals_complement([(1, 2), (4, 6)], length=10))
	[(0, 1), (2, 4), (6, 10)]
	"""
	prev_stop = 0
	for start, stop in intervals:
		if start >= length:
			break
		if prev_stop != start:
			yield (prev_stop, start)
		prev_stop = stop
	if prev_stop < length:
		yield (prev_stop, length)


def mutate(seq, substitution_rate=0.1, indel_rate=0.01, alphabet='ACGT', counts=False):
	"""
	Mutate a DNA sequence by randomly introducing substitutions and indels.
	This does not use a very sophisticated model.

	Return the mutated sequence.

	substitution_rate -- The probability at which an individual base is substituted with a different one.

	indel_rate -- At any position, a random base is inserted at probability indel_rate/2, and
		the base is deleted at probability indel_rate/2.


	counts -- If this is True, a triple (mutated_sequence, number_of_substitutions, number_of_indels)
		is returned instead of just the sequence.
	"""
	other_chars = {}
	for c in alphabet:
		other_chars[c] = alphabet.replace(c, '')
	mutated = []
	n_sub = 0
	n_indel = 0
	indels = indel_rate > 0
	for c in seq:
		c = c.upper()
		r = random.random()
		if r < substitution_rate:
			# mutate base
			d = random.choice(other_chars.get(c, alphabet))
			mutated.append(d)
			n_sub += 1
		elif indels and r < (substitution_rate + 0.5 * indel_rate):
			# insertion
			mutated.append(random.choice(alphabet))
			mutated.append(c)
			n_indel += 1
		elif indels and r < (substitution_rate + indel_rate):
			n_indel += 1
			# deletion
			pass
		else:
			# no change
			mutated.append(c)
	if counts:
		return ''.join(mutated), n_sub, n_indel
	else:
		return ''.join(mutated)