File: addadapt.py

package info (click to toggle)
python-sqt 0.8.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 824 kB
  • sloc: python: 5,964; sh: 38; makefile: 10
file content (82 lines) | stat: -rw-r--r-- 2,423 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python3
"""
Read in reads in FASTA format, insert an adapter sequence at a random
position, shorten the read to its original length, and write it out.

An annotation is added to the read description that describes the
adapter position, such as "adapterpos=17". The position is 1-based.
"""
import sys
from random import randint, random, seed, choice

from sqt import HelpfulArgumentParser
from sqt.io.fasta import FastaReader

__author__ = "Marcel Martin"

#adapter = 'GCCTAACTTCTTAGACTGCCTTAAGGACGT'

def mutate_sequence(seq, rate=0.1, alphabet='ACGT', indels=False, indel_rate=0.1):
	"""
	If indels is True, the mutation_rate is equally split up between insertions,
	deletions and substitutions.
	"""
	mutated = []
	charpos = dict( (c,i) for (i,c) in enumerate(alphabet))
	for c in seq:
		c = c.upper()
		r = random()
		if indels and r < 0.5 * rate * indel_rate:
			# insertion
			mutated.append(choice(alphabet))
			mutated.append(c)
		elif indels and r < rate * indel_rate:
			# deletion
			pass
		elif r < rate:
			# mutate base
			d = alphabet[(charpos[c] + randint(1, len(alphabet)-1)) % len(alphabet)]
			assert d != c
			mutated.append(d)
		else:
			# no change
			mutated.append(c)
	return ''.join(mutated)


def main():
	parser = HelpfulArgumentParser(description=__doc__)
	arg = parser.add_argument
	arg("--seed", type=int, default=None, help="seed for random number generator")
	arg("--erate", type=float, default=None,
		help="error rate for simulated errors, no indels (default: no errors)")
	arg("--adapter", "-a", default='GCCTAACTTCTTAGACTGCCTTAAGGACGT',
		help="Adapter sequence")
	arg("--probability", "-p", default=0.5,
		help="Fraction of reads (approximate) that should contain the adapter")
	arg("fasta", help="Input FASTA file. Use '-' for standard input.")

	args = parser.parse_args()
	if args.seed is not None:
		seed(args.seed)
	adapter = args.adapter
	adapter_prob = args.probability
	with FastaReader(args.fasta) as reader:
		for read in reader:
			if random() < adapter_prob:
				if args.erate is not None:
					a = mutate_sequence(adapter, args.erate, indels=False)
				else:
					a = adapter
				l = len(read)
				pos = randint(0, l-1)
				seq = read.sequence
				seq = seq[:pos] + a + seq[pos:]
				read.sequence = seq[:l]
				read.name = read.name + ' adapterpos={}'.format(pos+1)
			print('>{}\n{}'.format(read.name, read.sequence))



if __name__ == '__main__':
	main()