File: test_fasta.py

package info (click to toggle)
pyfastx 2.2.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,336 kB
  • sloc: ansic: 4,820; python: 1,817; sh: 505; perl: 66; makefile: 31
file content (327 lines) | stat: -rw-r--r-- 8,859 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
import os
import random
import pyfastx
import pyfaidx
import unittest

# Shortcut used by the tests below to assemble paths.
join = os.path.join

# Directory holding this test module; every other path is derived from it.
_test_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = join(_test_dir, '..')
data_dir = join(_test_dir, 'data')

# FASTA fixtures (gzip-compressed DNA, plain DNA, RNA, protein) inside data_dir.
gzip_fasta, flat_fasta, rna_fasta, protein_fasta = (
	join(data_dir, fixture)
	for fixture in ('test.fa.gz', 'test.fa', 'rna.fa', 'protein.fa')
)


class FastaTest(unittest.TestCase):
	def setUp(self):
		# Per-test fixtures: pyfastx readers over the gzip-compressed and the
		# plain FASTA file, a pyfaidx reader over the plain file (the tests
		# compare pyfastx results against it), and the record count.
		gzip_reader = pyfastx.Fasta(gzip_fasta)
		self.fastx = gzip_reader

		self.fasta = pyfastx.Fasta(flat_fasta)

		# pyfaidx is asked to return upper-cased sequences
		self.faidx = pyfaidx.Fasta(flat_fasta, sequence_always_upper=True)

		# number of records, used by get_random_index()
		self.count = len(gzip_reader)

	def tearDown(self):
		# Release the readers and delete every index file a test may have
		# left next to the fixtures.

		#fix permission error on Windows: drop the readers before removing files
		del self.fastx
		del self.fasta
		del self.faidx

		# same files, same order as they were created/checked before
		index_files = (
			'{}.fxi'.format(gzip_fasta),
			'{}.fxi'.format(flat_fasta),
			'{}.fai'.format(flat_fasta),
			'{}.fxi'.format(rna_fasta),
			'{}.fxi'.format(protein_fasta),
		)

		for index_file in index_files:
			if os.path.exists(index_file):
				os.remove(index_file)

	def get_random_index(self):
		return random.randint(0, self.count-1)

	def test_module(self):
		# Module-level helpers: gzip detection, version string, reverse complement.

		# gzip check test
		self.assertEqual(pyfastx.gzip_check(gzip_fasta), self.fastx.is_gzip)

		# version test: the third whitespace-separated token of src/version.h,
		# with surrounding double quotes removed, must match pyfastx.version()
		version_header = join(root_dir, 'src', 'version.h')
		with open(version_header) as header:
			tokens = header.read().split()
		self.assertEqual(tokens[2].strip('"'), pyfastx.version())

		#reverse complement
		self.assertEqual(pyfastx.reverse_complement('ATGC'), 'GCAT')

	def test_build(self):
		# Open the gzip fixture without an index, then build the index on
		# demand and verify that the index file is actually written.
		index_file = '{}.fxi'.format(gzip_fasta)

		# Rebinding self.fastx replaces the reader created in setUp.
		self.fastx = pyfastx.Fasta(gzip_fasta, build_index=False)

		# remove any index already on disk so build_index() has to create it
		if os.path.exists(index_file):
			os.remove(index_file)

		self.fastx.build_index()

		# Previously nothing was asserted, so the test passed even when no
		# index was produced.
		self.assertTrue(os.path.exists(index_file))

	def test_fasta(self):
		# Cross-check whole-file properties reported by pyfastx (record count,
		# total size, base composition, GC content and skew, longest and
		# shortest record, name membership) against values derived from pyfaidx.

		#test gzip
		self.assertFalse(self.fasta.is_gzip)

		#seq counts
		self.assertEqual(len(self.fastx), len(self.faidx.keys()))

		# read the reference records once and reuse them for every check below
		records = list(self.faidx)

		#seq length
		expect_size = sum(len(rec) for rec in records)
		self.assertEqual(self.fastx.size, expect_size)

		#test composition
		expect = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
		for rec in records:
			# slice the record once and count every base on the same string
			bases = rec[:].seq
			for base in expect:
				expect[base] += bases.count(base)
		self.assertEqual(self.fastx.composition, expect)

		#test GC content
		expect_gc = (expect['G']+expect['C'])/sum(expect.values())*100
		self.assertEqual(round(self.fastx.gc_content, 3), round(expect_gc, 3))

		#test GC skew
		expect_skew = (expect['G']-expect['C'])/(expect['G']+expect['C'])
		self.assertEqual(round(self.fastx.gc_skew, 3), round(expect_skew, 3))

		#test longest and shortest sequence
		# max()/min() return the first record among ties, like the strict
		# comparisons used before. Unlike the old seed of (None, total size),
		# min() also yields a real record when the file has a single sequence.
		longest_rec = max(records, key=len)
		shortest_rec = min(records, key=len)
		longest = (longest_rec.name, len(longest_rec))
		shortest = (shortest_rec.name, len(shortest_rec))

		long_seq = self.fastx.longest
		short_seq = self.fastx.shortest

		self.assertEqual(longest, (long_seq.name, len(long_seq)))
		self.assertEqual(shortest, (short_seq.name, len(short_seq)))

		#test contains
		idx = self.get_random_index()
		name = self.faidx[idx].name
		self.assertIn(name, self.fastx)

	#test repr
	def test_repr(self):
		# repr() of an indexed reader names the file and the record count;
		# a reader opened without an index only names the file.
		indexed_repr = "<Fasta> {} contains {} sequences".format(gzip_fasta, self.count)
		self.assertEqual(indexed_repr, repr(self.fastx))

		#without build index
		unindexed = pyfastx.Fasta(flat_fasta, build_index=False)
		unindexed_repr = "<Fasta> {}".format(flat_fasta)
		self.assertEqual(unindexed_repr, repr(unindexed))

	def test_seq_type(self):
		# The reader's `type` attribute must name the kind of sequence in each fixture.

		#test dna format
		self.assertEqual(self.fastx.type, 'DNA')

		#test rna format, then protein format
		other_fixtures = (
			(rna_fasta, "RNA"),
			(protein_fasta, "protein"),
		)
		for fixture, expected_type in other_fixtures:
			reader = pyfastx.Fasta(fixture)
			self.assertEqual(reader.type, expected_type)

	def test_iter_object(self):
		# Iterating an indexed reader yields record objects; each record's
		# sequence must equal the pyfaidx sequence of the same name.
		for record in self.fastx:
			reference = self.faidx[record.name][:].seq
			self.assertEqual(reference, record.seq)

		# same check for a reader opened with uppercase=True on the plain file
		for record in pyfastx.Fasta(flat_fasta, uppercase=True):
			reference = self.faidx[record.name][:].seq
			self.assertEqual(reference, record.seq)

		#test reference of sequence made from loop
		# keep the first record and use it after the loop has been left
		for record in self.fastx:
			break

		reference = self.faidx[record.name][:].seq
		# the sequence is read twice on purpose (presumably to cover repeated access)
		self.assertEqual(reference, record.seq)
		self.assertEqual(reference, record.seq)

	def test_iter_tuple(self):
		# A reader opened with build_index=False is iterated as (name, sequence)
		# pairs; every sequence must equal the pyfaidx record of that name.
		reader = pyfastx.Fasta(gzip_fasta, build_index=False)

		for seq_name, seq_text in reader:
			reference = str(self.faidx[seq_name])
			self.assertEqual(reference, seq_text)

	def test_iter_upper(self):
		# Index-free iteration with uppercase=True must yield the same text as
		# the upper-cased pyfaidx reference.
		reader = pyfastx.Fasta(flat_fasta, build_index=False, uppercase=True)

		for seq_name, seq_text in reader:
			# only the first whitespace-separated token of the name is the lookup key
			key = seq_name.split()[0]
			self.assertEqual(seq_text, self.faidx[key][:].seq)

	def test_iter_full_name(self):
		# With full_name=True, the name yielded by index-free iteration must
		# equal the description of the matching record in the indexed reader.
		fa = pyfastx.Fasta(flat_fasta, build_index=False, full_name=True)

		for name, _ in fa:
			# assertTrue(name, description) only checked that `name` was truthy
			# and used the description as the failure message, so it could
			# never fail on a mismatch; compare the two values instead.
			self.assertEqual(name, self.fastx[name.split()[0]].description)

	def test_iter_upper_full_name(self):
		# Index-free iteration with both uppercase=True and full_name=True:
		# the yielded name must equal the record description and the yielded
		# text must equal the record sequence from the indexed reader.
		reader = pyfastx.Fasta(flat_fasta, build_index=False, uppercase=True, full_name=True)

		for full_name, seq_text in reader:
			# the first token of the full name is the record's lookup key
			record = self.fastx[full_name.split()[0]]
			self.assertEqual(full_name, record.description)
			self.assertEqual(seq_text, record.seq)

	def test_key_func(self):
		# A key_func passed to the reader decides the record name: here the
		# second whitespace-separated field of the header line.
		def second_field(header):
			return header.split()[1]

		del self.fastx

		#remove previously created index file
		index_file = "{}.fxi".format(gzip_fasta)
		if os.path.exists(index_file):
			os.remove(index_file)

		self.fastx = pyfastx.Fasta(gzip_fasta, key_func=second_field)
		record = self.fastx[self.get_random_index()]
		self.assertEqual(record.name, record.description.split()[1])

	def test_statistics(self):
		# Length statistics reported by pyfastx (N50/L50, mean, median and the
		# number of sequences of at least a given length) checked against
		# values computed from the pyfaidx record lengths.
		lens = sorted((len(seq) for seq in self.faidx), reverse=True)
		total = sum(lens)

		#test N50 and L50: walk the lengths from longest to shortest until
		#the running sum reaches half of the total length
		half = total/2
		tlen = 0
		for l50, n50 in enumerate(lens, start=1):
			tlen += n50

			if tlen >= half:
				break

		self.assertEqual(self.fastx.nl(50), (n50, l50))

		#test mean length
		expect = round(total/len(lens), 3)
		result = round(self.fastx.mean, 3)
		self.assertEqual(expect, result)

		#test median length
		# Derived from the data instead of the former hard-coded lens[105],
		# which was only the median for a fixture of one specific size.
		lens.sort()
		mid = len(lens) // 2
		if len(lens) % 2:
			expect = lens[mid]
		else:
			# assumes pyfastx averages the two middle lengths for an even
			# number of records (the conventional definition) -- TODO confirm
			expect = (lens[mid-1] + lens[mid]) / 2

		result = self.fastx.median
		self.assertEqual(expect, result)

		#test count sequence
		expect = sum(1 for length in lens if length >= 200)
		result = self.fastx.count(200)
		self.assertEqual(expect, result)

	def test_seq_fetch(self):
		# fetch() takes 1-based inclusive (start, end) intervals; the result is
		# compared with slices of the pyfaidx sequence for a random record.
		idx = self.get_random_index()
		name = list(self.faidx.keys())[idx]
		seq_len = len(self.fastx[idx])
		mid = seq_len // 2

		#test one interval: start in the first half, end in the second half
		start = random.randint(1, mid)
		end = random.randint(mid+1, seq_len)

		reference = str(self.faidx[name])
		self.assertEqual(reference[start-1:end], self.fastx.fetch(name, (start, end)))

		#test multiple intervals: one inside each half of the sequence
		quarter = mid // 2
		upper_mid = (mid + seq_len) // 2
		intervals = [
			(random.randint(1, quarter), random.randint(quarter+1, mid)),
			(random.randint(mid+1, upper_mid), random.randint(upper_mid+1, seq_len)),
		]

		# the expected text is the concatenation of the individual slices
		joined = "".join(reference[s-1:e] for s, e in intervals)
		self.assertEqual(joined, self.fastx.fetch(name, intervals))

	def test_seq_flank(self):
		# flank() returns the text on both sides of a 1-based (start, end)
		# region; both sides are compared with slices of the pyfaidx sequence.
		idx = self.get_random_index()
		name = list(self.faidx.keys())[idx]
		seq_len = len(self.fastx[idx])

		mid = seq_len // 2
		start = random.randint(1, mid)
		end = random.randint(mid+1, seq_len)
		flen = 20

		left, right = self.fastx.flank(name, start, end, flen)

		# the left flank is clipped at the beginning of the sequence
		reference = str(self.faidx[name])
		expect_left = reference[max(start-flen-1, 0):start-1]
		expect_right = reference[end:end+flen]

		self.assertEqual(expect_left, left)
		self.assertEqual(expect_right, right)

		# same region again, with keyword arguments and use_cache=True
		left, right = self.fastx.flank(name, start, end, flank_length=flen, use_cache=True)
		self.assertEqual(expect_left, left)
		self.assertEqual(expect_right, right)

		# a region spanning the whole sequence has nothing on either side
		left, right = self.fastx.flank(name, 1, len(self.faidx[name]))
		self.assertEqual('', left)
		self.assertEqual('', right)

	def test_no_upper(self):
		# A reader opened with uppercase=False must return the same first
		# sequence as the default reader from setUp.
		plain_reader = pyfastx.Fasta(flat_fasta, uppercase=False)
		default_first = self.fastx[0].seq
		self.assertEqual(default_first, plain_reader[0].seq)

	def test_exception(self):
		# Every misuse of the API below must raise the listed exception type.

		# key_func that is not callable
		with self.assertRaises(TypeError):
			pyfastx.Fasta(flat_fasta, key_func=1)

		# input file that does not exist (the API is expected to raise FileExistsError)
		with self.assertRaises(FileExistsError):
			pyfastx.Fasta('a_file_not_exists')

		# intervals given as a dict
		with self.assertRaises(ValueError):
			self.fastx.fetch('seq1', {'a':1})

		# presumably 'seq1' is not a record name in the fixture
		with self.assertRaises(NameError):
			self.fastx.fetch('seq1', (1,10))

		# interval with three members
		with self.assertRaises(ValueError):
			self.fastx.fetch(self.fastx[0].name, (1,10,20))

		# interval whose start is greater than its end, as tuple and as list
		with self.assertRaises(ValueError):
			self.fastx.fetch(self.fastx[0].name, (20, 10))

		with self.assertRaises(ValueError):
			self.fastx.fetch(self.fastx[0].name, [20, 10])

		# index one past the last record
		with self.assertRaises(IndexError):
			_ = self.fastx[self.count]

		# a list used as subscript
		with self.assertRaises(KeyError):
			_ = self.fastx[list()]

		# 101 is rejected by nl()
		with self.assertRaises(ValueError):
			self.fastx.nl(101)

		# A file that is not FASTA must be rejected. The file is written
		# outside the assertRaises block so that only the Fasta() call is under
		# test, and it is removed in `finally`: the old os.remove() came after
		# the raising call inside the same block and therefore never ran.
		non_fa = 'non.fa'
		with open(non_fa, 'w') as fw:
			fw.write('abc')

		try:
			with self.assertRaises(RuntimeError):
				pyfastx.Fasta(non_fa)
		finally:
			# also drop an index file in case one was created before the failure
			for leftover in (non_fa, '{}.fxi'.format(non_fa)):
				if os.path.exists(leftover):
					os.remove(leftover)

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
	unittest.main()