File: extract_reads_aligned_to_region.py

package info (click to toggle)
nanopolish 0.14.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 11,760 kB
  • sloc: cpp: 22,200; ansic: 1,478; python: 814; makefile: 210; sh: 43; perl: 17
file content (333 lines) | stat: -rwxr-xr-x 13,206 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#!/usr/bin/env python3
"""
========================================================
Extract info on reads that align to a given region
in draft genome assembly.
========================================================
"""


try:
	from Bio.SeqIO.FastaIO import SimpleFastaParser
	from Bio.SeqIO.QualityIO import FastqGeneralIterator
	import pysam
	import argparse
	import subprocess
	import tarfile
	import gzip
	import sys,os
except ImportError:
	print('Missing package(s)')
	quit()

# When True, custom_print() echoes each message to stdout as it is logged.
verbose = False
# Accumulates every custom_print() message; written to <output_prefix>.log at the end of main().
log = list()

def main():
	"""Extract reads aligned to a user-given region of a draft assembly
	and package everything needed to reproduce an error into a tar.gz:
	the reads' FAST5 files, a subset FASTA/BAM/BAI restricted to the
	region, and the draft assembly with its .fai index.
	"""
	# --------------------------------------------------------
	# PART 0: 	Parse input
	# --------------------------------------------------------
	parser = argparse.ArgumentParser(description='Extract and package reads within region')
	parser.add_argument('-v', '--verbose', action="store_true", default=False, required=False, dest="verbose", help="Use for verbose output with info on progress.")
	parser.add_argument('-b', '--bam', action="store", required=True, dest="bam", help="Sorted bam file created by aligning reads to the draft genome (refer to reads.sorted.bam in Nanopolish README).")
	parser.add_argument('-r', '--reads', action="store", dest="fa_filename", help="Fasta, fastq, fasta.gz, or fastq.gz file (refer to reads.fa in Nanopolish README)")
	parser.add_argument('-g', '--genome',  action="store", required=True, dest="draft_ga", help="Draft genome assembly (refer to draft.fa in Nanopolish README).")
	parser.add_argument('-w', '--window', action="store", required=True, dest="draft_ga_coords", help="Draft genome assembly coordinates wrapped in quotes ex. \"tig000001:10000-20000\".")
	parser.add_argument('-o', '--output_prefix', action="store", required=False, default="reads_subset", dest="output_prefix", help="Output prefix for tar.gz file and log file.")
	args = parser.parse_args()

	# Check to see if user used verbose option
	global verbose
	if args.verbose:
		verbose = True

	# Infer readdb file from fasta/q file (created by `nanopolish index`)
	readdb = args.fa_filename + ".index.readdb"

	custom_print( "===================================================" )
	custom_print( "Extract reads that align to given region" )
	custom_print( "Package all necessary files to reproduce error" )
	custom_print( "===================================================" )

	# --------------------------------------------------------
	# PART 1: 	Validate input
	# --------------------------------------------------------
	custom_print( "[ Input ]" )
	custom_print( "[+] Extracting from draft genome assembly coords: " + args.draft_ga_coords )
	custom_print( "[+] BAM file (reads.fa aligned to draft.fa): " + args.bam )
	custom_print( "[+] Readdb file: " + readdb )
	custom_print( "[+] Draft genome assembly (draft.fa): " + args.draft_ga )
	custom_print( "[+] FASTA/Q file (reads.fa): " + args.fa_filename )
	custom_print( "[+] Output prefix: " + args.output_prefix )

	custom_print( "[ Input check ]" )
	draft_ga_fai = args.draft_ga + ".fai"
	files = [args.bam, readdb, args.fa_filename, args.draft_ga, draft_ga_fai]

	# every input file must exist, be non-empty, and be readable
	for input_file in files:
		if not os.path.exists(input_file) or not os.path.getsize(input_file) > 0 or not os.access(input_file, os.R_OK):
			print( "Expecting " + input_file + ". But does not exist, is empty or is not readable." )
			sys.exit(1)

	custom_print( "[ Validated input ] All input files exist, are not-empty, and are readable." )

	# --------------------------------------------------------
	# PART 2: 	Reassign input argument values	
	# --------------------------------------------------------
	# o = old/original, ga = genome assembly, fa = fasta/q file
	# coords = coordinates, op = output
	o_bam = args.bam
	o_readdb = readdb
	o_fa = args.fa_filename
	op = args.output_prefix
	draft_ga_coords = args.draft_ga_coords

	# --------------------------------------------------------
	# PART 3: 	With user input ref coords, extract all 
	#		aligned reads within these coordinates, 
	#		store read_ids, and fast5 files.
	# --------------------------------------------------------
	custom_print( "[ Extracting info on reads aligned to region ] \t" + draft_ga_coords )
	# use a set: read ids are looked up for every readdb row and every
	# FASTA/Q record below, and a list would make that O(n^2)
	region_read_ids = set()
	samfile = pysam.AlignmentFile(o_bam, "rb")
	try:
		for read in samfile.fetch(region=draft_ga_coords):
			region_read_ids.add(read.query_name)
	finally:
		samfile.close()
	# number of distinct reads aligned to the given region
	region_num_reads = len(region_read_ids)

	# --------------------------------------------------------
	# PART 4:   Parse readdb file and find path to fast5 files
	# 		associated with each read that aligned to region
	# --------------------------------------------------------
	# readdb file has 2 tab-separated columns: read_id and the fast5 file the read came from
	# each row represents a read
	custom_print( "[ Reading readdb file ]" )
	region_fast5_files = dict()
	with open(o_readdb, "r") as readdb_file:
		for line in readdb_file:
			fields = line.split("\t")
			read_id = fields[0]
			if read_id in region_read_ids and len(fields) > 1:
				region_fast5_files[read_id] = fields[1].rstrip()

	# --------------------------------------------------------
	# PART 5:   Make a region BAM and BAI file
	# --------------------------------------------------------
	new_bam = "reads.bam"
	custom_print( "[ Writing to a new BAM file ] \t" + new_bam )
	pysam.view("-b", o_bam, draft_ga_coords, "-o", new_bam, catch_stdout=False)

	new_bam_index = new_bam + ".bai"
	custom_print( "[ Writing to a new BAI file ] \t" + new_bam_index )
	pysam.index(new_bam, new_bam_index)

	# --------------------------------------------------------
	# PART 6: 	With user input ref coords, extract all 
	#		aligned	reads within these coordinates 
	#		and make new FASTA file
	# --------------------------------------------------------
	# detect type of sequences file then handle accordingly
	file_type = detect_fa_filetype(o_fa)
	new_fa = "reads.fasta"
	custom_print( "[ Writing to a new fasta file ]\t" +  new_fa )
	# FASTA records are (title, seq) pairs; FASTQ records are
	# (title, seq, qual) triples -- both carry title/seq in slots 0 and 1,
	# and the output is always FASTA, so one loop handles all four cases
	record_parser = SimpleFastaParser if "fasta" in file_type else FastqGeneralIterator
	opener = gzip.open if file_type.endswith(".gz") else open
	with open(new_fa, "w") as fout:
		with opener(o_fa, "rt") as fin:
			for record in record_parser(fin):
				name = record[0].split(None, 1)[0]
				if name in region_read_ids:
					fout.write(">%s\n%s\n" % (name, record[1]))

	# --------------------------------------------------------
	# PART 7: 	Let's get to tarring
	# --------------------------------------------------------
	# While tarring, we need to fix the directory structure
	# such that the original path to files are not saved.
	# For each fast5 file we need to extract the basename,
	# and save it in tar such that we save only the basename,
	# and not the whole path from the original source.
	tar_filename = op + ".tar.gz"
	archive = tarfile.open(tar_filename, "w:gz")
	custom_print( "[ Creating a tar.gz file ] \t" + tar_filename )
	custom_print( "[+] FAST5 files: " + op + "/fast5_files/<FAST5 file(s)>" )
	# track missing fast5 files
	bad_f5_found = False # true if missing fast5 file
	bad_read_id = ""
	bad_f5_path = ""
	num_bad_cases = 0
	for read_id, f5 in region_fast5_files.items():
		# store only the basename of each fast5 file under <prefix>/fast5_files/
		an = op + "/fast5_files/" + extract_basename(f5)
		try:
			archive.add(f5, arcname=an)
		except (OSError, tarfile.TarError):
			# fast5 path from the readdb file could not be added; remember
			# the last failing case so it can be reported below
			bad_f5_found = True
			bad_read_id = read_id
			bad_f5_path = f5
			num_bad_cases += 1

	# handle missing fast5 files; report the tracked bad case, not the
	# last read the loop happened to visit
	if bad_f5_found:
		print("\nERROR: For read " + bad_read_id + ", could not add " + str(bad_f5_path) + ".")
		print("This path is inferred from the readdb file.")
		print("Please check that this is the correct path in readdb file for this read.")
		if num_bad_cases > 1:
			# num_bad_cases includes the case printed above
			print("There are " + str(num_bad_cases - 1) + " other reads with this problem (out of " + str(len(region_fast5_files)) + ").")
		print("\n")
		sys.exit(1)

	# --------------------------------------------------------
	# PART 8:	Add new files to tar
	# 			new fasta, new bam, and new bai with reads 
	#			in the region given only
	# --------------------------------------------------------
	an = op + "/" + new_fa
	archive.add(new_fa, arcname=an)
	custom_print( "[+] New FASTA: " + an )

	an_new_bam = op + "/" + new_bam
	archive.add(new_bam, arcname=an_new_bam)
	custom_print( "[+] New BAM: " + an_new_bam )

	an_new_bam_index = op + "/" + new_bam_index
	archive.add(new_bam_index, arcname=an_new_bam_index)
	custom_print( "[+] New BAI: " + an_new_bam_index )

	# --------------------------------------------------------
	# PART 9:	Add original draft genome assembly file
	#			and the index file
	# --------------------------------------------------------
	an_draft_ga = op + "/draft.fa"
	archive.add(args.draft_ga, arcname=an_draft_ga)
	custom_print( "[+] Original draft ga: " + an_draft_ga )

	an_draft_ga_fai = op + "/draft.fa.fai"
	archive.add(draft_ga_fai, arcname=an_draft_ga_fai)
	custom_print( "[+] Original draft ga index: " + an_draft_ga_fai )

	# --------------------------------------------------------
	# PART 10: 	Check the number of reads in all new files
	# --------------------------------------------------------
	custom_print( "[ Output check ] " )
	# each FASTA record occupies exactly 2 lines (header + sequence)
	num_reads_bam = region_num_reads
	num_reads_fasta = int(float(file_length(new_fa))/2.0)
	num_fast5_files = len(region_fast5_files)
	values = [num_reads_bam, num_reads_fasta]
	custom_print( "[+] Num reads in new BAM: \t" + str(num_reads_bam) )
	custom_print( "[+] Num reads in new FASTA: \t" + str(num_reads_fasta) )
	custom_print( "[+] Num files in fast5_files/: \t" + str(num_fast5_files))
	if not all( v == num_fast5_files for v in values ):
		print( "[!] WARNING: The number of reads in the new bam, new fasta, and num of fast5 files tarred are not equal..." )
	else:
		custom_print( "[ Validated output ] Number of reads in the new bam, new fasta, and num of fast5 files tarred are equal!" )

	# --------------------------------------------------------
	# FINAL: 	Output log if verbose flag not used
	# --------------------------------------------------------
	global log
	logfile = op + ".log"
	with open(logfile, "w") as lfile:
		for s in log:
			lfile.write(s + "\n")
	an_logfile = op + "/" + logfile
	custom_print( "[ Log file ] " +  an_logfile )
	custom_print( "[ Tar file ] " + str(tar_filename) )
	custom_print( "[ Finished ] " )
	archive.add(logfile, arcname=an_logfile)
	archive.close()

def file_length(filename):
	# ========================================================
	# Returns number of lines in a file
	# --------------------------------------------------------
	# Input: 	Filename
	# Output: 	Number of lines in the file (0 for an empty
	#			file; the original raised UnboundLocalError
	#			because the loop variable was never assigned)
	# ========================================================
	count = 0
	with open(filename) as f:
		# enumerate from 1 so `count` ends up equal to the line total
		for count, _ in enumerate(f, start=1):
			pass
	return count

def extract_basename(filename):
	# ========================================================
	# Returns base filename
	# --------------------------------------------------------
	# Input: 	Filename, possibly with a leading path
	# Output: 	Base filename as a string
	# ========================================================
	# trailing path separators (either flavour) would make
	# os.path.basename() return an empty string, so drop them first
	trimmed = filename.rstrip("\\").rstrip("/")
	return str(os.path.basename(trimmed))

def detect_fa_filetype(fa_filename):
	# ========================================================
	# Detects filetype of sequences input from its extension
	# --------------------------------------------------------
	# Input: 	FASTA/Q filename
	# Output: 	One of ['fastq.gz', 'fasta.gz', 'fastq',
	#			'fasta']; exits with status 1 otherwise.
	#			A '*.fa.gz' name (typically produced by
	#			nanopolish index) only triggers a warning.
	# ========================================================
	if fa_filename.endswith('fa.gz'):
		print("Possibly using the reads file generated by nanopolish index? Use original reads file...")
	# check compressed extensions first so 'fastq' does not match 'fastq.gz'
	for extension in ('fastq.gz', 'fasta.gz', 'fastq', 'fasta'):
		if fa_filename.endswith(extension):
			return extension
	print("Must be either fasta, fastq, fasta.gz, fastq.gz")
	sys.exit(1)

def custom_print(s):
	# ========================================================
	# Always records the message in the global log list (so it
	# can be written to the log file later); additionally
	# echoes it to stdout when the verbose flag is set.
	# --------------------------------------------------------
	# Input: 	string to print
	# ========================================================
	global verbose, log
	log.append(s)
	if verbose:
		print(s)

# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
	main()