File: repeat_annotate_reads.py

package info (click to toggle)
hinge 0.5.0-8
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,972 kB
  • sloc: cpp: 9,480; ansic: 8,826; python: 5,023; sh: 340; makefile: 10
file content (101 lines) | stat: -rwxr-xr-x 3,521 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/python3

import sys
import os



def reverse_complement(bases):
    rev_comp={'A':'T','C':'G','G':'C','T':'A','N':'N'}
    return ''.join(map(lambda x :rev_comp[x],bases[::-1]))

def run(multifasta_path,intermediate_repeat_file_path, gt_file_path, gt_annotated_file_path):

	##Read chromosomes from multifasta file
	chrom={}
	cur_chrom=''
	start=True
	chr_num=0
	with open(multifasta_path,'r') as f:
	    for lines in f:
	        if lines[0]=='>':
	            
	            if start:
	                start=False
	                chr_num=int(lines.split()[0][1:])-1
	                print 'detect chr '+ str(chr_num)
	            else:
	                chrom[chr_num]=cur_chrom
	                print len(cur_chrom)
	                chr_num=int(lines.split()[0][1:])-1
	                print 'detect chr '+ str(chr_num)
	            cur_chrom=''
	        else:
	            cur_chrom+=lines.strip()
	print len(cur_chrom)            
	chrom[chr_num]=cur_chrom

	##Run mummer to get repeats
	mummer_cmd='mummer -maxmatch -b -c -l 1000 -L '+multifasta_path+' '+multifasta_path +' > '+intermediate_repeat_file_path 
	os.system(mummer_cmd)


	#Put repeats discovered by mummer in right form
	chr_num=0
	chr_repeats={}
	rev_com=False
	with open (intermediate_repeat_file_path) as f:
	    for line in f:
	        if line[0]=='>':
	            line1=line.strip().split()
	            chr_num=int(line1[1])-1
	            #print len(line1)
	            if len(line1)==6:
	                rev_com=True
	                chr_len=int(line1[5])
	            else:
	                rev_com=False
	                chr_len=int(line1[4])
	        else:
	            line1=line.strip().split()
	            chr2_num=int(line1[0])-1
	            chr2_start=int(line1[1])-1
	            chr1_start=int(line1[2])-1
	            rep_len=int(line1[3])
	            if not rev_com:
	                if not (chrom[chr_num][chr1_start:chr1_start+rep_len]
	                        ==chrom[chr2_num][chr2_start:chr2_start+rep_len]):
	                    print chr_num+1,line
	                if chr1_start==0 and rep_len==chr_len:
	                    continue
	                chr_repeats.setdefault(chr_num,[]).append((chr1_start,chr1_start+rep_len))
	            else:
	                if not (chrom[chr_num][chr1_start-rep_len+1:chr1_start+1]
	                        == reverse_complement(chrom[chr2_num][chr2_start:chr2_start+rep_len])):
	                    print chr_num+1,line,rev_com
	                chr_repeats.setdefault(chr_num,[]).append((chr1_start-rep_len+1,chr1_start+1))

	#Go through gt file and annotate reads that intersect with repeats.
	with open(gt_file_path) as f:
	    with open(gt_annotated_file_path,'w') as g:
	        for line in f:
	            line1=line.split()
	            cr=int(line1[1])
	            rd_st=int(line1[2])
	            rd_end=int(line1[3])
	            is_repeat=0
	            for tup in chr_repeats[cr]:
	                if ((rd_st >= tup[0] and rd_st <= tup[1]) or
	                    (rd_end >= tup[0] and rd_end <= tup[1])):
	                    is_repeat=1
	            line2=line.strip()+"\t"+str(is_repeat)+"\n"
	            g.write(line2)

if __name__ == '__main__':
	multifasta_path=sys.argv[1]
	gt_file_path=sys.argv[2]
	gt_annotated_file_path=sys.argv[3]
	intermediate_repeat_file_path='./repeats_discovered.txt'
	if len(sys.argv) > 4:
		intermediate_repeat_file_path=sys.argv[4]
	run(multifasta_path,intermediate_repeat_file_path, gt_file_path, gt_annotated_file_path)ß