File: split_multiple_snps.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (85 lines) | stat: -rw-r--r-- 5,542 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import sys

""" 
Given a discoSnp fasta file result, splits the bubbles into bubbles containing a unique SNP. Upper and lower case of each SNP is STRICTLY the same excepted at the variant position. Thus the phasing is lost here.
This is a simplification needed for validation purposes

Examples: 
    >SNP_higher_path_701|P_1:30_C/T|high|nb_pol_1|left_unitig_length_226|right_unitig_length_164|left_contig_length_475|right_contig_length_1438|C1_31|C2_0|Q1_0|Q2_0|G1_0/0:6,98,624|G2_1/1:644,101,6|rank_1
taaaaatgaacatattaAGTACTCACAATTCATCTCTTCCCTGTCACCTGACTGCATCTGACTGCATACCACGTACACcag
>SNP_lower_path_701|P_1:30_C/T|high|nb_pol_1|left_unitig_length_226|right_unitig_length_164|left_contig_length_475|right_contig_length_1438|C1_0|C2_32|Q1_0|Q2_0|G1_0/0:6,98,624|G2_1/1:644,101,6|rank_1
taaaaatgaacatattaAGTACTCACAATTCATCTCTTCCCTGTCACTTGACTGCATCTGACTGCATACCACGTACACcagg
becomes: 
>SNP_higher_path_701_1|P_1:30_C/T
AGTACTCACAATTCATCTCTTCCCTGTCACCTGACTGCATCTGACTGCATACCACGTACA
>SNP_lower_path_701_1|P_1:30_C/T
AGTACTCACAATTCATCTCTTCCCTGTCACTTGACTGCATCTGACTGCATACCACGTACA

AND:
>SNP_higher_path_711|P_1:30_T/G,P_2:32_G/C|high|nb_pol_2|left_unitig_length_18|right_unitig_length_61|left_contig_length_649|right_contig_length_203|C1_0|C2_40|Q1_0|Q2_0|G1_1/1:464,74,5|G2_0/0:6,125,804|rank_1
tcattcactcattcgctcactcattcactcattcattctttactcattcactcattcactcactcactgattcactcattcattcattcactcacacattcacttattcactcacttactcattcactcactcactcacgtccagggcctgcttggtgtccactgctgctgcaggcaactggcttcagttgggggaatgtatttctagtttacctttccctaaaggtgtggggtagggacgatggtcctctcctgagtctggcacagtgctgcagtctctgcccttgtgtacccgaaaacctgaaggtcaaggtcactgctgcaaccgatgccctgctgggaaagagaaagccttgggctgcctcagctttggcctccacagggtctctgcggtgcccttcagggcaccagagaaggtcctttcctgctacccaggctcccagtgcctgggctgagcctttgccccatccacccctcactgcctgtctcacgccggggcctttgcacccgccgatccctctgccgggagcaccctgctgcgcctccttgctccagagaagcctcccctgcccccacccccacccgtgagctgctccctgtgattttctgtggcaccacctgtgtcactatgcaggtcacagtaacagcaATGCGCTGGCCCCGTCCAGACTTCCTGTCTTTGGCTCCATAGGACGGTCCCCAGGAAGAAGAAccctgtctacccggcccagttcccccttctccagcaggcctgggacagagccagagcagataggcaaggctccgtccccaaactcattctgaggaaaccaccacagccatcctcctggggtcagcctggggcttttcccgggcatggcttatatccacagaaatgagaactgcgccaggcgcggtggctcacacctgtaatcc
>SNP_lower_path_711|P_1:30_T/G,P_2:32_G/C|high|nb_pol_2|left_unitig_length_18|right_unitig_length_61|left_contig_length_649|right_contig_length_203|C1_23|C2_0|Q1_0|Q2_0|G1_1/1:464,74,5|G2_0/0:6,125,804|rank_1
tcattcactcattcgctcactcattcactcattcattctttactcattcactcattcactcactcactgattcactcattcattcattcactcacacattcacttattcactcacttactcattcactcactcactcacgtccagggcctgcttggtgtccactgctgctgcaggcaactggcttcagttgggggaatgtatttctagtttacctttccctaaaggtgtggggtagggacgatggtcctctcctgagtctggcacagtgctgcagtctctgcccttgtgtacccgaaaacctgaaggtcaaggtcactgctgcaaccgatgccctgctgggaaagagaaagccttgggctgcctcagctttggcctccacagggtctctgcggtgcccttcagggcaccagagaaggtcctttcctgctacccaggctcccagtgcctgggctgagcctttgccccatccacccctcactgcctgtctcacgccggggcctttgcacccgccgatccctctgccgggagcaccctgctgcgcctccttgctccagagaagcctcccctgcccccacccccacccgtgagctgctccctgtgattttctgtggcaccacctgtgtcactatgcaggtcacagtaacagcaATGCGCTGGCCCCGTCCAGACTTCCTGTCTGTCGCTCCATAGGACGGTCCCCAGGAAGAAGAAccctgtctacccggcccagttcccccttctccagcaggcctgggacagagccagagcagataggcaaggctccgtccccaaactcattctgaggaaaccaccacagccatcctcctggggtcagcctggggcttttcccgggcatggcttatatccacagaaatgagaactgcgccaggcgcggtggctcacacctgtaatcc
becomes 
>SNP_higher_path_711_1|P_1:30_T/G
ATGCGCTGGCCCCGTCCAGACTTCCTGTCTTTGGCTCCATAGGACGGTCCCCAGGAAGAAG
>SNP_lower_path_711_1|P_1:30_T/G
ATGCGCTGGCCCCGTCCAGACTTCCTGTCTGTGGCTCCATAGGACGGTCCCCAGGAAGAAG
>SNP_higher_path_711_2|P_2:32_G/C
GCGCTGGCCCCGTCCAGACTTCCTGTCTTTGGCTCCATAGGACGGTCCCCAGGAAGAAGAA
>SNP_lower_path_711_2|P_2:32_G/C
GCGCTGGCCCCGTCCAGACTTCCTGTCTTTCGCTCCATAGGACGGTCCCCAGGAAGAAGAA
"""
file = open(sys.argv[1])

def get_maj_seq(seq):
    """
    returns the upper case letters from a sequence @seq
    """
    res=""
    for l in seq:
        if l>='A' and l<='Z': 
            res+=l
    return res

delta=30
count=0
while True: 
    com1=file.readline()
    if not com1: break
    count+=1
    com1=com1.rstrip()               #>SNP_higher_path_30|P_1:30_A/T,P_2:55_G/T|low|nb_pol_2|left_unitig_length_6|right_unitig_length_0
    seq1=get_maj_seq(file.readline().rstrip())
    com2=file.readline().rstrip()
    seq2=get_maj_seq(file.readline().rstrip())
    str_nb_pol=com1.split('|')[3]
    assert str_nb_pol.startswith("nb_pol")
    nb_pol=int(com1.split('|')[3].split('_')[-1])
    if com1.startswith(">INDEL"):
        print (com1)
        print (seq1)
        print (com2)
        print (seq2)
        continue
    
    all_pos_pol=com1.split('|')[1].split(',')
    splited_com1=com1.split('|')
    splited_com2=com2.split('|')
    for pol in range(nb_pol): 
        pos_pol=int(all_pos_pol[pol].split(':')[1].split('_')[0])
        alt_allele=all_pos_pol[pol].split(':')[1].split('_')[1].split('/')[1]
        subseq1=seq1[pos_pol-delta:pos_pol+delta+1]                                     # get the upper case sequence, including the current variant and containing the sequence surrounding this variant
        subseq2=seq1[pos_pol-delta:pos_pol]+alt_allele+seq1[pos_pol+1:pos_pol+delta+1]  # get the upper case sequence, including the current variant and containing the sequence from sequence1 containing this variant. 
        subcom1=splited_com1[0]+'_'+str(pol+1)+'|'+all_pos_pol[pol]
        subcom2=splited_com2[0]+'_'+str(pol+1)+'|'+all_pos_pol[pol]
        print (subcom1)
        print (subseq1)
        print (subcom2)
        print (subseq2)