File: stats.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (47 lines) | stat: -rw-r--r-- 1,505 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import matplotlib.pyplot as plt
import sys


def get_nb_allele_distribution(gfa_file_name):
    gfa_file=open(gfa_file_name)
    sizes = []
    for line in gfa_file:
        if line[0]=='S':#S  2       34156l;-11363l;13698l;-26143h;10014l;   RC:i:144
            l=len(line.split()[2].split(';')[:-1])
            sizes.append(l)
    gfa_file.close()
    return sizes

def get_sequence_size_distribution(gfa_file_name):
    gfa_file=open(gfa_file_name)
    sizes = []
    for line in gfa_file:
        if line[0]=='S':#S       0       actgcaACAGCTGTTGAAAAGCCGGAATGTACTCTTCATTGCAAACATTTCAGGGATGAAGTGAAGAtgaattgCGACGTAGTATCCACACCAAGCCGGCGTTATCCGGTGAGGCGCAATGTTGCGGGGGCttt  RC:i:11
            sizes.append(len(line.split()[2]))
    gfa_file.close()
    return sizes
    
def plot_violin(sequence_sizes, nb_alleles, read_set_id):
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))
    axes[0].violinplot(sequence_sizes)
    axes[0].set_title('Sequence size distribution')
    axes[1].violinplot(nb_alleles)
    axes[1].set_title('Nb alleles per sequence')
    plt.savefig('distributions_'+read_set_id+'.png')
    
    

def main():
    '''
    Stats from a gfa file
    '''
    sequence_sizes=get_sequence_size_distribution(sys.argv[2]) #for each snp id: sequences[snp_id]=[left_unitig_len, right_unitig_len, upperseq, lowerseq] 
    nb_alleles=get_nb_allele_distribution(sys.argv[1])
    plot_violin(sequence_sizes, nb_alleles, sys.argv[3])
    
    



if __name__ == "__main__":
     main()