File: compute_pi.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (47 lines) | stat: -rw-r--r-- 1,660 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import sys
"""
Given a fasta file containing n sequences having all the same length
Returns the pi value https://en.wikipedia.org/wiki/Nucleotide_diversity


1/ stores the n sequences
2/ computes for all i<j in [1,n] the pi_i,j value
3/ computes and prints the pi value
"""


def stores_fasta_sequences(fasta_file_name):
    sequences=[]
    with  open(fasta_file_name) as fasta_file:
        while True: 
            line = fasta_file.readline()
            if not line: break                  # end of the file
            line = line.strip()
            if line[0] == ">": continue         # comment
            sequences.append(line)
    if len(sequences) == 0: return None
    l = len(sequences[0])
    for sequence in sequences:
        if len(sequence) != l: 
            sys.stderr.write(f"File {fasta_file_name} contains sequences of distinct sizes, unable to compute pi\n")
            sys.exit(1)
    return sequences
    

def main():
    if len(sys.argv) !=2:
        sys.stderr.write(f"Usage: {sys.argv[0]} fasta_file_name\n")
        sys.stderr.write("Given a fasta file containing n sequences having all the same length\n")
        sys.stderr.write("Returns the pi value https://en.wikipedia.org/wiki/Nucleotide_diversity\n")
        sys.stderr.write(" 1/ stores the n sequences\n")
        sys.stderr.write(" 2/ computes for all i<j in [1,n] the pi_i,j value\n")
        sys.stderr.write(" 3/ computes and prints the pi value\n")
    
    sequences = stores_fasta_sequences(sys.argv[1])
    if not sequences :
        sys.stderr.write(f"File {sys.argv[1]} contains no sequence\n")
    print (sequences)

if __name__ == '__main__':
    main()