File: K3000_paths_to_fa.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (88 lines) | stat: -rwxr-xr-x 3,283 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
'''
Creation of a fasta (FA) file from a file of compacted facts (in 'int_facts' format or not).
'''


__author__ = "Pierre Peterlongo"
__email__ = "pierre.peterlongo@inria.fr"

import sys
import K3000_common as kc





    
def generate_sequence_paths(sequences, compacted_fact_file_name, int_facts_format):
    '''
    Given a set of indexed sequences and a file of compacted facts, prints on stdout
    the corresponding fasta sequences (one ">header<TAB>positions" line followed by
    the sequence line, for each fact).
    Enables also to validate the compacted facts: a fact for which kc.line2seq
    answers that it must not be printed is skipped and counted, and a single
    warning giving this count is finally written on stderr.

    * sequences: the indexed sequences, given as is to kc.line2seq
    * compacted_fact_file_name: name of the file containing the compacted facts
    * int_facts_format: True if facts look like "38772_0;-21479_1;",
      False if they look like "-1000l_0;-136l_-24;" (also given as is to kc.line2seq)
    '''
    nb_non_writen = 0
    # the 'with' statement closes the file even if kc.line2seq or print raises
    with open(compacted_fact_file_name) as mfile:
        for line in mfile:
            if line[0] == "#": continue  # comment line
            # * int_facts_format:
            #   38772_0;-21479_1;27388_3;-494_28;-45551_36;-11894_10;-50927_7;-66981_10;29405_22;34837_1;20095_5;
            # * not int_facts_format:
            #  -1000l_0;-136l_-24;-254h_-18;493l_-16;  -577h_0;-977h_-26;1354h_-25;  =>  1

            # in case a line contains an abundance, it is indicated by "  =>  1": keep only what precedes the '='
            # from  "-1000l_0;-136l_-24;-254h_-18;493l_-16;  -577h_0;-977h_-26;1354h_-25;  =>  1"
            # to    "-1000l_0;-136l_-24;-254h_-18;493l_-16;  -577h_0;-977h_-26;1354h_-25;"
            line = line.split("=")[0].strip()

            # if the input fact contains a space it means that this is a paired fact, each part is treated
            # (an empty line gives no part at all and is thus silently ignored)
            for fact_as_ids in line.split():
                # NOTE(review): the meaning of the last argument (0) is defined in K3000_common.line2seq
                toprint, header, bubble_facts_position_start_stops, full_seq = kc.line2seq(fact_as_ids, sequences, int_facts_format, 0)
                if toprint:
                    print(f">{header}\t{bubble_facts_position_start_stops}\n{full_seq}")
                else:
                    nb_non_writen += 1

    if nb_non_writen > 0:
        sys.stderr.write("Warning, "+str(nb_non_writen)+" facts were removed as their sequence concatenation were not coherent or because they contained non coherent predictions\n")

def is_int_fact(file_name):
    '''
    Detects the format of the facts stored in a file, looking only at its first non comment line.
    Returns True for the 'int_facts' format, e.g.
    2468_0;-2708_6;1954_-25;1154_-26;
    and False when the first fact of that line contains a 'h' or a 'l', e.g.
    -577h_0;-977h_-26;1354h_-25;  =>  1
    Raises a RuntimeError if the file contains no such line, or if that line contains no ';'
    '''
    with open(file_name) as fact_file:
        for line in fact_file:
            if line.startswith("#"):
                continue  # comment
            if ';' not in line:
                raise RuntimeError (f'input file {file_name} does not contain correctly formated facts. The line {line} should contain at least a \';\'')
            # only the first fact of the line (first part of a paired fact) is inspected
            first_fact = line.split()[0]
            return not any(marker in first_fact for marker in ('h', 'l'))
    # end of file reached without having seen any fact
    raise RuntimeError (f'input file {file_name} does not contain correctly formated facts')



def main():
    '''
    Command line entry point: creation of a FA file from a compacted facts file.
    * sys.argv[1] and sys.argv[2]: files indexed with kc.index_sequences (the second call receives the index built by the first one)
    * sys.argv[3]: file of compacted facts, whose format is detected with is_int_fact
    The resulting sequences are printed by generate_sequence_paths.
    '''
    argv = sys.argv
    # for each snp id: sequences[snp_id]=[left_unitig_len, right_unitig_len, upperseq, lowerseq]
    sequences = kc.index_sequences(argv[1])
    sequences = kc.index_sequences(argv[2], sequences)
    facts_file_name = argv[3]
    generate_sequence_paths(sequences, facts_file_name, is_int_fact(facts_file_name))
    



# Run the conversion only when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()