File: get_lineage_positions.py

package info (click to toggle)
python-pangolearn 2022-07-09%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 184,720 kB
  • sloc: python: 801; sh: 77; makefile: 16
file content (52 lines) | stat: -rw-r--r-- 1,841 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import csv
import pickle
import collections
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.AlignInfo import SummaryInfo

## outputs a dataframe of positions to input in training

def get_lineage_cns50_sites(lineage, lineage_taxa, sequences_index, reference_sequence): 
            
    ### output of list of sequence_IDs for the given lineage
    
    lineage_seqs = MultipleSeqAlignment([])
    for taxon in lineage_taxa:
        lineage_seqs.append(sequences_index[taxon])

    info = SummaryInfo(lineage_seqs)
    consensus_sequence =  info.gap_consensus(
    threshold=0.50, 
    ambiguous='N')
    
    nuc_position = []
    for i in range(len(consensus_sequence)):
        if reference_sequence[i] != "N":
            if consensus_sequence[i] != "N":
                if consensus_sequence[i] != reference_sequence[i]:
                    nuc_position.append(i)
    return nuc_position

def get_relevant_positions(designation_file,seq_file,ref_file,outfile):
    reference = SeqIO.read(ref_file, "fasta")
    sequences_index = SeqIO.index(seq_file, "fasta")
    lineage_designations = collections.defaultdict(list)
    lineage_set = set()

    with open(designation_file, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            lineage_designations[row["lineage"]].append(row["sequence_name"])
            lineage_set.add(row["lineage"])
    
    final_positions = set()
    for lineage in lineage_set:
        print(f"Getting positions for lineage {lineage}")
        positions = get_lineage_cns50_sites(lineage, lineage_designations[lineage], sequences_index, reference)
        print(f"\tFound {len(positions)}")
        for i in positions:
            final_positions.add(i)

    with open(outfile, 'wb') as pickle_file:
        pickle.dump(final_positions, pickle_file)