File: align_read.py

package info (click to toggle)
igor 1.4.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 4,116 kB
  • sloc: cpp: 12,453; python: 1,047; sh: 124; makefile: 33
file content (111 lines) | stat: -rw-r--r-- 4,066 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#      Author: Quentin Marcou
#
#  This source code is distributed as part of the IGoR software.
#  IGoR (Inference and Generation of Repertoires) is a versatile software to
#  analyze and model immune receptors generation, selection, mutation and all
#  other processes.
#   Copyright (C) 2017  Quentin Marcou
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.

#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import re as regex

import numpy
import pandas

from ..utils.utils import get_str_asarray


def extract_best_aligns(aligns_df):
    """Keep, for every sequence, only its highest scoring alignment.

    Rows of ``aligns_df`` are grouped by ``seq_index`` and, within each group,
    the row carrying the maximum ``score`` is selected. When several rows of
    a group share the maximum score only the FIRST one is kept.

    The returned dataframe has a fresh integer index.

    """
    # idxmax gives the index label of the first occurrence of the maximum
    best_labels = aligns_df.groupby('seq_index')['score'].idxmax()
    best_rows = aligns_df.loc[best_labels]
    return best_rows.reset_index(drop=True)


def get_misinsdel_asarray(misinsdel_str):
    """Turn a string of comma separated indices into an array of integers.

    The work is delegated to ``get_str_asarray``, asking for ``int`` values,
    ``{`` / ``}`` as boundary characters and ``,`` as the separator.

    """
    parse_options = {
        "dtype": int,
        "boundaries_char": ["{", "}"],
        "sep": ',',
    }
    return get_str_asarray(misinsdel_str, **parse_options)


def read_alignments(filename):
    """Reads IGoR's alignments file as a pandas DataFrame.

    The file is parsed as a ';' delimited CSV. The ``insertions``,
    ``deletions`` and ``mismatches`` columns, which are stored as strings in
    the file, are each converted with ``get_misinsdel_asarray``.

    ``filename`` is passed unchanged to ``pandas.read_csv``. The converted
    dataframe is returned.

    """
    aligns = pandas.read_csv(filename, delimiter=';')
    # Each conversion only needs the value of a single column, so apply the
    # converter on that column directly rather than row by row on the whole
    # frame: this avoids building one Series per row and still yields a
    # Series when the file contains no alignment at all.
    # Convert the string of insertions into an array of integers
    aligns.insertions = aligns.insertions.apply(get_misinsdel_asarray)
    # Convert the string of deletions into an array of integers
    aligns.deletions = aligns.deletions.apply(get_misinsdel_asarray)
    # Convert the string of mismatches into an array of integers
    aligns.mismatches = aligns.mismatches.apply(get_misinsdel_asarray)

    return aligns


def read_best_alignments(filename):
    """Load only the top score alignment of each sequence from an IGoR
    alignments file, as a pandas DataFrame.

    The ``insertions``, ``deletions`` and ``mismatches`` columns are
    converted from strings with ``get_misinsdel_asarray``.

    """
    # Simple rather than efficient: the whole file is loaded before the best
    # alignments are picked.
    best = extract_best_aligns(pandas.read_csv(filename, delimiter=';'))

    # The three columns below are stored as strings in the file; turn each
    # of them into arrays of integers, one column at a time.
    best.insertions = best.insertions.apply(get_misinsdel_asarray)
    best.deletions = best.deletions.apply(get_misinsdel_asarray)
    best.mismatches = best.mismatches.apply(get_misinsdel_asarray)

    return best


# Import genomic template sequences
def read_FASTA_strings(filename):
    """Returns a dictionary whose entry keys are sequence labels and values
    are sequences.

    Every record starts at a '>' character. The text up to the first line
    break is used, unchanged, as the label; all following lines of the record
    are concatenated and upper-cased to form the sequence, so sequences
    wrapped over several lines are read in full. Anything before the first
    '>' is ignored. A record made of a label only maps to an empty string.
    If a label appears more than once the last record wins.

    """
    with open(filename) as fasta_file:
        records = fasta_file.read().split('>')

    sequences = {}
    # records[0] is whatever precedes the first '>' and is not a record
    for record in records[1:]:
        label, _, body = record.partition('\n')
        # Remove the line breaks so that multi-line sequences are joined
        sequences[label] = body.replace('\n', '').upper()
    return sequences


def has_mismatches(x):
    """Tell whether the SW alignment itself contains at least one mismatch.

    IGoR's inference module also uses mismatches lying outside the best SW
    alignment, hence some of the reported mismatches do not belong to the SW
    alignment. Only the mismatches located between ``5_p_align_offset`` and
    ``3_p_align_offset`` (both bounds included) are taken into account here.

    ``x`` must expose ``mismatches`` as an attribute and the two offsets as
    items.

    """
    positions = numpy.asarray(x.mismatches)
    not_before_start = positions >= x["5_p_align_offset"]
    not_after_end = positions <= x["3_p_align_offset"]
    # A mismatch counts only when both conditions hold for the same position
    return any(not_before_start & not_after_end)