File: Aligner.h

package info (click to toggle)
igor 1.4.0%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,124 kB
  • sloc: cpp: 12,453; python: 1,047; sh: 124; makefile: 32
file content (185 lines) | stat: -rw-r--r-- 11,846 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/*
 * Aligner.h
 *
 *  Created on: Feb 16, 2015
 *      Author: Quentin Marcou
 *
 *  This source code is distributed as part of the IGoR software.
 *  IGoR (Inference and Generation of Repertoires) is a versatile software to analyze and model immune receptors
 *  generation, selection, mutation and all other processes.
 *   Copyright (C) 2017  Quentin Marcou
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.

 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef ALIGNER_H_
#define ALIGNER_H_

#include <forward_list>
#include <list>
#include <unordered_map>
#include <set>
#include <utility>
#include <fstream>
#include <algorithm>
#include <iostream>
#include "Utils.h"
#include <omp.h>
#include <stdexcept>
#include <random>
#include <chrono>

#include "IntStr.h"


/**
 * \class Alignment_data Aligner.h
 * \brief Stores information on the alignment of one genomic template against the target
 * \author Q.Marcou
 * \version 1.0
 *
 * Stores information on the alignment of one genomic template against the target.
 * It contains:
 * - the gene name
 * - the offset of the alignment (index on the target sequence on which the first letter of the FULL genomic template aligns (can be negative or lie outside the target)
 * - 5' and 3' offset positions of the best alignment first and last aligned nucleotide
 * - insertions : indices on the TARGET of inserted nucleotides
 * - deletions : indices on the GENOMIC TEMPLATE of deleted nucleotides
 * - alignment length
 * - list of mismatches (that lie event outside the best alignment to allow IGoR to know mismatch positions in advance while exploring different deletions numbers)
 * - the alignment score
 */
struct Alignment_data {
	std::string gene_name;
	int offset;
	size_t five_p_offset;
	size_t three_p_offset;
	std::forward_list<int> insertions; //gap in the genomic sequence
	std::forward_list<int> deletions; //gap in the data sequence
	size_t align_length;
	mutable std::vector<int> mismatches;
	double score;

	Alignment_data(std::string gene , int off): gene_name(gene) , offset(off) , insertions(*(new std::forward_list<int>)) , deletions(*(new std::forward_list<int>)) , score(0) {}
	Alignment_data(int off, size_t five_p_off , size_t three_p_off , size_t align_len , std::forward_list<int> ins , std::forward_list<int> del , std::vector<int> mis , double alignment_score): gene_name(std::string()) , offset(off) , five_p_offset(five_p_off) , three_p_offset(three_p_off) , insertions(ins) , deletions(del) , align_length(align_len) , mismatches(mis) , score(alignment_score) {}
	Alignment_data(std::string gene , int off , size_t align_len , std::forward_list<int> ins , std::forward_list<int> del , std::vector<int> mis , double alignment_score): gene_name(gene) , offset(off) , insertions(ins) , deletions(del) , align_length(align_len) , mismatches(mis) , score(alignment_score) {}
	Alignment_data(std::string gene , int off, size_t five_p_off , size_t three_p_off , size_t align_len , std::forward_list<int> ins , std::forward_list<int> del , std::vector<int> mis , double alignment_score): gene_name(gene) , offset(off) , five_p_offset(five_p_off) , three_p_offset(three_p_off) , insertions(ins) , deletions(del) , align_length(align_len) , mismatches(mis) , score(alignment_score) {}

/*	bool operator<(const Alignment_data& align){
		//Hardcode to get the alignments in descending order using sort()
		return this->score > align.score;
	}*/

};

/**
 * \class Aligner Aligner.h
 * \brief A modified Smith-Waterman alignment class
 * \author Q.Marcou
 * \version 1.0
 *
 * The Aligner class allows to perform SW alignments according to the parameters (substitution matrix,gap penalty) supplied upon construction of the object.
 * The SW alignments matrix has been altered for V and J in order to allow for deletions on the deleted side only.
 * Alignments can be made in parallel using openMP
 *
 */
class Aligner {
public:
	Aligner();
	Aligner(Matrix<double>,int,Gene_class);
	virtual ~Aligner();

	// Single sequence alignments methods
	std::forward_list<Alignment_data> align_seq(std::string , double , bool , int , int, bool=false);
	std::forward_list<Alignment_data> align_seq(std::string , double , bool , int , int, std::set<std::string> , bool=false);
	std::forward_list<Alignment_data> align_seq(std::string , double , bool , bool , int , int, bool=false);
	std::forward_list<Alignment_data> align_seq(std::string , double , bool , bool , int , int, std::set<std::string> , bool=false);
	std::forward_list<Alignment_data> align_seq(std::string , double , bool , std::unordered_map<std::string,std::pair<int,int>>, bool=false);
	std::forward_list<Alignment_data> align_seq(std::string , double , bool , std::unordered_map<std::string,std::pair<int,int>>, std::set<std::string> , bool=false);
	std::forward_list<Alignment_data> align_seq(std::string , double , bool , bool , std::unordered_map<std::string,std::pair<int,int>>, bool=false);
	std::forward_list<Alignment_data> align_seq(std::string , double , bool , bool , std::unordered_map<std::string,std::pair<int,int>> , std::set<std::string> , bool=false);

	// Multiple sequences alignments methods
	std::unordered_map<int,std::forward_list<Alignment_data>> align_seqs(std::vector<std::pair<const int , const std::string>> , double , bool);
	std::unordered_map<int,std::forward_list<Alignment_data>> align_seqs(std::vector<std::pair<const int , const std::string>> , double , bool , bool);
	std::unordered_map<int,std::forward_list<Alignment_data>> align_seqs(std::vector<std::pair<const int , const std::string>> , double , bool , int , int, bool=false);
	std::unordered_map<int,std::forward_list<Alignment_data>> align_seqs(std::vector<std::pair<const int , const std::string>> , double , bool , bool , int , int, bool=false);
	std::unordered_map<int,std::forward_list<Alignment_data>> align_seqs(std::vector<std::pair<const int , const std::string>> , double , bool , std::unordered_map<std::string,std::pair<int,int>>, bool=false);
	std::unordered_map<int,std::forward_list<Alignment_data>> align_seqs(std::vector<std::pair<const int , const std::string>> , double , bool , bool , std::unordered_map<std::string,std::pair<int,int>>, bool=false);
	void align_seqs( std::string , std::vector<std::pair<const int , const std::string>> , double , bool );
	void align_seqs( std::string , std::vector<std::pair<const int , const std::string>> , double , bool , bool );
	void align_seqs( std::string , std::vector<std::pair<const int , const std::string>> , double , bool , int , int, bool=false);
	void align_seqs( std::string , std::vector<std::pair<const int , const std::string>> , double , bool , bool , int , int, bool=false);
	void align_seqs( std::string , std::vector<std::pair<const int , const std::string>> , double , bool , std::unordered_map<std::string,std::pair<int,int>>, bool=false);
	void align_seqs( std::string , std::vector<std::pair<const int , const std::string>> , double , bool , bool , std::unordered_map<std::string,std::pair<int,int>>, bool=false);

	//I/O related methods
	void write_alignments_seq_csv(std::string , std::unordered_map<int,std::forward_list<Alignment_data>>);
	std::unordered_map<int,std::forward_list<Alignment_data>> read_alignments_seq_csv(std::string , double , bool);

	void set_genomic_sequences(std::vector< std::pair<std::string,std::string> >);
	int incorporate_in_dels( std::string& , std::string& , const std::forward_list<int> , const std::forward_list<int> , int );


private:
	std::forward_list<std::pair<std::string,std::string>> nt_genomic_sequences;
	std::forward_list<std::pair<std::string,Int_Str>> int_genomic_sequences;
	Matrix<double> substitution_matrix;
	int gap_penalty;
	Gene_class gene;
	bool local_align;
	bool flip_seqs;
	void sw_align_common(const Int_Str& ,const Int_Str& ,const int,const int , Matrix<double>& , Matrix<int>& , Matrix<int>& , Matrix<int>& , std::vector<int>& ,  std::vector<int>& , std::vector<int>&);
	std::list<std::pair<int,Alignment_data>> sw_align(const Int_Str& ,const Int_Str& , double , bool , int , int);
	std::unordered_map<std::string,std::pair<int,int>> build_genomic_bounds_map(int,int) const;

};


std::unordered_map<int,std::pair<std::string,std::unordered_map<Gene_class,std::vector<Alignment_data>>>> read_alignments_seq_csv(std::string , Gene_class , double , bool , std::vector<std::pair<const int,const std::string>>);
std::unordered_map<int,std::pair<std::string,std::unordered_map<Gene_class,std::vector<Alignment_data>>>> read_alignments_seq_csv(std::string , Gene_class , double , bool , std::vector<std::pair<const int,const std::string>>, std::unordered_map<int,std::pair<std::string,std::unordered_map<Gene_class,std::vector<Alignment_data>>>>);
std::unordered_map<int,std::pair<std::string,std::unordered_map<Gene_class,std::vector<Alignment_data>>>> read_alignments_seq_csv_score_range(std::string , Gene_class , double , bool , std::vector<std::pair<const int,const std::string>>);
std::unordered_map<int,std::pair<std::string,std::unordered_map<Gene_class,std::vector<Alignment_data>>>> read_alignments_seq_csv_score_range(std::string , Gene_class , double , bool , std::vector<std::pair<const int,const std::string>>, std::unordered_map<int,std::pair<std::string,std::unordered_map<Gene_class,std::vector<Alignment_data>>>>);
std::vector<std::tuple<int,std::string,std::unordered_map<Gene_class,std::vector<Alignment_data>>>> map2vect (std::unordered_map<int,std::pair<std::string,std::unordered_map<Gene_class,std::vector<Alignment_data>>>>);
std::forward_list<std::pair<const int,const std::string>> read_indexed_seq_csv(std::string);
std::vector<std::pair<const int , const std::string>> read_indexed_csv(std::string);
std::vector<std::pair<const int,const std::string>> read_fasta(std::string);
std::vector<std::pair<std::string,std::string>> read_genomic_fasta(std::string);
std::vector<std::pair<const int,const std::string>> read_txt(std::string);
std::unordered_map<std::string,size_t> read_gene_anchors_csv(std::string,std::string separator= ";");
std::unordered_map<std::string,std::pair<int,int>> read_template_specific_offset_csv(std::string,std::string separator= ";");
void write_indexed_seq_csv(std::string , std::vector<std::pair<const int,const std::string>>);
Int_Str nt2int(std::string);
bool comp_nt_int(const int& , const int&);
std::list<Int_nt> get_ambiguous_nt_list(const Int_nt&);
inline void write_single_seq_alignment( std::ofstream& , int , std::forward_list<Alignment_data> );
//Compare alignments (sort by score)
bool align_compare(Alignment_data , Alignment_data );
std::vector<std::pair<const int , const std::string>> sample_indexed_seq( std::vector<std::pair<const int , const std::string>>,const size_t);
Matrix<double> read_substitution_matrix(const std::string& , std::string sep=",");
std::tuple<bool,int,int> extract_min_max_genomic_templates_offsets(const std::unordered_map<std::string,std::pair<int,int>>& genomic_offset_bounds);
std::forward_list<Alignment_data> extract_best_gene_alignments(const std::forward_list<Alignment_data>&);

/*
	namespace substitution_matrices{
		//from: ftp://ftp.ncbi.nih.gov/blast/matrices/NUC.4.4
		static Matrix<int> nuc44_sub_matrix(4,4,{5,-4,-4,-4 , -4 ,5,-4,-4 , -4,-4,5,-4 , -4,-4,-4,5});


	}
	*/



#endif /* ALIGNER_H_ */