File: superalignment.h

package info (click to toggle)
iqtree 1.6.12%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 12,140 kB
  • sloc: cpp: 111,752; ansic: 53,619; python: 242; sh: 195; makefile: 52
file content (293 lines) | stat: -rw-r--r-- 10,531 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
/***************************************************************************
 *   Copyright (C) 2009 by BUI Quang Minh   *
 *   minh.bui@univie.ac.at   *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/
#ifndef SUPERALIGNMENT_H
#define SUPERALIGNMENT_H

#include "alignment.h"


struct PartitionInfo {
	double cur_score;	// current log-likelihood
	double part_rate;	// partition heterogeneity rate
	int    evalNNIs;	// number of evaluated NNIs on subtree

	//DoubleVector null_score; // log-likelihood of each branch collapsed to zero
	//DoubleVector opt_score;  // optimized log-likelihood for every branch
	//DoubleVector nni1_score; // log-likelihood for 1st NNI for every branch
	//DoubleVector nni2_score; // log-likelihood for 2nd NNI for every branch

	vector<DoubleVector> cur_brlen;  // current branch lengths
	//DoubleVector opt_brlen;  // optimized branch lengths for every branch
	vector<DoubleVector> nni1_brlen; // branch length for 1st NNI for every branch
	vector<DoubleVector> nni2_brlen; // branch length for 2nd NNI for every branch

	//double *mem_ptnlh; // total memory allocated for all pattern likelihood vectors
	double *cur_ptnlh; // current pattern likelihoods of the tree
	//double *nni1_ptnlh; // pattern likelihoods of 1st NNI tree
	//double *nni2_ptnlh; // pattern likelihoods of 2nd NNI tree
	NNIMove nniMoves[2];
};

class PhyloSuperTree;

/**
Super alignment representing presence/absence of sequences in
k partitions for a total of n sequences. It has the form:
		Site_1 Site_2 ... Site_k
Seq_1     1      0    ...   1
Seq_2     0      1    ...   0
...      ...
Seq_n     1      1    ...   0

Where (i,j)=1 means Seq_i is present in partition j, 0 otherwise

So data is binary.

	@author BUI Quang Minh <minh.bui@univie.ac.at>
*/

class SuperAlignment : public Alignment
{
public:
	/** constructor initialize from a supertree */
    SuperAlignment(Params &params);

	/** constructor initialize empty alignment */
    SuperAlignment();

    /** destructor */
    ~SuperAlignment();

    void init(StrVector *sequence_names = NULL);
    
    /** return that this is a super-alignment structure */
	virtual bool isSuperAlignment() { return true; }

    /** read partition model file */
    void readPartition(Params &params);
    
    /** read RAxML-style partition file */
    void readPartitionRaxml(Params &params);
    
    /** read partition model file in NEXUS format into variable info */
    void readPartitionNexus(Params &params);
    
    void printPartition(const char *filename);
    
    void printPartitionRaxml(const char *filename);
    
    void printBestPartition(const char *filename);
    void printBestPartitionRaxml(const char *filename);

	/**
	 * create taxa_index from super-alignment to sub-alignment
	 * @param part index of sub-alignment
	 */
	void linkSubAlignment(int part);

	/**
	 * @param pattern_index (OUT) vector of size = alignment length storing pattern index of all sites
	 * the index of sites in 2nd, 3rd,... genes have to be increased by the number of patterns in previous genes
	 * so that all indices are distinguishable
	*/
	virtual void getSitePatternIndex(IntVector &pattern_index);

	/**
	 * @param freq (OUT) vector of site-pattern frequencies for all sub-alignments
	*/
	virtual void getPatternFreq(IntVector &pattern_freq);

    /**
     * @param[out] freq vector of site-pattern frequencies
     */
    virtual void getPatternFreq(int *freq);

    /**
        Print all site information to a file
        @param filename output file name
    */
    virtual void printSiteInfo(const char* filename);


    /**
            extract sub-alignment of a sub-set of sequences
            @param aln original input alignment
            @param seq_id ID of sequences to extract from
            @param min_true_cher the minimum number of non-gap characters, true_char<min_true_char -> delete the sequence
            @param min_taxa only keep alignment that has >= min_taxa sequences
            @param[out] kept_partitions (for SuperAlignment) indices of kept partitions
     */
    virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa = 0, IntVector *kept_partitions = NULL);

    /**
     * remove identical sequences from alignment
     * @param not_remove name of sequence where removal is avoided
     * @param keep_two TRUE to keep 2 out of k identical sequences, false to keep only 1
     * @param removed_seqs (OUT) name of removed sequences
     * @param target_seqs (OUT) corresponding name of kept sequence that is identical to the removed sequences
     * @return this if no sequences were removed, or new alignment if at least 1 sequence was removed
     */
    virtual Alignment *removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs);


    /*
        check if some state is absent, which may cause numerical issues
        @param msg additional message into the warning
        @return number of absent states in the alignment
    */
    virtual int checkAbsentStates(string msg);

	/**
		Quit if some sequences contain only gaps or missing data
	*/
	//virtual void checkGappySeq(bool force_error = true);

	/**
		create a non-parametric bootstrap alignment by resampling sites within partitions
		@param aln input alignment
		@param pattern_freq (OUT) if not NULL, will store the resampled pattern frequencies
        @param spec bootstrap specification of the form "l1:b1,l2:b2,...,lk:bk"
            	to randomly draw b1 sites from the first l1 sites, etc. Note that l1+l2+...+lk
            	must equal m, where m is the alignment length. Otherwise, an error will occur.
            	If spec == NULL, a standard procedure is applied, i.e., randomly draw m sites.
	*/
	virtual void createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq = NULL, const char *spec = NULL);

	/**
		resampling pattern frequency by a non-parametric bootstrap 
		@param pattern_freq (OUT) resampled pattern frequencies
        @param spec bootstrap specification, see above
	*/
	virtual void createBootstrapAlignment(IntVector &pattern_freq, const char *spec = NULL);

	/**
		resampling pattern frequency by a non-parametric bootstrap
		@param pattern_freq (OUT) resampled pattern frequencies
        @param spec bootstrap specification, see above
        @param rstream random generator stream, NULL to use the global randstream
	*/
	virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL, int *rstream = NULL);

	/**
	 * shuffle alignment by randomizing the order of sites over all sub-alignments
	 */
	virtual void shuffleAlignment();

	/**
		compute the observed (Hamming) distance (number of different pairs of positions per site)
			between two sequences
		@param seq1 index of sequence 1
		@param seq2 index of sequence 2
		@return the observed distance between seq1 and seq2 (between 0.0 and 1.0)
	*/
	virtual double computeObsDist(int seq1, int seq2);

	/**
		compute the Juke-Cantor corrected distance between 2 sequences over all partitions
		@param seq1 index of sequence 1
		@param seq2 index of sequence 2		
		@return any distance between seq1 and seq2
	*/
	virtual double computeDist(int seq1, int seq2);

	/**
	 * print the super-alignment to a file
	 * @param filename
	 * @param append TRUE to append to this file, false to write new file
	 */
	void printCombinedAlignment(const char *filename, bool append = false);

    /**
	 * print the super-alignment to a stream
	 * @param out output stream
	 * @param print_taxid true to print taxa IDs instead of names, default: false
	 */
    
	void printCombinedAlignment(ostream &out, bool print_taxid = false);

	/**
	 * print all sub alignments into files with prefix, suffix is the charset name
	 * @param prefix prefix of output files
	 */
	void printSubAlignments(Params &params);

	/**
		@return unconstrained log-likelihood (without a tree)
	*/
	virtual double computeUnconstrainedLogL();

	/**
	 * @return proportion of missing data in super alignment
	 */
	double computeMissingData();

	/**
	 * build all patterns of super alignent from partitions and taxa_index
	 * it is in form of a binary alignment, where 0 means absence and 1 means presence
	 * of a gene in a sequence
	 */
	void buildPattern();

    /**
            count the fraction of constant sites in the alignment, update the variable frac_const_sites
     */
    virtual void countConstSite();

    /**
     * 	@return number of states, if it is a partition model, return max num_states across all partitions
     */
    virtual int getMaxNumStates() {
    	return max_num_states;
    }

    /** order pattern by number of character states and return in ptn_order
    */
    virtual void orderPatternByNumChars(int pat_type);

	/**
		actual partition alignments
	*/
	vector<Alignment*> partitions;

	/**
		matrix represents the index of taxon i in partition j, -1 if the taxon is not present
	*/
	vector<IntVector> taxa_index;

	/** maximum number of states across all partitions */
	int max_num_states;

	/**
	 * concatenate subset of alignments
	 * @param ids IDs of sub-alignments
	 * @return concatenated alignment
	 */
    Alignment *concatenateAlignments(set<int> &ids);

	/**
	 * concatenate all alignments
	 * @return concatenated alignment
	 */
    Alignment *concatenateAlignments();


};

#endif