File: ClassifyVariant.h

package info (click to toggle)
tvc 5.0.3%2Bgit20151221.80e144e%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 3,548 kB
  • sloc: cpp: 24,088; ansic: 3,933; python: 260; makefile: 16
file content (226 lines) | stat: -rw-r--r-- 8,252 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
/* Copyright (C) 2012 Ion Torrent Systems, Inc. All Rights Reserved */

//! @file     ClassifyVariant.h
//! @ingroup  VariantCaller
//! @brief    HP Indel detection


#ifndef CLASSIFYVARIANT_H
#define CLASSIFYVARIANT_H

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <vector>
#include <math.h>
#include <ctype.h>
#include <algorithm>


#include "sys/types.h"
#include "sys/stat.h"
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

#include <Variant.h>


#include "MiscUtil.h"
#include "LocalContext.h"
#include "InputStructures.h"
#include "ExtendParameters.h"
//@TODO: remove when moving SSE detector
#include "VariantAssist.h"

using namespace std;


class VarButton {
  public:
  // Flag set describing the identity and sub-classification of one allele.
  // Every flag starts out false; code outside this class switches them on.

  // -- substitutions --
  bool isSNP;              // Single base substitution
  bool isMNV;              // Multiple base substitution
  bool isPaddedSNP;        // MNV that is in fact an anchored or padded SNP.

  // -- insertions / deletions --
  bool isIndel;            // Anchor base + one or more copies of the same base in the longer allele
  bool isInsertion;        // Alternative allele longer than reference allele
  bool isDeletion;         // Alternative allele shorter than reference allele
  bool isHPIndel;          // InDel occurs in a reference HP of length > 1
  bool isDyslexic;         // NOTE(review): not described in this header; presumably tied to
                           // AlleleIdentity::IdentifyDyslexicMotive -- confirm in the .cpp

  // Retired flags, kept for reference only:
  //bool isComplex;          // A complex allele is anything but snp, mnv and Indel
  //bool isComplexHP;        // This complex allele involves a ref. HP of length > 1

  // -- variant-level state --
  bool isHotSpot;           // Signifies a hotspot variant (set per variant for all alleles, regardless of their specific origin)
  bool isProblematicAllele; // There is something wrong with this allele, we should filter it.
  bool doRealignment;       // Switch to turn realignment on or off

  // Clears every flag. Initializers are listed in declaration order.
  VarButton()
    : isSNP(false),
      isMNV(false),
      isPaddedSNP(false),
      isIndel(false),
      isInsertion(false),
      isDeletion(false),
      isHPIndel(false),
      isDyslexic(false),
      isHotSpot(false),
      isProblematicAllele(false),
      doRealignment(false)
  {}
};

// ------------------------------------------------------------------------

// example variants:
// VarButton isSNP true
//  genome       44 45 46 47 48  49 (0 based)
//  ref is        A  A  A  A  T  T
//  alt is                    A
//  altAllele is              A
//  left_anchor               0 (always)
//  right_anchor              0 (always)
//  inDelLength               0 (always)
//  ref_hp_length             2 (>=1) -- T T starting at 48
//  start_window  <=48 -- calculated as min over all alt Alleles
//  end_window    >=49 -- calculated as max over all alt Alleles

// VarButton isMNV true
//  genome       44 45 46 47 48  49 (0 based)
//  ref is        A  A  A  A  T  T
//  alt is              A  G  C
//  altAllele is        A  G  C
//  left_anchor         1
//  right_anchor                 0
//  inDelLength         0 (always)
//  ref_hp_length       2 (>=1 always) -- T T starting at 48
//  start_window   
//  end_window     

//  VarButton isIndel true, isDeletion false
//  genome       42 43 44 45 46 47 48 49 50 51 52   (0 based)
//  ref is        C  A  A  A  A  T  G  T  A  A  A
//  alt is                       d  C  G
//  altAllele is                 T  C  G 
//  left_anchor                  1
//  right_anchor                 0
//  inDelLength                  2
//  ref_hp_length                1 (G at 49)
//  start_window
//  end_window


// VarButton isIndel false, isInsertion true
//  genome       42 43 44 45 46 47  48 49 50 51 52   (0 based)
//  ref is        C  C  A  A  A  A   T  G  T  A  A  A
//  alt is                       G  GC
//  altAllele is                 G  G  C
//  left_anchor                  0
//  right_anchor                 0
//  inDelLength                  3
//  ref_hp_length                4  (A at 47)
//  start_window
//  end_window


//! @brief Per-allele classification record: status flags, the alternative
//!        allele string, anchor/window bookkeeping and filter information.
//!
//! The inline predicates only read the status flags and are therefore const.
//! All other member functions are defined out of line; their behavior is not
//! visible from this header and is intentionally not described here.
class AlleleIdentity {
  public:
    VarButton     status;     //!< A bunch of flags saying what's going on with this allele
    string        altAllele;  //!< May contain left and/or right anchor bases, cannot be empty
    int           DEBUG;      //!< Debug level, 0 after construction

    // useful context
    int left_anchor;          //!< Number of left bases that are common between the ref. and alt. allele
    int right_anchor;         //!< Number of right bases that are common between the ref. and alt. allele
                              //   left_anchor + right_anchor <= shorter allele length
    int inDelLength;          //!< Difference in length between longer and shorter allele
    int ref_hp_length;        //!< First base change is occurring in an HP of length ref_hp_length
    int start_window;         //!< Start of window of interest for this variant
    int end_window;           //!< End of window of interest for this variant

    // need to know when I do filtering
    float  sse_prob_positive_strand;
    float  sse_prob_negative_strand;
    vector<string> filterReasons;

    bool indelActAsHPIndel;   //!< Switch to make all indels match HPIndel behavior

    //! Zeroes every counter, window bound and probability and clears the
    //! indelActAsHPIndel switch. status, altAllele and filterReasons are
    //! default-constructed.
    AlleleIdentity()
      : DEBUG(0),
        left_anchor(0),
        right_anchor(0),
        inDelLength(0),
        ref_hp_length(0),
        start_window(0),
        end_window(0),
        // filterable statuses
        sse_prob_positive_strand(0),
        sse_prob_negative_strand(0),
        indelActAsHPIndel(false)
    {}

    //! @return true when the allele is flagged as an indel but not as an HP indel.
    bool Ordinary() const {
      return status.isIndel && !status.isHPIndel;
    }

    //! @return true for SNPs and padded SNPs. Indels not flagged as HP indels
    //!         also count, unless indelActAsHPIndel is set.
    bool ActAsSNP() const {
      if (status.isSNP || status.isPaddedSNP)
        return true;
      // Non-HP indels are only treated like SNPs while the switch is off.
      return !indelActAsHPIndel && status.isIndel && !status.isHPIndel;
    }

    //! @return true when the allele is flagged as an MNV.
    bool ActAsMNP() const {
      return status.isMNV;
    }

    //! @return true for indels flagged as HP indels; with indelActAsHPIndel
    //!         set, true for every indel.
    bool ActAsHPIndel() const {
      return status.isIndel && (indelActAsHPIndel || status.isHPIndel);
    }

    // ---- Out-of-line members (definitions are not part of this header) ----

    //void DetectPotentialCorrelation(const LocalReferenceContext &reference_context);
    bool SubCategorizeInDel(const LocalReferenceContext &reference_context,
                            const ReferenceReader &ref_reader, int chr_idx);
    void IdentifyHPdeletion(const LocalReferenceContext& reference_context);
    void IdentifyHPinsertion(const LocalReferenceContext& reference_context,
        const ReferenceReader &ref_reader, int chr_idx);
    bool IdentifyDyslexicMotive(char base, int position,
        const ReferenceReader &ref_reader, int chr_idx);

    void SubCategorizeSNP(const LocalReferenceContext &reference_contextl);
    void SubCategorizeMNP(const LocalReferenceContext &reference_contextl);
    bool getVariantType(const string _altAllele, const LocalReferenceContext &reference_context,
                        const TIonMotifSet & ErrorMotifs,
                        const ClassifyFilters &filter_variant,
                        const ReferenceReader &ref_reader,
                        int chr_idx);
    bool CharacterizeVariantStatus(const LocalReferenceContext &reference_context,
                                   const ReferenceReader &ref_reader, int chr_idx);
    bool CheckValidAltAllele(const LocalReferenceContext &reference_context);
    //void ModifyStartPosForAllele(int variantPos);

    bool IdentifyMultiNucRepeatSection(const LocalReferenceContext &seq_context, unsigned int rep_period,
        const ReferenceReader &ref_reader, int chr_idx);
    // Note: the DEBUG parameter below shadows the DEBUG data member.
    void CalculateWindowForVariant(const LocalReferenceContext &seq_context, int DEBUG,
        const ReferenceReader &ref_reader, int chr_idx);

    void DetectCasesToForceNoCall(const LocalReferenceContext &seq_context, const ClassifyFilters &filter_variant,
        const VariantSpecificParams& variant_specific_params);
    void DetectLongHPThresholdCases(const LocalReferenceContext &seq_context, int maxHPLength);
    void DetectNotAVariant(const LocalReferenceContext &seq_context);
    void PredictSequenceMotifSSE(const LocalReferenceContext &reference_context, const TIonMotifSet & ErrorMotifs,
                                 const ReferenceReader &ref_reader, int chr_idx);
};




#endif //CLASSIFYVARIANT_H