File: tracediff_mutations.cpp

package info (click to toggle)
staden 2.0.0%2Bb11-5
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 21,568 kB
  • sloc: ansic: 240,605; tcl: 65,360; cpp: 12,854; makefile: 11,201; sh: 2,952; fortran: 2,033; perl: 63; awk: 46
file content (468 lines) | stat: -rw-r--r-- 14,079 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
/*
 * Copyright (c) Medical Research Council 2001. All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * this copyright and notice appears in all copies.
 *
 * This file was written as part of the Staden Package at the MRC Laboratory
 * of Molecular Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom.
 *
 * MRC disclaims all warranties with regard to this software.
 *
 */


#include <cassert>
#include <cstdio>       // For std::printf(), debugging
#include <cstdlib>      // For std::abs()
#include <mutlib.h>
#include <list.hpp>
#include <array.hpp>
#include <trace.hpp>
#include <muttag.hpp>
#include <peakcall.hpp>
#include <tracediff_parameters.hpp>


//#define VERBOSE_DEBUG


void TraceDiffFindPotentialMutations( Trace& DiffTrace, mutlib_strand_t nStrand,
   int nBaseInterval, int nPosition, int nNoiseThreshold, int nPeakAlignmentThreshold,
   int nPeakTooWideThreshold, double nGlobalMean, List<MutTag>& Mutation )
{
/*
    Forms the core of the double-peak mutation scanning algorithm which is called
    once for each base. We look for double peaks above and below the mean, then
    apply a series of filters to remove the obviously invalid candidates. Each
    candidate is added as a mutation tag to the tag list. Subsequent phases
    whittle these down some.
*/
    int          nP1;
    int          nP2;
    int          x[2];
    int          a[2];
    int          l[2];
    int          r[2];
    int          p[2];
    MutTag*      pTag;
    int          nPeaks;
    int          nMeasurementThreshold;
    PeakCall     PosPeakCall;
    PeakCall     NegPeakCall;
    peak_call_t& PosPeak = PosPeakCall.Data;
    peak_call_t& NegPeak = NegPeakCall.Data;
    MutTag       TmpTag( "MUTA", MUTLIB_MUTATION_NONE, nPosition, nStrand );



    // Set search window limits
    DiffTrace.WindowCentredAt( nPosition, static_cast<int>(nBaseInterval*1.4), l[0], r[0] );



    // Search each trace for +ve and -ve peaks
    for( int j=0; j<4; j++ )
    {
        // No peaks found yet
        PosPeak.Position[j] = -1;
        NegPeak.Position[j] = -1;


        // Scan for nicely formed positive and negative peaks.
        x[0] = DiffTrace.PosPeakFindLargest( j, l[0], r[0], nPeaks, 2 );
        x[1] = DiffTrace.NegPeakFindLargest( j, l[0], r[0], nPeaks, 2 );


        // Save peak positions and amplitudes. The peak amplitude is stored
        // as a signed value centred around the baseline.
        if( x[0] >= 0 )
        {
            PosPeak.Position[j]  = x[0];
            PosPeak.Amplitude[j] = static_cast<int>(double(DiffTrace[j][PosPeak.Position[j]]) - nGlobalMean);
        }
        if( x[1] >= 0 )
        {
            NegPeak.Position[j]  = x[1];
            NegPeak.Amplitude[j] = static_cast<int>(double(DiffTrace[j][NegPeak.Position[j]]) - nGlobalMean);
        }
    }



    // Any valid peaks found at all?
    if( !PosPeakCall.IsValid() || !NegPeakCall.IsValid() )
        return;


    // Get the biggest pos/neg peaks and their corresponding positions
    nP1  = PosPeakCall.MaxAmplitudeAsIndex();
    nP2  = NegPeakCall.MinAmplitudeAsIndex();
    x[0] = PosPeak.Position[nP1];
    x[1] = NegPeak.Position[nP2];
    a[0] = PosPeak.Amplitude[nP1];
    a[1] = NegPeak.Amplitude[nP2];



    // Filter out noise and any single peaks
    if( (nP1==nP2) || (x[0]<0) || (x[1]<0) )
        return;



    // Filter out +ve peaks below the mean and -ve peaks above the mean
    if( (a[0]<=0) || (a[1]>=0) )
        return;



    // Filter out any doublets with either peak below the noise threshold
    a[0] = std::abs( a[0] );
    a[1] = std::abs( a[1] );
    if( (a[0]<nNoiseThreshold) || (a[1]<nNoiseThreshold) )
        return;



    // Measure peak widths at 1/3 of peak amplitude
    const double t = 0.33;
    nMeasurementThreshold = static_cast<int>( nGlobalMean + (a[0] * t) );
    x[0] = DiffTrace.PosPeakWidth( nP1, PosPeak.Position[nP1], l[0], r[0], nMeasurementThreshold );
    nMeasurementThreshold = static_cast<int>( nGlobalMean - (a[1] * t) );
    x[1] = DiffTrace.NegPeakWidth( nP2, NegPeak.Position[nP2], l[1], r[1], nMeasurementThreshold );



    // Save doublet width
    assert(nBaseInterval>0);
    double w = static_cast<double>( x[0]>x[1] ? x[0] : x[1] );
    TmpTag.Width( w/nBaseInterval );



    // Estimate peak centre positions, this is necessary because peaks are
    // often distorted during dynamic programming or because of trace noise
    p[0] = l[0] + ((r[0] - l[0]) / 2);
    p[1] = l[1] + ((r[1] - l[1]) / 2);



    // Filter out any doublets with poor peak centre alignment, save alignment
    int alignment = std::abs( p[0]-p[1] );
    if( alignment > nPeakAlignmentThreshold )
        return;
    TmpTag.Alignment( static_cast<double>(alignment)/static_cast<double>(nBaseInterval) );



    // Filter out any doublets that are too wide
    if( (x[0]>nPeakTooWideThreshold) || (x[1]>nPeakTooWideThreshold) )
        return;



    // Add mutation tag to list, sample position is midway between the two peaks
    pTag = new MutTag( TmpTag );
    pTag->Type( nP1, nP2 );
    pTag->Amplitude( 0, a[0] );
    pTag->Amplitude( 1, a[1] );
    x[0] = PosPeak.Position[nP1];
    x[1] = NegPeak.Position[nP2];
    pTag->Position( 0, (x[0]>x[1]) ? x[1]+((x[0]-x[1])/2) : x[0]+((x[1]-x[0])/2) );
    Mutation.Append( pTag );
}



void TraceDiffComputeLocalEnvelopeStatistics( Trace& DiffTrace, int nPosition, int nNoiseWindow, NumericArray<int>& rEnvelope, double& nMean, double& nSD )
{
/*
   Attempts to compute local envelope statistics over a window. It isn't always
   very successful in the presence of mutations which can cause undue bias.
*/
    int c;
    int n;
    int k;
    int nMin;
    int nMax;
    int nEnd;
    int nBegin;



    // Work out window limits in samples
    DiffTrace.WindowToLeftOf( nPosition, nNoiseWindow, nBegin, nEnd );



    // Ensure we have enough array space
    c = nEnd - nBegin + 1;
    if( c > rEnvelope.Capacity() )
    {
        rEnvelope.Empty();
        rEnvelope.Create( c );
    }
    else
    {
        rEnvelope.Length( c );
    }



    // Extract the envelope
    for( k=0, n=nBegin; n<=nEnd; n++, k++ )
    {
        DiffTrace.MaxAt( n, c, nMax );
        DiffTrace.MinAt( n, c, nMin );
        rEnvelope[k] = nMax - nMin;
    }



    // Compute envelope statistics over window
    nMean = rEnvelope.Mean();
    nSD   = rEnvelope.StandardDeviation( &nMean );
}



void TraceDiffMarkMutationsAboveThreshold( Trace& DiffTrace, double nSensitivity,
   int nNoiseWindow, MutTag& Tag, NumericArray<int>& DiffEnvelope, int& nLastMutation,
   double& nLocalMean, double& nLocalSD )
{
    // Sometimes there is too much variance at the start of a trace difference
    // which can cause us to miss obvious mutations. This might be because the
    // quality clipping is not severe enough, or it could be that the presence
    // of a mutation biases the statistics unfairly. Sometimes the dynamic
    // programming algorithm has problems at the endpoints. So for the first
    // region of the trace, we increase the window over which we compute the
    // statistics in order to get a better estimate.


    // Case 1: First window in trace
    if( Tag.Position() < nNoiseWindow )
    {
        TraceDiffComputeLocalEnvelopeStatistics( DiffTrace, Tag.Position(),
            3*nNoiseWindow, DiffEnvelope, nLocalMean, nLocalSD );
    }



    // Case 2: Normal case, no nearby mutations
    if( Tag.Position()-nLastMutation > nNoiseWindow )
    {
        TraceDiffComputeLocalEnvelopeStatistics( DiffTrace, Tag.Position(),
            nNoiseWindow, DiffEnvelope, nLocalMean, nLocalSD );
    }



    // Case 3: Nearby mutation, use previous window's statistics
    // Values already passed in from last time.



    // Filter out doublets below our threshold
    int nAmplitude        = Tag.Amplitude(0) + Tag.Amplitude(1);
    int nDoubletThreshold = static_cast<int>( nLocalMean + nSensitivity*nLocalSD );
    if( nAmplitude < nDoubletThreshold )
        return;



    // Update mutation tag
    Tag.Confidence( 100 );
    Tag.Sensitivity( (static_cast<double>(nAmplitude)-nLocalMean)/nLocalSD );
    nLastMutation = Tag.Position();
}



void TraceDiffMarkMutationsNearby( Trace& DiffTrace, int nNoiseWindow, MutTag& Tag, MutTag* pTagLast )
{
    // If there was no previous mutation, exit
    if( !pTagLast )
        return;


    // If we are already certain of the current mutation, exit
    if( Tag.Confidence() > 0 )
        return;


    // If distance between previous mutation and current candidate is too great, exit
    int nDistance = Tag.Position() - pTagLast->Position();
    if( nDistance > nNoiseWindow )
        return;


    // Mark as a mutation with reduced confidence
    Tag.Confidence( 50 );
}



void TraceDiffScanForMutations( Trace& DiffTrace, mutlib_strand_t nStrand, int nBaseInterval,
    int nFirstBase, TraceDiffParameters& Parameter, List<MutTag>& Mutation )
{
/*
    The overall automated mutation analysis algorithm.
*/
    assert(nFirstBase>=0);
    assert(nBaseInterval>0);


    // Unpack parameters
    double nSensitivity     = Parameter[TRACEDIFF_PARAMETER_SENSITIVITY].Value();
    double nNoiseThreshold_ = Parameter[TRACEDIFF_PARAMETER_NOISE_THRESHOLD].Value();
    int    nNoiseWindow     = static_cast<int>( Parameter[TRACEDIFF_PARAMETER_NOISE_WINDOW_LENGTH].Value() );
    double nPeakAlignment   = Parameter[TRACEDIFF_PARAMETER_PEAK_ALIGNMENT].Value();
    double nPeakWidthMax    = Parameter[TRACEDIFF_PARAMETER_PEAK_WIDTH_MAXIMUM].Value();



    // Collect some useful statistics
    double nLocalSD    = 0.0;
    double nLocalMean  = 0.0;
    double nGlobalMax  = DiffTrace.Max();
    double nGlobalMean = DiffTrace.Baseline();



    // Create some absolute thresholds/values from the fractional parameters
    nNoiseWindow                = nNoiseWindow * nBaseInterval;
    int nNoiseThreshold         = static_cast<int>( nGlobalMax * nNoiseThreshold_ / 2.0 );
    int nPeakAlignmentThreshold = static_cast<int>( nPeakAlignment * nBaseInterval );
    int nPeakTooWideThreshold   = static_cast<int>( nPeakWidthMax * nBaseInterval );



    // Variable initialisation
    int               n;
    NumericArray<int> DiffEnvelope;
    MutTag*           pTag          = 0;
    MutTag*           pTagLast      = 0;
    int               nSamples      = DiffTrace.Samples();
    int               nLastMutation = -nNoiseWindow;



    // PHASE 1:
    // Scan difference trace for double peaks which fit a sensible mask using a
    // window of size 'nBaseInterval' with 50% overlap. We cannot rely on base
    // call positions to be centred on peaks.
    for( int s=0; s<nSamples; s += nBaseInterval/2 )
    {
        TraceDiffFindPotentialMutations( DiffTrace, nStrand, nBaseInterval, s,
           nNoiseThreshold, nPeakAlignmentThreshold, nPeakTooWideThreshold,
           nGlobalMean, Mutation );
    }



    // PHASE 2:
    // Map mutation doublet sample positions to nearest input-trace base number.
    pTag = Mutation.First();
    while( pTag )
    {
        n = DiffTrace.BaseNumberFromSamplePosition( pTag->Position() );
        pTag->Position( 1, n+nFirstBase+1 );
        pTag = Mutation.Next();
    }



    // PHASE 3:
    // Eliminate duplicate tags that can occur as a result of the window overlap
    // Note, we use the base numbers as a basis for comparison, not sample numbers
    // otherwise we will run into problems.
    pTag = Mutation.First();
    while( pTag )
    {
        if( pTagLast && (pTag->Position(1)==pTagLast->Position(1)) )
        {
            // Keep mutation with largest variance
            n = Mutation.Index();
            if( pTag->Sensitivity() >= pTagLast->Sensitivity() )
               n--;
            delete Mutation.Remove( n );
            pTag = Mutation.Current();
        }
        pTagLast = pTag;
        pTag     = Mutation.Next();
    }



    // PHASE 3A:
    // Print out some useful debugging information
    #ifdef VERBOSE_DEBUG
    pTag = Mutation.First();
    while( pTag )
    {
        std::printf( "%4d (%3d): MUTA Width=%.2f, Alignment=%.2f, Amplitude=%d\n",
                     pTag->Position(), pTag->Position(1), pTag->Width(),
                     pTag->Alignment(), static_cast<int>(pTag->Amplitude(0)+pTag->Amplitude(1)) );
        pTag = Mutation.Next();
    }
    #endif



    // PHASE 4:
    // Compute the background noise in the local region specified by the 'nNoiseWindow'
    // parameter and use this to mark mutations that are above the noise threshold.
    pTag = Mutation.First();
    while( pTag )
    {
        TraceDiffMarkMutationsAboveThreshold( DiffTrace, nSensitivity, nNoiseWindow,
           *pTag, DiffEnvelope, nLastMutation, nLocalMean, nLocalSD );
        pTag = Mutation.Next();
    }



    #ifdef NOT_USED_ANYMORE
    //
    // After a lot of testing, phase 5 was found to have very little
    // beneficial effect - mainly contributing to the false +ve count.
    // It only seemed to be useful in christines LEA9xx dataset where
    // mutations were clustered together. In normal clinical data this
    // is rare.
    //

    // PHASE 5:
    // Reexamine potential mutations just to the right of the ones found in the
    // last phase. These can sometimes be missed due to bias in the statistics
    // caused by mutations.
    pTagLast = 0;
    pTag     = Mutation.First();
    while( pTag )
    {
        if( pTag->Confidence() == 100 )
            pTagLast = pTag;
        TraceDiffMarkMutationsNearby( DiffTrace, nNoiseWindow, *pTag, pTagLast );
        pTag = Mutation.Next();
    }
    #endif



    // PHASE 6:
    // Remove mutation tags from the list with zero confidence.
    pTag = Mutation.First();
    while( pTag )
    {
        if( pTag->Confidence() <= 0 )
        {
            delete Mutation.Remove( Mutation.Index() );
            pTag = Mutation.Current();
        }
        else
        {
            pTag = Mutation.Next();
        }
    }
}