File: kmer_functions.h

package info (click to toggle)
seqan2 2.5.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 228,748 kB
  • sloc: cpp: 257,602; ansic: 91,967; python: 8,326; sh: 1,056; xml: 570; makefile: 229; awk: 51; javascript: 21
file content (959 lines) | stat: -rw-r--r-- 37,409 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
// ==========================================================================
//                 SeqAn - The Library for Sequence Analysis
// ==========================================================================
// Copyright (c) 2006-2026, Knut Reinert, FU Berlin
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of Knut Reinert or the FU Berlin nor the names of
//       its contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
// DAMAGE.
//
// ==========================================================================
// Author: Jonathan Goeke <goeke@molgen.mpg.de>
// ==========================================================================
// This file contains helper functions to count words in sequences and to
// calculate probabilities and variances of word occurrences.
// ==========================================================================

// TODO (goeke) const could be added below for the input variables but the function value() in matrix_base (align) is not defined for const.  Similarly, the function emittedProbabilty is not defined for const in statistics_markov_model.h

#ifndef SEQAN_INCLUDE_SEQAN_ALIGNMENT_FREE_KMER_FUNCTIONS_H_
#define SEQAN_INCLUDE_SEQAN_ALIGNMENT_FREE_KMER_FUNCTIONS_H_

namespace seqan2 {

template <typename TAlphabet>
struct UnmaskedAlphabet_
{
    typedef TAlphabet Type;
};

template <>
struct UnmaskedAlphabet_<Dna5>
{
    typedef Dna Type;
};

template <typename TAlphabet>
struct UnmaskedAlphabet_<const TAlphabet>
{
    typedef const typename UnmaskedAlphabet_<TAlphabet>::Type Type;
};

/*!
 * @fn countKmers
 * @headerfile <seqan/alignment_free.h>
 * @brief Counts kmers in a sequence. Optionally, a background model is returned.
 *
 * @signature void countKmers(kmerCounts, sequence, k);
 * @signature void countKmers(kmerCounts, bgFrequencies, sequence, k);
 * @signature void countKmers(kmerCounts, bgModel, sequence, k);
 *
 * @param[out] kmerCounts    @link String @endlink of <tt>unsigned</tt> with kmer counts for every k-mer.
 * @param[out] bgFrequencies @link String @endlink of background frequencies (<tt>double</tt>) representing the model.
 * @param[out] bgModel       @link MarkovModel @endlink to use.
 * @param[in]  sequence      @link String @endlink (sequence) where k-mers are counted.
 * @param[in]  k             k-mer length (<tt>unsigned</tt>).
 *
 * k-mers overlapping masked (aka 'N') letters are not counted in case of Dna5Strings.  A Bernoulli or Markov Model
 * can be choosen as a background model.
 *
 * @section Examples
 *
 * Calculate the alignment free sequence similarity o two masked DNA sequences.
 *
 * @code{.cpp}
 * using namespace seqan2;
 * // Masked sequence, we do not want to count words overlapping 'N'
 * Dna5String sequenceDna5 =
 *     "TAGGTTTTCCGAAAAGGTAGCAACTTTACGTGATCAAACCTCTGACGGGGTTTTCCCCGTCGAAATTGGGTG"
 *     "TTTCTTGTCTTGTTCTCACTTGGGGCATCTCCGTCAAGCCAAGAAAGTGCTCCCTGGATTCTGTTGCTAACG"
 *     "AGTCTCCTCTGCATTCCTGCTTGACTGATTGGGCGGACGGGGTGTCCACCTGACGCTGAGTATCGCCGTCAC"
 *     "GGTGCCACATGTCTTATCTATTCAGGGATCAGAATTTATTCAGGAAATCAGGAGATGCTACACTTGGGTTAT"
 *     "CGAAGCTCCTTCCAAGGCGTAGCAAGGGCGACTGAGCGCGTAAGCTCTAGATCTCCTCGTGTTGCAACTACA"
 *     "CGCGCGGGTCACTCGAAACACATAGTATGAACTTAACGACTGCTCGTACTGAACAATGCTGAGGCAGAAGAT"
 *     "CGCAGACCAGGCATCCCACTGCTTGAAAAAACTATNNNNCTACCCGCCTTTTTATTATCTCATCAGATCAAG";
 *
 * String<unsigned> kmerCounts;
 * unsigned k = 2;  // Count all 2-mers
 * countKmers(kmerCounts, sequenceDna5, k);
 *
 * for(unsigned i = 0; i<16; ++i)       // Print the 2-mer counts
 *     std::cout<<kmerCounts[i]<<"\n";  // 34 times AA; 30 times AC; 28 times AG; ...
 *
 *
 * String<double> nucleotideFrequencies;  // Defines a Bernoulli model for DNA sequences.
 * // Count all 2-mers and save the nucleotide frequencies
 * countKmers(kmerCounts, nucleotideFrequencies, sequenceDna5, k);
 *
 * for(unsigned i = 0; i<4; ++i)          // Print the nucleotide frequencies
 *     std::cout << nucleotideFrequencies[i] << "\n";
 * // => p(A) = 0.238; p(C) = 0.254; p(G) = 0.238; p(T) = 0.27;
 *
 * MarkovModel<Dna, double>  backgroundModel(1);  // Markov model of order 1
 * // Count all 2-mers and return a Markov model
 * countKmers(kmerCounts, backgroundModel, sequenceDna5, k);
 * std::cout<<backgroundModel.transition;  // Print the transition matrix of the markov model
 * @endcode
 *
 * @see alignmentFreeComparison
 * @see calculateProbability
 * @see calculateVariance
 * @see calculateCovariance
 * @see stringToStringSet
 * @see MarkovModel
 */

/*
 * Function to count kmers, Ns are not considered
 */
template <typename TString>
void countKmers(String<unsigned> & kmerCounts, TString const & sequence, unsigned const k)
{
    typedef typename Value<TString>::Type               TAlphabet;
    typedef typename UnmaskedAlphabet_<TAlphabet>::Type TUnmaskedAlphabet;
    typedef typename Iterator<TString const>::Type      TIterator;
    typedef typename Position<TIterator>::Type          TPosition;
    typedef Shape<TUnmaskedAlphabet, SimpleShape>       TShape;
    // Declare variables
    TShape myShape;  // Shape, length can be changed (kmer_length)
    resize(myShape, k);
    // Calculate the number of kmers, length of count vector
    int kmerNumber = _intPow((unsigned)ValueSize<TUnmaskedAlphabet>::VALUE, weight(myShape));
    clear(kmerCounts);
    resize(kmerCounts, kmerNumber, 0);
    TIterator itSequence = begin(sequence);
    int counterN = 0;

    // Check for any N that destroys the first kmers
    unsigned j = k - 1;
    for (TPosition i = position(itSequence); i <= j; ++i)
    {
        if (_repeatMaskValue(sequence[i]))
        {
            counterN = i + 1;
        }
    }
    for (; itSequence <= (end(sequence) - k); ++itSequence)
    {
        // Check if there is a "N" at the end of the new kmer
        if (_repeatMaskValue(value(itSequence + (k - 1))))
            counterN = k;  // Do not consider any kmer covering this "N"

        // If there is no "N" overlapping with the current kmer, count it
        if (counterN <= 0)
        {
            unsigned hashValue = hash(myShape, itSequence);
            ++kmerCounts[hashValue];
        }
        counterN--;
    }
}

/*
 * Function to count kmers and background nucleotide frequencies, Ns are not considered
 * (for zero order background model)
 */
template <typename TValueBG, typename TString>
void countKmers(String<unsigned> & kmerCounts, String<TValueBG> & backgroundFrequencies, TString const & sequence, unsigned const k)
{
    typedef typename Value<TString>::Type                    TAlphabet;
    typedef typename UnmaskedAlphabet_<TAlphabet>::Type      TUnmaskedAlphabet;
    typedef typename Iterator<TString const>::Type           TIterator;
    typedef typename Iterator<String<TValueBG> >::Type       TIteratorTStringBG;
    typedef typename Position<TIterator>::Type               TPosition;
    typedef Shape<TUnmaskedAlphabet, SimpleShape>            TShape;
    unsigned alphabetSize = ValueSize<TUnmaskedAlphabet>::VALUE;

    // Declare variables
    TShape myShape;         // Shape, length can be changed (kmer_length)
    TShape myShapeBG;       // Shape for background, set to markovlen+1, here zero order only
    resize(myShape, k);
    resize(myShapeBG, 1);   // Markov model of zero order (count background frequencies)

    // Calculate number of kmers/ length of count vector, respectively background vector
    unsigned kmerNumber = _intPow(alphabetSize, k);
    unsigned kmerNumberBG = alphabetSize;  // Zero order model for DNA sequences (Bernoulli model)
    clear(kmerCounts);
    resize(kmerCounts, kmerNumber, 0);
    resize(backgroundFrequencies, kmerNumberBG, (TValueBG) 0);
    TIterator itSequence = begin(sequence);

    int counterN = 0;  // Counter that counts how many kmers are effected by a N
    int counterNbg = 0;   // Counter for background model (different shape size)

    // Check for any N that destroys the first kmers
    unsigned j = k - 1;
    for (TPosition i = position(itSequence); i <= j; ++i)
    {
        if (_repeatMaskValue(sequence[i]))
            counterN = i + 1;
    }

    int sumBG = 0;  // Count the number of nucleotides for the nucleotide frequency calculation (Ns are not considered anymore).
    for (; itSequence <= (end(sequence) - k); ++itSequence)
    {
        // Check if there is a "N" at the end of the new kmer
        if (_repeatMaskValue(value(itSequence + (k - 1))))
        {
            counterN = k;  // Do not consider any kmer covering this "N"
        }
        // If there is no "N" overlapping with the current kmer, count it.
        if (counterN <= 0)
        {
            unsigned hashValue = hash(myShape, itSequence);
            ++kmerCounts[hashValue];
        }
        // Check if there is a "N" at the end of the new background word, here single letters only.
        if (_repeatMaskValue(value(itSequence)))
        {
            counterNbg = 1;
        }
        if (counterNbg <= 0)
        {
            unsigned hashValueBG = hash(myShapeBG, itSequence);
            backgroundFrequencies[hashValueBG] += 1.0;
            ++sumBG;
        }
        counterN--;
        counterNbg--;
    }
    // The background counts are updated until the last base is covered.
    for (; itSequence < end(sequence); ++itSequence)
    {
        if (_repeatMaskValue(value(itSequence)))
        {
            counterNbg = 1;
        }
        if (counterNbg <= 0)
        {
            unsigned hashValueBG = hash(myShapeBG, itSequence);
            ++backgroundFrequencies[hashValueBG];
            ++sumBG;
        }
        counterNbg--;
    }
    // Normalise the background counts to obtain the nucleotide frequencies (Bernoulli model of DNA sequences).
    TIteratorTStringBG itBackground = begin(backgroundFrequencies);
    for (; itBackground < end(backgroundFrequencies); ++itBackground)
        if (sumBG != 0)
            value(itBackground) /= ((TValueBG) sumBG);
}

/*
 * Function to count kmers and build a background markov model with masked sequences
 */
template <typename TString, typename TAlphabetBG, typename TValue>
void countKmers(String<unsigned> & kmerCounts, MarkovModel<TAlphabetBG, TValue> & backgroundModel, TString const & sequence, unsigned k)
{
    //typedef typename Value<TString>::Type                   TAlphabet;
    //typedef typename UnmaskedAlphabet_<TAlphabet>::Type     TUnmaskedAlphabet;
    typedef typename Iterator<TString const, Rooted>::Type  TIterator;
    //typedef typename Iterator<String<int>, Rooted>::Type    TIteratorInt;
    typedef typename Position<TIterator>::Type              TPosition;
    typedef Shape<TAlphabetBG, SimpleShape>                 TShape;

    // Declare variables
    TShape myShape;  // Shape, length can be changed (kmer_length)
    resize(myShape, k);
    // Only consider kmers without N
    int kmerNumber = _intPow((unsigned)ValueSize<TAlphabetBG>::VALUE, weight(myShape));
    clear(kmerCounts);
    resize(kmerCounts, kmerNumber, 0);

    // Create sequence set for the markov model, if Ns occur, the sequence is split and Ns are removed
    StringSet<String<TAlphabetBG> > seqSetMM;

    TIterator itSeq = begin(sequence);

    // Check for any N that destroys the first kmers
    unsigned j = (k - 1);
    for (TPosition i = position(itSeq); i <= j; ++i)
    {
        if (_repeatMaskValue(sequence[i]))
        {
            if ((i - position(itSeq)) > 0)
            {
                appendValue(seqSetMM, infix(sequence, position(itSeq), i));
            }
            goFurther(itSeq, i + 1 - position(itSeq));
            j = i + k - 1;
        }
    }

    int counterN = 0;
    TPosition startSplitSequence = position(itSeq);  // The position of possible start of a sequence after NNs is stored to split sequences.

    for (; itSeq <= (end(sequence) - k); ++itSeq)
    {
        if (_repeatMaskValue(value(itSeq + (k - 1))))
        {
            counterN = k;

            if (((position(itSeq) + k - 1) > startSplitSequence))
                appendValue(seqSetMM, infix(sequence, startSplitSequence, (position(itSeq) + k - 1)));

            startSplitSequence = (position(itSeq) + k);  // Position after N, possible start
        }
        if (counterN <= 0)
        {
            unsigned hashValue = hash(myShape, itSeq);
            ++kmerCounts[hashValue];
        }

        counterN--;
    }
    // Create a stringSet, needed to create the Markov model
    if ((position(itSeq) + k - 1) > startSplitSequence)
    {
        appendValue(seqSetMM, infix(sequence, startSplitSequence, (position(itSeq) + k - 1)));
    }
    // Build background Markov model
    buildMarkovModel(backgroundModel, seqSetMM);
}

/*!
 * @fn calculateProbability
 * @headerfile <seqan/alignment_free.h>
 * @brief Calculates the probability of a sequence given a Bernoulli model.
 *
 * @signature void calculateProbability(probability, sequence, bgFrequencies);
 *
 * @param[out] probability  Probability (<tt>double</tt>) of the sequence given the model.
 * @param[in] sequence      @link String @endlink, usually of Dna characters.
 * @param[in] bgFrequencies @link String @endlink of background frequencies (<tt>double</tt>) representing the model.
 *
 * @section Examples
 *
 * Calculate the probability for the word CCCAAGTTT with <i>p(A) = p(T) = 0.3</i> and <i>p(C) = p(G) = 0.2</i>.
 *
 * @code{.cpp}
 * using namespace seqan2;
 * double p = 0.0;
 * DnaString word = "CCCAAGTTT";
 * String<double> model;
 * resize(model, 4);
 * model[0] = 0.3;  // p(A)
 * model[1] = 0.2;  // p(C)
 * model[2] = 0.2;  // p(G)
 * model[3] = 0.3;  // p(T)
 * calculateProbability(p, word, model);  // p = 3.888e-06
 * @endcode
 *
 * @see calculateVariance
 * @see alignmentFreeComparison
 * @see calculateCovariance
 * @see countKmers
 */

template <typename TValue, typename TString, typename TStringBG>
void calculateProbability(TValue & probability, TString const & sequence, TStringBG const & backgroundFrequencies)
{
    typedef typename Iterator<TString const, Rooted>::Type  TIteratorTString;

    TIteratorTString itSequence = begin(sequence);
    probability = (TValue) 1;
    for (; itSequence < end(sequence); ++itSequence)
        probability *= backgroundFrequencies[ordValue(*itSequence)];
}

/*!
 * @fn calculateVariance
 * @headerfile <seqan/alignment_free.h>
 * @brief Calculates the variance for the number of word occurrences of a word in a sequence of length n given a
 *        background model.
 *
 * @signature void calculateVariance(variance, word, bgFrequencies, n);
 * @signature void calculateVariance(variance, word, bgModel, n);
 *
 * @param[out] variance     Variance of the number of occurrences of the word in a sequence of length n given the
 *                          model; <tt>double</tt>.
 * @param[in] word          @link String @endlink, usually of Dna to compute variance for.
 * @param[in] bgFrequencies @link String @endlink of bg frequencies representing the model.
 * @param[in] bgModel       @link MarkovModel @endlink to use.
 * @param[in] n             Length of the sequence where the occurrences of word are counted, <tt>int</tt>.
 *
 * Calculates the variance for the number of word occurrences of a word in a sequence of length n given a background
 * model (Markov model or Bernoulli model). The formula is obtained from (Robin et al., 2005).
 *
 * @section References
 *
 * Robin, S., Rodolphe, F., and Schbath, S.  (2005). DNA, Words and Models. Cambridge University Press. See Jonathan
 * Goeke et al (to appear) for details on the implementation.
 *
 * @section Examples
 *
 * Calculate the variance for the number of occurrences of CAAGTC in a sequence of length 10000bp with
 * <i>p(A) = p(T) = 0.3</i> and <i>p(C) = p(G) = 0.2</i>.
 *
 * @code{.cpp}
 * using namespace seqan2;
 * double var = 0.0;
 * int n = 10000;
 * DnaString word = "CAAGTC";
 * String<double> model;
 * resize(model, 4);
 * model[0] = 0.3;  // p(A)
 * model[1] = 0.2;  // p(C)
 * model[2] = 0.2;  // p(G)
 * model[3] = 0.3;  // p(T)
 * calculateVariance(var, word, model, n);  // var = 2.16
 * @endcode
 *
 * Estimate a Markov model on a set of sequences and calculate the variance for the number of occurrences of the word
 * CAAGTC in a sequence of length 10000bp.
 *
 * @code{.cpp}
 * using namespace seqan2;
 * double var = 0.0;
 * int n = 10000;
 * DnaString word = "CAAGTC";
 * StringSet<DnaString> sequences;
 * appendValue(sequences, "CAGAAAAAAACACTGATTAACAGGAATAAGCAGTTTACTTATTTTGGGCCTGGGACCCGTGTCTCTAATTTAATTAGGTGATCCCTGCGAAGTTTCTCCA");
 * MarkovModel<Dna, double> model(0);  // Bernoulli model
 * model.build(sequences);
 * calculateVariance(var, word, model, n);  // var = 2.16
 * MarkovModel<Dna, double> model1(1);  // First order Markov model
 * model1.build(sequences);
 * calculateVariance(var, word, model1, n);  // var = 1.69716
 * @endcode
 *
 * @see calculateProbability
 * @see calculateCovariance
 * @see MarkovModel
 * @see alignmentFreeComparison
 * @see calculatePeriodicity
 * @see countKmers
 * @see calculateOverlapIndicator
 */

template <typename TValue, typename TString, typename TStringBG>
void calculateVariance(TValue & variance, TString const & word, TStringBG const & backgroundFrequencies, int const n)
{
    typedef typename Value<TString>::Type                       TAlphabet;
    typedef typename Value<TStringBG>::Type                     TValueBG;
    typedef typename Iterator<String<int>, Rooted>::Type        TIteratorInt;

    int l = length(word);
    TValueBG p_w;
    calculateProbability(p_w, word, backgroundFrequencies);

    String<int> periodicity;
    calculatePeriodicity(periodicity, word, word);
    variance = (TValue) (n - l + 1) * p_w;
    for (TIteratorInt i = begin(periodicity); i < end(periodicity); ++i)
    {
        TValueBG p_clump;
        TValueBG p_tmp;
        calculateProbability(p_tmp, word, backgroundFrequencies);
        String<TAlphabet> wordPrefix = prefix(word, value(i));
        calculateProbability(p_clump, wordPrefix, backgroundFrequencies);
        p_clump *= p_tmp;
        variance += (TValue) 2 * (n - l + 1 - value(i)) * p_clump;
    }
    variance += (TValue) p_w * p_w * (n - 2 * n * l + 3 * l * l - 4 * l + 1);
}


template <typename TValue, typename TSpec, typename TAlphabet>
void calculateVariance(TValue & variance, String<TAlphabet, TSpec> const & word, MarkovModel<TAlphabet, TValue> /*const*/ & bgModel, int const n)
{
    typedef typename Iterator<String<int>, Rooted>::Type        TIteratorInt;

    int l = length(word);
    TValue p_w;
    p_w = emittedProbability(bgModel, word);
    String<int> periodicity;
    calculatePeriodicity(periodicity, word, word);
    variance = (TValue) (n - l + 1) * p_w;
    for (TIteratorInt i = begin(periodicity); i < end(periodicity); ++i)
    {
        TValue p_clump;
        String<TAlphabet> clump = prefix(word, value(i));
        append(clump, word);
        p_clump = emittedProbability(bgModel, clump);
        variance += (TValue) 2 * (n - l + 1 - value(i)) * p_clump;
    }
    variance += (TValue) p_w * p_w * (n - 2 * n * l + 3 * l * l - 4 * l + 1);
}

/*!
 * @fn calculateCovariance
 * @headerfile <seqan/alignment_free.h>
 * @brief Calculates the covariance for the number of word occurrences for two words in a sequence of length n, given a
 *        background model.
 *
 * @signature void calculateCovariance(covariance, word1, word2, bgFrequencies, n);
 * @signature void calculateCovariance(covariance, word1, word2, bgModel, n);
 *
 * @param[out] covariance    Variance of the number of occurrences of the word in a sequence of length n given the
 *                           model, <tt>double</tt>.
 * @param[in] word1          @link String @endlink, usually of Dna.
 * @param[in] word2          @link String @endlink, usually of Dna.
 * @param[in] bgFrequencies  @link String @endlink of <tt>double</tt> with the background frequencies representing
 * @param[in] bgModel        @link MarkovModel @endlink to use.
 * @param[in] n              Length of the sequence where the occurrences of word are counted, <tt>int</tt>.
 *
 * Calculates the covariance for the number of word occurrences for two words in a sequence of length n given a
 * background model (Markov model or Bernoulli model). The covariance is influenced by the property of words to overlap,
 * for example, the words ATAT and TATA have a high covariance since they are likely to overlap. The formula is based on
 * (Robin et al., 2005).
 *
 * @section References
 *
 * Robin, S., Rodolphe, F., and Schbath, S.  (2005). DNA, Words and Models. Cambridge University Press. See Jonathan
 * Goeke et al (to appear) for details on the implementation.
 *
 * @section Examples
 *
 * Calculate the covariance for the number of occurrences of ATATAT and TATATA in a sequence of length 10000bp with
 * <i>p(A) = p(T) = 0.3</i> and <i>p(C) = p(G) = 0.2</i>.
 *
 * @code{.cpp}
 * using namespace seqan2;
 * double covar = 0.0;
 * int n = 10000;
 * DnaString word1 = "ATATAT";
 * DnaString word2 = "TATATA";
 * String<double> model;
 * resize(model, 4);
 * model[0] = 0.3;  // p(A)
 * model[1] = 0.2;  // p(C)
 * model[2] = 0.2;  // p(G)
 * model[3] = 0.3;  // p(T)
 * calculateCovariance(covar, word1, word2, model, n);  // covar = 4.74
 * @endcode
 *
 * Estimate a Markov model on a set of sequences and calculate the covariance for the number of occurrences of ATATAT
 * and TATATA in a sequence of length 10000bp.
 *
 * @code{.cpp}
 * using namespace seqan2;
 * double covar = 0.0;
 * int n = 10000;
 * DnaString word1 = "ATATAT";
 * DnaString word2 = "TATATA";
 * StringSet<DnaString> sequences;
 * appendValue(sequences, "CAGCACTGATTAACAGGAATAAGCAGTTTACTTCTGTCAGAATATTGGGCATATATA"
 *                        "CTGGGACCCGTGTAATACTCTAATTTAATTAGGTGATCCCTGCGAAGTCTCCA");
 * MarkovModel<Dna, double> modelMM0(0);  // Bernoulli model
 * modelMM0.build(sequences);
 * calculateCovariance(covar, word1, word2, modelMM0, n);  // covar = 4.74
 * MarkovModel<Dna, double> modelMM1(1);  // First order Markov model
 * modelMM1.build(sequences);
 * calculateCovariance(covar, word1, word2, modelMM1, n);  // covar = 13.1541
 * @endcode
 *
 * @see calculateProbability
 * @see calculateVariance
 * @see MarkovModel
 * @see alignmentFreeComparison
 * @see calculatePeriodicity
 * @see countKmers
 * @see calculateOverlapIndicator
 */

template <typename TValue, typename TString, typename TStringBG>
void calculateCovariance(TValue & covariance, TString const & word1, TString const & word2, TStringBG const & backgroundFrequencies, int const n)
{
    if (word1 == word2)
    {
        calculateVariance(covariance, word1, backgroundFrequencies, n);
        return;
    }
    typedef typename Value<TString>::Type                   TAlphabet;
    typedef typename Value<TStringBG>::Type                 TValueBG;
    typedef typename Iterator<String<int>, Rooted>::Type    TIteratorInt;

    covariance = 0;
    int l1 = length(word1);
    TValueBG p_w1;
    calculateProbability(p_w1, word1, backgroundFrequencies);
    String<int> periodicity1;
    calculatePeriodicity(periodicity1, word1, word2);
    for (TIteratorInt i = begin(periodicity1); i < end(periodicity1); ++i)
    {
        TValueBG p_clump;
        TValueBG p_tmp;
        calculateProbability(p_tmp, word2, backgroundFrequencies);
        String<TAlphabet> wordPrefix = prefix(word1, value(i));
        calculateProbability(p_clump, wordPrefix, backgroundFrequencies);
        p_clump *= p_tmp;
        covariance += (TValue) (n - l1 + 1 - value(i)) * p_clump;
    }

    int l2 = length(word2);
    TValueBG p_w2;
    calculateProbability(p_w2, word2, backgroundFrequencies);
    String<int> periodicity2;
    calculatePeriodicity(periodicity2, word2, word1);
    for (TIteratorInt i = begin(periodicity2); i < end(periodicity2); ++i)
    {
        TValueBG p_clump;
        TValueBG p_tmp;
        calculateProbability(p_tmp, word1, backgroundFrequencies);
        String<TAlphabet> wordPrefix = prefix(word2, value(i));
        calculateProbability(p_clump, wordPrefix, backgroundFrequencies);
        p_clump *= p_tmp;
        covariance += (TValue) (n - l2 + 1 - value(i)) * p_clump;
    }
    covariance += (TValue) p_w1 * p_w2 * (n - 2 * n * l1 + 3 * l1 * l1 - 4 * l1 + 1);
}

template <typename TValue, typename TSpec, typename TAlphabet>
void calculateCovariance(TValue & covariance, String<TAlphabet, TSpec> const & word1, String<TAlphabet, TSpec> const & word2, MarkovModel<TAlphabet, TValue> /*const*/ & bgModel, int const n)
{
    if (word1 == word2)
    {
        calculateVariance(covariance, word1, bgModel, n);
        return;
    }
    typedef typename Iterator<String<int>, Rooted>::Type TIteratorInt;

    covariance = 0;
    int l1 = length(word1);
    TValue p_w1;
    p_w1 = emittedProbability(bgModel, word1);
    String<int> periodicity1;
    calculatePeriodicity(periodicity1, word1, word2);  // word2 is right
    for (TIteratorInt i = begin(periodicity1); i < end(periodicity1); ++i)
    {
        TValue p_clump;
        String<TAlphabet> clump = prefix(word1, value(i));
        append(clump, word2);

        p_clump = emittedProbability(bgModel, clump);

        covariance += (TValue) (n - l1 + 1 - value(i)) * p_clump;
    }
    TValue p_w2;
    p_w2 = emittedProbability(bgModel, word2);
    String<int> periodicity2;
    calculatePeriodicity(periodicity2, word2, word1);
    for (TIteratorInt i = begin(periodicity2); i < end(periodicity2); ++i)
    {
        TValue p_clump;
        String<TAlphabet> clump = prefix(word2, value(i));
        append(clump, word1);

        p_clump = emittedProbability(bgModel, clump);

        covariance += (TValue) (n - l1 + 1 - value(i)) * p_clump;
    }
    covariance += (TValue) p_w1 * p_w2 * (n - 2 * n * l1 + 3 * l1 * l1 - 4 * l1 + 1);
}

/*!
 * @fn calculatePeriodicity
 * @headerfile <seqan/alignment_free.h>
 * @brief Calculate word periodicity (indicator for overlaps)
 *
 * @signature void calculatePeriodicity(periodicity, word1, word2);
 *
 * @param[out] periodicity String of <tt>int</tt> values giving the periodicity (overlap indicator) of
 *                         <tt>word1</tt> and <tt>word2</tt>.
 * @param[int] word1       String, usually of Dna characters.
 * @param[int] word2       String, usually of Dna characters.
 *
 * Calculate word periodicity (indicator for overlaps) for two words.
 *
 * @section Examples
 *
 * Calculate the periodicity of two words (At which positions can they overlap?)
 *
 * @code{.cpp}
 * using namespace seqan2;
 * DnaString word1 = "ATATA";
 * DnaString word2 = "TATAT";
 * String<int> periodicity;
 * calculatePeriodicity(periodicity, word1, word2);
 * for(unsigned i = 0; i < length(periodicity); ++i)  // Print the periodicity
 *     std::cout << periodicity[i] << "\t";
 *
 * // periodocity[0] = 1:
 * // i =     01234
 * // word1 = ATATA
 * // word2 = -TATAT
 *
 * // periodocity[1] = 3:
 * // i =     01234
 * // word1 = ATATA
 * // word2 = ---TATAT
 * @endcode
 *
 * @see calculateVariance
 * @see calculateCovariance
 * @see calculateOverlapIndicator
 * @see alignmentFreeComparison
 */

template <typename TString>
void calculatePeriodicity(String<int> & periodicity, TString const & word1, TString const & word2)
{
    typedef typename Value<TString>::Type                   TAlphabet;
    //typedef typename Iterator<TString const, Rooted>::Type  TIterator;
    typedef typename Size<TString>::Type                    TSize;

    TSize length1 = length(word1);
    TSize length2 = length(word2);
    for (TSize i = 1; i < length1; ++i)
    {
        String<TAlphabet> my_suffix = suffix(word1, i);  // Overlap of suffix of word1 with prefix of word2
        TSize my_min = std::min(length2, (length1 - i));
        String<TAlphabet> my_prefix = prefix(word2, my_min);
        if (my_suffix == my_prefix)
        {
            appendValue(periodicity, i);
        }
    }
}

/*!
 * @fn calculateOverlapIndicator
 * @headerfile <seqan/alignment_free.h>
 * @brief Calculate word overlaps: <tt>epsilon(word1, word2) = 1</tt> where <tt>word2[j] = word1[j+p] for
 *        all j = 1..(k-p)</tt>.
 *
 * @signature void calculateOverlapIndicator(epsilon, word1, word2);
 *
 * @param[out] epsilon String of int giving the periodicity (overlap indicator) of word1 and word2.
 * @param[in]  word1   String (for example a DNA sequence).
 * @param[in]  word2   String (for example a DNA sequence).
 *
 * Calculate the indicator for overlaps of two words. The formula is based on (Robin et al., 2005)
 *
 * @section References
 *
 * Robin, S., Rodolphe, F., and Schbath, S. (2005). DNA, Words and Models.  Cambridge University Press. See Jonathan
 * Goeke et al (to appear) for details on the implementation.
 *
 * @section Examples
 *
 * Calculate the overlap indicator (epsilon) for two words
 *
 * @code{.cpp}
 * using namespace seqan2;
 * DnaString word1 = "ATATA";
 * DnaString word2 = "TATAT";
 * String<int> epsilon;
 * calculateOverlapIndicator(epsilon, word1, word2);
 * for(unsigned i = 0; i < length(epsilon); ++i)
 *     std::cout << epsilon[i] << "\t";
 * // epsilon =         01010:
 * // word1             ATATA
 * // word2 overlap 1:  -TATAT
 * // word2 overlap 2:  ---TATAT
 * @endcode
 *
 * @see calculateVariance
 * @see calculateCovariance
 * @see calculatePeriodicity
 * @see alignmentFreeComparison
 */

template <typename TString>
void calculateOverlapIndicator(String<int> & epsilon, TString const & word1, TString const & word2)
{
    typedef typename Value<TString>::Type                   TAlphabet;
    //typedef typename Iterator<TString const, Rooted>::Type  TIterator;
    typedef typename Size<TString>::Type                    TSize;

    TSize length1 = length(word1);
    TSize length2 = length(word2);
    clear(epsilon);
    resize(epsilon, length1, 0);
    for (TSize i = 0; i < length1; ++i)
    {
        String<TAlphabet> my_suffix = suffix(word1, length1 - i - 1);  // Overlap of suffix of word1 with prefix of word2
        TSize my_min = std::min(length2, i + 1);
        String<TAlphabet> my_prefix = prefix(word2, my_min);
        if (my_suffix == my_prefix)
            epsilon[i] = 1;
    }
}

/*!
 * @fn stringToStringSet
 * @headerfile <seqan/alignment_free.h>
 * @brief Transform a String into a StringSet containing this String.
 *
 * @signature void stringToStringSet(stringSet, string);
 * @signature void stringToStringSet(dnaStringSet, dna5String);
 *
 * @param[out] stringSet    @link StringSet @endlink to create with one sequence.
 * @param[in]  string       @link String @endlink to create the string set of.
 * @param[out] dnaStringSet @link StringSet @endlink of @link String Strings @endlink over the alphabet @link Dna @endlink.
 * @param[in]  dna5String   @link String @endlink over the alphabet @link Dna5 @endlink to convert.
 *
 * @note The second variant removes all N characters from the @link Dna5String @endlink.
 *
 * @section Examples
 *
 * Transform a masked DNA sequence into a set of sequences with all masked parts removed.
 *
 * @code{.cpp}
 * using namespace seqan2;
 * Dna5String sequenceDna5 =
 *     "NNNNNNTTTCCGAAAAGGTANNNNNGCAACTTTANNNCGTGATCAAAGTTTTCCCCGTCGAAATTGGGNNTG";
 * StringSet<DnaString> sequencesDna;
 * stringToStringSet(sequencesDna, sequenceDna5);
 * // Print the masked sequence
 * std::cout<<sequenceDna5<<"\n";
 * // Print the sequence with the masked parts removed
 * for(unsigned i = 0; i < length(sequencesDna); ++i)
 *     std::cout<<sequencesDna[i]<<"\n";
 * // sequencesDna[0] = "TTTCCGAAAAGTA"
 * // sequencesDna[1] = "GCAACTTTA"
 * // sequencesDna[2] = "CGTGATCAAAGTTTTCCCCGTCGAAATTGGG"
 * // sequencesDna[3] = "TG"
 * @endcode
 *
 * @see MarkovModel
 * @see alignmentFreeComparison
 * @see cutNs
 * @see countKmers
 */

template <typename TString>
void
stringToStringSet(StringSet<TString> & stringSet, TString const & sequence)
{
    resize(stringSet, 1);
    stringSet[0] = sequence;
}

inline void
stringToStringSet(StringSet<String<Dna> > & dnaStringSet, String<Dna5> const & sequence)
{
    typedef Iterator<String<Dna5> const, Rooted>::Type  TIterator;
    typedef Position<TIterator>::Type                   TPosition;

    TIterator itSeq = begin(sequence);
    // Check for any N that destroys the first kmers
    unsigned j = 0;
    for (TPosition i = position(itSeq); i <= j; ++i)
    {
        if (sequence[i] == 'N')
        {
            if ((i - position(itSeq)) > 0)
                appendValue(dnaStringSet, infix(sequence, position(itSeq), i));
            goFurther(itSeq, i + 1 - position(itSeq));
            j = i;
        }
    }
    TPosition startSplitSequence = position(itSeq);  // The position of possible starts of a sequence after Ns is stored to split the sequence.
    for (; itSeq <= (end(sequence) - 1); ++itSeq)
    {
        if (value(itSeq) == 'N')
        {
            if (((position(itSeq)) > startSplitSequence))
            {
                appendValue(dnaStringSet, infix(sequence, startSplitSequence, position(itSeq)));
            }
            startSplitSequence = (position(itSeq) + 1);  // Position after N, possible start
        }
    }
    // Create the stringSet, the stringSet can be used to create a Markov model
    if (position(itSeq) > startSplitSequence)
        appendValue(dnaStringSet, infix(sequence, startSplitSequence, position(itSeq)));
}

/*!
 * @fn cutNs
 * @headerfile <seqan/alignment_free.h>
 * @brief Cut out all masked sequences from a Dna5String.
 *
 * @signature void cutNs(sequenceCut, sequence);
 *
 * @param[out] sequenceCut Dna5String similar to sequence with all Ns cut out.
 * @param[in]  sequence    Masked DNA sequence.
 *
 * This function concatenates the nonmasked parts of the sequence, thereby changing the word content. If you want to
 * remove the masked parts of a sequence without concatenation, use stringToStringSet.
 *
 * @section Examples
 *
 * Transform a masked DNA sequence into an unmasked sequences with all masked parts cut out
 *
 * @code{.cpp}
 * using namespace seqan2;
 * Dna5String sequenceMasked =
 *     "NNNNNNTTTCCGAAAAGGTANNNNNGCAACTTTANNNCGTGATCAAAGTTTTCCCCGTCGAAATTGGGNNTG";
 * Dna5String sequenceMaskedPartsRemoved;
 * cutNs(sequenceMaskedPartsRemoved, sequenceMasked);
 * // Print the masked sequence
 * std::cout<<sequenceMasked<<"\n";
 * // Print the sequence with the masked parts removed
 * std::cout<<sequenceMaskedPartsRemoved<<"\n";
 * // sequenceMasked =
 * // "NNNNNNTTTCCGAAAAGGTANNNNNGCAACTTTANNNCGTGATCAAAGTTTTCCCCGTCGAAATTGGGNNTG"
 * // sequenceMaskedPartsRemoved =
 * // "TTTCCGAAAAGGTAGCAACTTTACGTGATCAAAGTTTTCCCCGTCGAAATTGGGTG"
 * @endcode
 *
 * @see MarkovModel
 * @see stringToStringSet
 * @see alignmentFreeComparison
 */

inline void
cutNs(String<Dna5> & sequenceCut, String<Dna5> const & sequence)
{
    typedef Iterator<String<Dna5> const, Rooted>::Type  TIterator;
    typedef Position<TIterator>::Type                   TPosition;

    sequenceCut = "";
    TIterator itSeq = begin(sequence);

    // Check for any N that destroys the first kmers
    unsigned j = 0;
    for (TPosition i = position(itSeq); i <= j; ++i)
    {
        if (sequence[i] == 'N')
        {
            if ((i - position(itSeq)) > 0)
                sequenceCut += infix(sequence, position(itSeq), i);
            goFurther(itSeq, i + 1 - position(itSeq));
            j = i;
        }
    }
    TPosition startSplitSequence = position(itSeq);  // The position of possible starts of a sequence after Ns is stored to split the sequence.
    for (; itSeq <= (end(sequence) - 1); ++itSeq)
    {
        if (value(itSeq) == 'N')
        {
            if (((position(itSeq)) > startSplitSequence))
                sequenceCut += infix(sequence, startSplitSequence, position(itSeq));
            startSplitSequence = (position(itSeq) + 1);  // Position after N, possible start
        }
    }
    // Create the sequence with any N cut out.
    if (position(itSeq) > startSplitSequence)
    {
        sequenceCut += infix(sequence, startSplitSequence, position(itSeq));
    }
}

}  // namespace seqan2

#endif  // SEQAN_INCLUDE_SEQAN_ALIGNMENT_FREE_KMER_FUNCTIONS_H_