File: test_index_swift.cpp

package info (click to toggle)
seqan2 2.5.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 228,748 kB
  • sloc: cpp: 257,602; ansic: 91,967; python: 8,326; sh: 1,056; xml: 570; makefile: 229; awk: 51; javascript: 21
file content (81 lines) | stat: -rw-r--r-- 3,091 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#include <seqan/basic.h>
#include <seqan/index.h>
#include <seqan/sequence.h>

// Test SWIFT finder with empty pattern.
SEQAN_DEFINE_TEST(test_index_swift_find_empty_pattern)
{
    using namespace seqan2;

    // Contig side: a SWIFT (semi-global) finder that scans one Dna5 sequence.
    using TSwiftFinder = Finder<Dna5String, Swift<SwiftSemiGlobal> >;

    // Read side: a set of Dna5 reads behind an open-addressing q-gram index
    // built with an ungapped shape of length 10.
    using TReadSet      = StringSet<Dna5String>;
    using TRead         = Value<TReadSet>::Type;
    using TAlphabet     = Value<TRead>::Type;
    using TShape        = Shape<TAlphabet, UngappedShape<10> >;
    using TQGramIndex   = Index<TReadSet, IndexQGram<TShape, OpenAddressing> >;
    using TSwiftPattern = Pattern<TQGramIndex, Swift<SwiftSemiGlobal> >;

    Dna5String contig = "CGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT";

    // The read set is intentionally left empty: the pattern has no sequences.
    TReadSet emptyReads;

    TQGramIndex qgramIndex(emptyReads);
    TSwiftPattern pattern(qgramIndex);

    TSwiftFinder finder(contig);

    // No assertions here; the test only requires that searching with an empty
    // pattern runs to completion (error rate 0.1) without failing.
    while (find(finder, pattern, 0.1))
    {
    }
}

// Test the open-addressing q-gram index (as used by SWIFT) with an edge-case hash collision.
// DNA 32-mers are mapped into 64-bit space
//     enc(AA..AA) = 0
//     enc(TT..TT) = 2^64-1 = uint64_t::max()
// Because an empty bucket (TBucketMap::EMPTY) is also marked by uint64_t::max(), there is no way to know whether a
// bucket was filled with the TT..TT k-mer or is still empty.
// This test checks that the index constructor throws an exception if the q-gram size is too large.
// It only applies to shapes with variable size (SimpleShape) as the fixed size shapes (UngappedShape, GappedShape)
// are checked via a static assert.
SEQAN_DEFINE_TEST(test_index_swift_edge_case_hash_collision)
{
    using namespace seqan2;

    // Open-addressing q-gram index over DNA reads with a runtime-sized shape.
    using TReadSet    = StringSet<DnaString>;
    using TRead       = Value<TReadSet>::Type;
    using TAlphabet   = Value<TRead>::Type;
    using TShape      = Shape<TAlphabet, SimpleShape>;
    using TQGramIndex = Index<TReadSet, IndexQGram<TShape, OpenAddressing> >;

    TReadSet reads;

    // A single read dominated by long runs of T.
    TRead polyTRead = "AGGTATTTTTTTTTTTTTTTTTTTTTTTTTTTT"
                      "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
                      "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
                      "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
                      "CCGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
                      "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
                      "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT";

    appendValue(reads, polyTRead);

    try
    {
        // A q-gram size of 32 is outside the accepted range, so construction
        // is expected to throw before the next line is reached.
        TQGramIndex oversizedIndex(reads, TShape{32});
        SEQAN_FAIL("The expected exception was not thrown.");
    }
    catch (std::runtime_error const & error)
    {
        std::string const expectedMessage = "Incompatible q-gram size. Must be in range [1, 31].";
        SEQAN_ASSERT_EQ(expectedMessage, error.what());

        // The upper bound of the range named in the message must still be usable.
        TQGramIndex maxSizeIndex(reads, TShape{31});
    }
}

SEQAN_BEGIN_TESTSUITE(test_index_swift)
{
    // Run the SWIFT index tests in their original order.
    SEQAN_CALL_TEST(test_index_swift_find_empty_pattern);
    SEQAN_CALL_TEST(test_index_swift_edge_case_hash_collision);
}
SEQAN_END_TESTSUITE