File: kmer.d

package info (click to toggle)
sambamba 1.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 3,528 kB
  • sloc: sh: 220; python: 166; ruby: 147; makefile: 103
file content (113 lines) | stat: -rw-r--r-- 2,785 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
module bio.core.kmer;

import bio.core.base;
import std.range;

/// Represents k-mer of ACGT bases of length no more than 32.
struct KMer(uint K) 
    if (K <= 32)
{
    private ulong _id;

    static Base5 code2base(int code) {
        return Base5("ACGT"[code]);
    }

    static int char2code(char base) {
        switch (base) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default: return -1;
        }
    }

    /// Unique ID
    ulong id() @property const {
        return _id;
    }

    /// Construct by ID
    this(S)(S id) 
        if (is(S == ulong))
    {
        _id = id;
    }
       
    /// Construct from sequence. Takes bases from the provided sequence
    /// until K symbols 'A/C/G/T' are found. That is, 'N' and other ambiguous
    /// bases are skipped.
    ///
    /// If sequence does not contain at least K bases 'A/C/G/T', the result of
    /// operation is undefined.
    this(S)(S sequence) 
        if (isInputRange!S) 
    {
        size_t i = 0;
        foreach (nuc; sequence) {
            _id <<= 2;
            ++i;
            switch (cast(char)nuc) {
                case 'A':
                    break;
                case 'C':
                    _id += 1;
                    break;
                case 'G':
                    _id += 2;
                    break;
                case 'T':
                    _id += 3;
                    break;
                default:
                    _id >>= 2;
                    --i;
                    break;
            }

            if (i == K)
                break;
        }
    }

    struct KMerSequence {
        this(ulong number) {
            _n = number;
        }

        private ulong _n;
        private size_t _len = K;

        bool empty() @property const { return _len == 0; }
        void popFront() { --_len; }
        void popBack() { --_len; _n >>= 2; }

        Base5 opIndex(size_t i) const {
            return code2base((_n >> (2 * (_len - i - 1))) & 3);
        }

        size_t length() @property const { return _len; }
        Base5 front() @property const { return opIndex(0); }
        Base5 back() @property const { return opIndex(_len - 1); }
        KMerSequence save() @property const { 
            KMerSequence _seq = void;
            _seq._n = _n;
            _seq._len = _len;
            return _seq;
        }
    }

    /// Sequence corresponding to the k-mer
    KMerSequence sequence() @property const {
        return KMerSequence(_id);
    }
}

unittest {
    import std.algorithm;
    auto kmer = KMer!10("AACGTACGTG");
    assert(equal(kmer.sequence, "AACGTACGTG"));

    assert(KMer!5(KMer!5(0b1011001001UL).sequence).id == 0b1011001001UL);
}