File: test_trim.py

package info (click to toggle)
python-cutadapt 4.7-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,992 kB
  • sloc: python: 9,695; ansic: 177; makefile: 159
file content (65 lines) | stat: -rw-r--r-- 2,593 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from typing import Sequence

from dnaio import SequenceRecord
from cutadapt.adapters import (
    BackAdapter,
    AnywhereAdapter,
    BackAdapterStatistics,
    Adapter,
)
from cutadapt.modifiers import AdapterCutter, ModificationInfo


def test_statistics() -> None:
    read = SequenceRecord("name", "AAAACCCCAAAA")
    adapters: Sequence[Adapter] = [BackAdapter("CCCC", max_errors=0.1)]
    cutter = AdapterCutter(adapters, times=3)
    cutter(read, ModificationInfo(read))
    assert isinstance(cutter.adapter_statistics[adapters[0]], BackAdapterStatistics)
    lengths = cutter.adapter_statistics[adapters[0]].end.lengths
    trimmed_bp = sum(seqlen * count for (seqlen, count) in lengths.items())
    assert trimmed_bp <= len(read), trimmed_bp


def test_end_trim_with_mismatch():
    """
    Test the not-so-obvious case where an adapter of length 13 is trimmed from
    the end of a sequence with overlap 9 and there is one deletion.
    In this case the algorithm starts with 10 bases of the adapter to get
    the hit and so the match is considered good. An insertion or substitution
    at the same spot is not a match.
    """
    adapter = BackAdapter("TCGATCGATCGAT", max_errors=0.1)

    read = SequenceRecord("foo1", "AAAAAAAAAAATCGTCGATC")
    cutter = AdapterCutter([adapter], times=1)
    trimmed_read = cutter(read, ModificationInfo(read))

    assert trimmed_read.sequence == "AAAAAAAAAAA"
    assert cutter.adapter_statistics[adapter].end.lengths == {9: 1}
    # We see 1 error at length 9 even though the number of allowed mismatches at
    # length 9 is 0.
    assert cutter.adapter_statistics[adapter].end.errors[9][1] == 1

    read = SequenceRecord("foo2", "AAAAAAAAAAATCGAACGA")
    cutter = AdapterCutter([adapter], times=1)
    trimmed_read = cutter(read, ModificationInfo(read))

    assert trimmed_read.sequence == read.sequence
    assert cutter.adapter_statistics[adapter].end.lengths == {}


def test_anywhere_with_errors():
    adapter = AnywhereAdapter("CCGCATTTAG", max_errors=0.1)
    for seq, expected_trimmed in (
        ("AACCGGTTccgcatttagGATC", "AACCGGTT"),
        ("AACCGGTTccgcgtttagGATC", "AACCGGTT"),  # one mismatch
        ("AACCGGTTccgcatttag", "AACCGGTT"),
        ("ccgcatttagAACCGGTT", "AACCGGTT"),
        ("ccgtatttagAACCGGTT", "AACCGGTT"),  # one mismatch
        ("ccgatttagAACCGGTT", "AACCGGTT"),  # one deletion
    ):
        read = SequenceRecord("foo", seq)
        cutter = AdapterCutter([adapter], times=1)
        trimmed_read = cutter(read, ModificationInfo(read))
        assert trimmed_read.sequence == expected_trimmed