1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
|
# coding: utf-8
from __future__ import print_function, division, absolute_import
from cutadapt.colorspace import encode, decode
from cutadapt.scripts.cutadapt import main
from utils import run, datapath
# If there are any unknown characters in the test sequence,
# round tripping will only work if all characters after the
# first unknown character are also unknown:
# encode("TNGN") == "T444", but
# decode("T444") == "TNNN".
sequences = [
"",
"C",
"ACGGTC",
"TN",
"TN.",
"TNN.N",
"CCGGCAGCATTCATTACGACAACGTGGCACCGTGTTTTCTCGGTGGTA",
"TGCAGTTGATGATCGAAGAAAACGACATCATCAGCCAGCAAGTGC",
"CAGGGTTTGATGAGTGGCTGTGGGTGCTGGCGTATCCGGG"
]
def test_encode():
assert encode("AA") == "A0"
assert encode("AC") == "A1"
assert encode("AG") == "A2"
assert encode("AT") == "A3"
assert encode("CA") == "C1"
assert encode("CC") == "C0"
assert encode("CG") == "C3"
assert encode("CT") == "C2"
assert encode("GA") == "G2"
assert encode("GC") == "G3"
assert encode("GG") == "G0"
assert encode("GT") == "G1"
assert encode("TA") == "T3"
assert encode("TC") == "T2"
assert encode("TG") == "T1"
assert encode("TT") == "T0"
assert encode("TN") == "T4"
assert encode("NT") == "N4"
assert encode("NN") == "N4"
assert encode("ACGGTC") == "A13012"
assert encode("TTT.N") == "T0044"
assert encode("TTNT.N") == "T04444"
def test_decode():
for s in sequences:
expected = s.replace('.', 'N')
encoded = encode(s)
assert decode(encoded) == expected
assert decode('A.') == 'AN'
assert decode('C.') == 'CN'
assert decode('G.') == 'GN'
assert decode('T.') == 'TN'
def test_qualtrim_csfastaqual():
'''-q with csfasta/qual files'''
run("-c -q 10", "solidqual.fastq", "solid.csfasta", 'solid.qual')
def test_E3M():
'''Read the E3M dataset'''
# not really colorspace, but a fasta/qual file pair
main(['-o', '/dev/null', datapath("E3M.fasta"), datapath("E3M.qual")])
def test_bwa():
'''MAQ-/BWA-compatible output'''
run("-c -e 0.12 -a 330201030313112312 -x 552: --maq", "solidmaq.fastq", "solid.csfasta", 'solid.qual')
def test_bfast():
'''BFAST-compatible output'''
run("-c -e 0.12 -a 330201030313112312 -x abc: --strip-f3", "solidbfast.fastq", "solid.csfasta", 'solid.qual')
def test_trim_095():
'''some reads properly trimmed since cutadapt 0.9.5'''
run("-c -e 0.122 -a 330201030313112312", "solid.fasta", "solid.fasta")
def test_solid():
run("-c -e 0.122 -a 330201030313112312", "solid.fastq", "solid.fastq")
def test_solid_basespace_adapter():
'''colorspace adapter given in basespace'''
run("-c -e 0.122 -a CGCCTTGGCCGTACAGCAG", "solid.fastq", "solid.fastq")
def test_solid5p():
'''test 5' colorspace adapter'''
# this is not a real adapter, just a random string
# in colorspace: C0302201212322332333
run("-c -e 0.1 --trim-primer -g CCGGAGGTCAGCTCGCTATA", "solid5p.fasta", "solid5p.fasta")
def test_solid5p_prefix_notrim():
'''test anchored 5' colorspace adapter, no primer trimming'''
run("-c -e 0.1 -g ^CCGGAGGTCAGCTCGCTATA", "solid5p-anchored.notrim.fasta", "solid5p.fasta")
def test_solid5p_prefix():
'''test anchored 5' colorspace adapter'''
run("-c -e 0.1 --trim-primer -g ^CCGGAGGTCAGCTCGCTATA", "solid5p-anchored.fasta", "solid5p.fasta")
def test_solid5p_fastq():
'''test 5' colorspace adapter'''
# this is not a real adapter, just a random string
# in colorspace: C0302201212322332333
run("-c -e 0.1 --trim-primer -g CCGGAGGTCAGCTCGCTATA", "solid5p.fastq", "solid5p.fastq")
def test_solid5p_prefix_notrim_fastq():
'''test anchored 5' colorspace adapter, no primer trimming'''
run("-c -e 0.1 -g ^CCGGAGGTCAGCTCGCTATA", "solid5p-anchored.notrim.fastq", "solid5p.fastq")
def test_solid5p_prefix_fastq():
'''test anchored 5' colorspace adapter'''
run("-c -e 0.1 --trim-primer -g ^CCGGAGGTCAGCTCGCTATA", "solid5p-anchored.fastq", "solid5p.fastq")
def test_sra_fastq():
'''test SRA-formatted colorspace FASTQ'''
run("-c -e 0.1 --format sra-fastq -a CGCCTTGGCCGTACAGCAG", "sra.fastq", "sra.fastq")
def test_no_zero_cap():
run("--no-zero-cap -c -e 0.122 -a CGCCTTGGCCGTACAGCAG", "solid-no-zerocap.fastq", "solid.fastq")
|