"""
Tests for the sqt.io.fasta module
"""
from io import StringIO
import pytest
from sqt.io.fasta import (FastaReader, FastaWriter, Sequence, FastqWriter,
SequenceReader, fastq_header)
import os.path
def dpath(path):
    """Return *path* joined onto the directory that contains this module."""
    here = os.path.dirname(__file__)
    return os.path.join(here, path)
def test_fastqwriter():
    """FastqWriter should emit four-line records with a bare '+' separator.

    Two records are written to a temporary file next to this module. After
    the ``with`` block the writer's underlying file must be closed and the
    file content must match the expected FASTQ text exactly.
    """
    tmp = dpath("tmp.fastq")
    try:
        with FastqWriter(tmp) as fq:
            fq.write("name", "CCATA", "!#!#!")
            # The quality string here is one character longer than the
            # sequence; the expected output shows it being written verbatim.
            fq.write("name2", "HELLO", "&&&!&&")
        assert fq._file.closed
        with open(tmp) as t:
            assert t.read() == '@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n'
    finally:
        # Clean up even when an assertion fails so that a stale temporary
        # file is never left behind in the test directory.
        if os.path.exists(tmp):
            os.remove(tmp)
def test_fastqwriter_twoheaders():
    """With ``twoheaders=True`` the record name must be repeated after the '+'.

    Writes the same two records as the plain FastqWriter test and compares
    the resulting file content against the expected text.
    """
    tmp = dpath("tmp.fastq")
    try:
        with FastqWriter(tmp, twoheaders=True) as fq:
            fq.write("name", "CCATA", "!#!#!")
            fq.write("name2", "HELLO", "&&&!&&")
        assert fq._file.closed
        with open(tmp) as t:
            assert t.read() == '@name\nCCATA\n+name\n!#!#!\n@name2\nHELLO\n+name2\n&&&!&&\n'
    finally:
        # Remove the temporary file regardless of the test outcome.
        if os.path.exists(tmp):
            os.remove(tmp)
def test_fastawriter():
    """FastaWriter should write '>name' header lines followed by the sequence.

    Records are passed as separate name and sequence arguments. The writer's
    file must be closed once the ``with`` block has exited.
    """
    tmp = dpath("tmp.fasta")
    try:
        with FastaWriter(tmp) as fw:
            fw.write("name", "CCATA")
            fw.write("name2", "HELLO")
        assert fw._file.closed
        with open(tmp) as t:
            assert t.read() == '>name\nCCATA\n>name2\nHELLO\n'
    finally:
        # Remove the temporary file regardless of the test outcome.
        if os.path.exists(tmp):
            os.remove(tmp)
def test_fastawriter_linelength():
    """With ``line_length=3`` sequences must be wrapped after three characters.

    Covers a sequence that leaves a short final line ("CCAT") and one whose
    length is an exact multiple of the line length ("TACCAG").
    """
    tmp = dpath("tmp.fasta")
    try:
        with FastaWriter(tmp, line_length=3) as fw:
            fw.write("name", "CCAT")
            fw.write("name2", "TACCAG")
        assert fw._file.closed
        with open(tmp) as t:
            d = t.read()
        assert d == '>name\nCCA\nT\n>name2\nTAC\nCAG\n'
    finally:
        # Remove the temporary file regardless of the test outcome.
        if os.path.exists(tmp):
            os.remove(tmp)
def test_fastawriter_sequence():
    """FastaWriter.write should also accept a single Sequence object.

    The output must be identical to writing the name and sequence as two
    separate arguments.
    """
    tmp = dpath("tmp.fasta")
    try:
        with FastaWriter(tmp) as fw:
            fw.write(Sequence("name", "CCATA"))
            fw.write(Sequence("name2", "HELLO"))
        assert fw._file.closed
        with open(tmp) as t:
            assert t.read() == '>name\nCCATA\n>name2\nHELLO\n'
    finally:
        # Remove the temporary file regardless of the test outcome.
        if os.path.exists(tmp):
            os.remove(tmp)
def test_fastawriter_contextmanager():
    """Re-entering a FastaWriter after its ``with`` block exited must fail.

    The first ``with`` is expected to succeed and close the writer; only the
    second entry is expected to raise ValueError.
    """
    tmp = dpath("tmp.fasta")
    fw = FastaWriter(tmp)
    try:
        with fw:
            pass
        # Keep the expected-exception scope as narrow as possible: a
        # ValueError raised during setup or by the first ``with`` must not
        # make this test pass by accident.
        with pytest.raises(ValueError):
            with fw:
                pass
    finally:
        # Remove the file only after the writer is done with it; deleting a
        # file that is still open is not portable.
        if os.path.exists(tmp):
            os.remove(tmp)
def test_fastareader():
    """Read seq.fa with ``case='keep'`` and spot-check its three records.

    Checks that the reader's file is closed after the ``with`` block, that
    no qualities are attached, and that names, the first record's length
    and the (case-preserved) sequence prefixes/suffix are as expected.
    """
    with FastaReader(dpath("seq.fa"), case='keep') as fr:
        records = list(fr)
    assert fr._file.closed
    assert len(records) == 3
    chr1, chr2, chr3 = records
    assert chr1.qualities is None
    assert chr1.name == 'Chr1'
    assert chr2.name == 'Chr2 CHROMOSOME dumped from ADB: Jun/20/09 14:54; last updated: 2009-02-02'
    assert len(chr1.sequence) == 1235
    assert chr1.sequence.startswith('CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATC')
    # The second record is stored in lower case and must stay that way.
    assert chr2.sequence.startswith('ctcgaccaggacgatgaatgggc')
    assert chr3.sequence.endswith('AATCTTGCAAGTTCCAACTAATT')
def test_fastareader_upper():
    """Without a ``case`` argument, sequences are expected in upper case.

    The second record of seq.fa is lower case in the file (see the
    ``case='keep'`` test), so its prefix is compared in upper case here.
    """
    with FastaReader(dpath("seq.fa")) as fr:
        records = list(fr)
    chr1 = records[0]
    assert chr1.name == 'Chr1'
    assert len(chr1.sequence) == 1235
    assert chr1.sequence.startswith('CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATC')
    chr2 = records[1]
    assert chr2.sequence.startswith('CTCGACCAGGACGATGAATGGGC')
def test_fastareader_lower():
    """With ``case='lower'`` every returned sequence is expected in lower case."""
    with FastaReader(dpath("seq.fa"), case='lower') as fr:
        records = list(fr)
    chr1 = records[0]
    assert chr1.name == 'Chr1'
    assert len(chr1.sequence) == 1235
    # Upper case in the file, so this verifies the conversion took place.
    assert chr1.sequence.startswith('ccctaaaccctaaaccctaaaccctaaacctctgaatccttaatc')
    chr2 = records[1]
    assert chr2.sequence.startswith('ctcgaccaggacgatgaatgggc')
def test_fastareader_binary():
    """Reading in binary mode must yield ``bytes`` sequences.

    Binary mode is requested both as ``binary=True`` and as ``mode='rb'``,
    each with ``wholefile`` off and on. Each reader is checked immediately
    after its own ``with`` block, so the closed-file assertion covers every
    reader instead of only the one created last.
    """
    for wholefile in False, True:
        # Shown by pytest on failure; identifies the failing combination.
        print('wholefile:', wholefile)
        for binary_option in ({'binary': True}, {'mode': 'rb'}):
            with FastaReader(dpath("seq.fa"), wholefile=wholefile, case='keep',
                             **binary_option) as fr:
                seqs = list(fr)
            assert fr._file.closed
            assert len(seqs) == 3
            assert seqs[0].qualities is None
            assert seqs[0].name == 'Chr1'
            assert seqs[2].name == 'Chr3 CHROMOSOME dumped from ADB: Jun/20/09 14:54; last updated: 2009-02-02'
            assert len(seqs[0].sequence) == 1235
            assert seqs[0].sequence.startswith(b'CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATC')
            assert seqs[1].sequence.startswith(b'ctcgaccaggacgatgaatgggc')
            assert seqs[2].sequence.endswith(b'AATCTTGCAAGTTCCAACTAATT')
def test_sequence_reader():
    """SequenceReader should auto-detect FASTA vs FASTQ input."""
    expected_formats = (
        ("seq.fa", 'fasta'),
        ("seq.fastq", 'fastq'),
    )
    for filename, expected in expected_formats:
        with SequenceReader(dpath(filename)) as sr:
            assert sr.format == expected
def test_fastareader_contextmanager():
    """Re-entering a FastaReader after its ``with`` block exited must fail.

    The first ``with`` is expected to succeed and close the reader; only the
    second entry is expected to raise ValueError.
    """
    fr = FastaReader(dpath("seq.fa"))
    with fr:
        pass
    # Keep the expected-exception scope narrow: a ValueError raised while
    # constructing the reader or by the first ``with`` must not make this
    # test pass by accident.
    with pytest.raises(ValueError):
        with fr:
            pass
def test_fastq_header():
    """fastq_header should extract instrument, run, flowcell, lane and barcode.

    Each case feeds one header line through a StringIO and compares the
    parsed fields. Two header layouts are covered: colon-separated fields
    with a trailing ``1:N:0:BARCODE`` comment, and a shorter form ending in
    ``#BARCODE/1``, for which run and flowcell are expected to be None.
    """
    fields = ('instrument', 'run', 'flowcell', 'lane', 'barcode')
    # (header line, expected values in the order given by ``fields``)
    cases = [
        ('@HWI-ST344:204:D14G8ACXX:8:1101:1638:2116 1:N:0:CGATGT',
         ('HWI-ST344', 204, 'D14G8ACXX', 8, 'CGATGT')),
        ('@MISEQ:56:000000000-A4YM7:1:1101:15071:2257 1:N:0:CTTGTA',
         ('MISEQ', 56, '000000000-A4YM7', 1, 'CTTGTA')),
        # A '#0' suffix is expected to be reported as "no barcode".
        ('@HWI-ST552_0:4:1101:1179:1939#0/1',
         ('HWI-ST552_0', None, None, 4, None)),
        ('@HWI_ST139:8:1:1202:1874#GATCAG/1',
         ('HWI_ST139', None, None, 8, 'GATCAG')),
    ]
    # NOTE(review): a case for '@FCD20MKACXX:8:1101:1215:2155#TCGTAAGC/1'
    # (expected: instrument None, run None, flowcell 'FCD20MKACXX', lane 8,
    # barcode 'TCGTAAGC') was disabled here; presumably that layout is not
    # supported by fastq_header yet -- confirm before re-enabling.
    for line, expected in cases:
        h = fastq_header(StringIO(line))
        for field, value in zip(fields, expected):
            actual = getattr(h, field)
            message = '{0!r}: {1} is {2!r}, expected {3!r}'.format(
                line, field, actual, value)
            if value is None:
                assert actual is None, message
            else:
                assert actual == value, message
|