1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
|
#include <fstream>
#include <gtest/gtest.h>
#include <unit_tests/test_main.hpp>
#include <jellyfish/mer_overlap_sequence_parser.hpp>
#include <jellyfish/whole_sequence_parser.hpp>
#include <jellyfish/mer_dna.hpp>
#include <jellyfish/mer_iterator.hpp>
#include <jellyfish/mer_qual_iterator.hpp>
namespace {
using std::string;
using jellyfish::mer_dna;
typedef std::vector<string> string_vector;
template<typename Iterator>
struct opened_streams {
Iterator begin_, end_;
opened_streams(Iterator begin, Iterator end) : begin_(begin), end_(end) { }
jellyfish::stream_type next() {
jellyfish::stream_type res;
if(begin_ != end_) {
res.standard.reset(*begin_);
++begin_;
}
return res;
}
};
typedef jellyfish::mer_overlap_sequence_parser<opened_streams<std::ifstream**> > parser_type;
typedef jellyfish::mer_iterator<parser_type, mer_dna> mer_iterator_type;
typedef jellyfish::whole_sequence_parser<opened_streams<std::ifstream**> > parser_qual_type;
typedef jellyfish::mer_qual_iterator<parser_qual_type, mer_dna> mer_qual_iterator_type;
static string generate_sequence(int len) {
static const char bases[4] = { 'A', 'C', 'G', 'T' };
string res;
res.reserve(len);
for(int i = 0; i < len; ++i)
res += bases[random() & 0x3];
return res;
}
static string generate_quals(int len) {
string res;
res.reserve(len);
for(int i = 0; i < len; ++i)
res += static_cast<char>('!' + (random() % 94));
return res;
}
TEST(MerIterator, Sequence) {
static const int nb_sequences = 100;
const char* file_name = "Sequence.fa";
file_unlink fu(file_name);
string_vector sequences;
mer_dna::k(35);
{
std::ofstream input_fasta(file_name);
for(int i = 0; i < nb_sequences; ++i) {
sequences.push_back(generate_sequence(20 + random() % 200));
input_fasta << ">" << i << "\n" << sequences.back() << "\n";
}
}
// Check that every mer of the iterator matches the sequence
auto input_fasta = new std::ifstream(file_name);
{
opened_streams<std::ifstream**> streams(&input_fasta, &input_fasta + 1);
parser_type parser(mer_dna::k(), 1, 10, 100, streams);
mer_iterator_type mit(parser);
for(string_vector::const_iterator it = sequences.begin(); it != sequences.end(); ++it) {
if(it->size() >= mer_dna::k()) {
for(size_t i = 0; i < it->size() - (mer_dna::k() - 1); ++i, ++mit) {
ASSERT_NE(mer_iterator_type(), mit);
EXPECT_EQ(it->substr(i, mer_dna::k()), mit->to_str());
}
}
}
EXPECT_EQ(mer_iterator_type(), mit);
}
}
// Same but with canonical mers
TEST(MerIterator, SequenceCanonical) {
const char* file_name = "SequenceCanonical.fa";
file_unlink fu(file_name);
string_vector sequences;
static const int nb_sequences = 100;
mer_dna::k(35);
{
std::ofstream input_fasta(file_name);
for(int i = 0; i < nb_sequences; ++i) {
sequences.push_back(generate_sequence(20 + random() % 200));
input_fasta << ">" << i << "\n" << sequences.back() << "\n";
}
}
auto input_fasta = new std::ifstream(file_name);
{
opened_streams<std::ifstream**> streams(&input_fasta, &input_fasta + 1);
parser_type parser(mer_dna::k(), 1, 10, 100, streams);
mer_iterator_type mit(parser, true);
for(string_vector::const_iterator it = sequences.begin(); it != sequences.end(); ++it) {
if(it->size() >= mer_dna::k()) {
for(size_t i = 0; i < it->size() - (mer_dna::k() - 1); ++i, ++mit) {
ASSERT_NE(mer_iterator_type(), mit);
ASSERT_EQ(*mit, mit->get_canonical());
EXPECT_TRUE((it->substr(i, mer_dna::k()) == mit->to_str()) ||
(it->substr(i, mer_dna::k()) == mit->get_reverse_complement().to_str()));
}
}
}
EXPECT_EQ(mer_iterator_type(), mit);
}
}
TEST(MerQualIterator, Sequence) {
const char* file_name = "Sequence.fq";
file_unlink fu(file_name);
string_vector sequences;
string_vector quals;
static const int nb_sequences = 100;
mer_dna::k(35);
for(int i = 0; i < nb_sequences; ++i) {
const int len = 20 + (random() % 200);
sequences.push_back(generate_sequence(len));
quals.push_back(generate_quals(len));
}
{
std::ofstream input_fasta(file_name);
for(size_t i = 0; i < sequences.size(); ++i)
input_fasta << "@" << i << "\n"
<< sequences[i] << "\n"
<< "+\n"
<< quals[i] << "\n";
}
auto input1_fastq = new std::ifstream(file_name);
auto input2_fastq = new std::ifstream(file_name);
{
opened_streams<std::ifstream**> streams1(&input1_fastq, &input1_fastq + 1);
parser_qual_type parser1(10, 10, 1, streams1);
mer_qual_iterator_type mit(parser1, '#', false);
opened_streams<std::ifstream**> streams2(&input2_fastq, &input2_fastq + 1);
parser_qual_type parser2(10, 10, 1, streams2);
mer_qual_iterator_type mit_canonical(parser2, '#', true);
for(string_vector::const_iterator it = sequences.begin(), qit = quals.begin(); it != sequences.end(); ++it, ++qit) {
if(it->size() >= mer_dna::k()) {
for(size_t i = 0; i < it->size() - (mer_dna::k() - 1); ++i) {
const size_t next_N = qit->find_first_of("!\"", i);
SCOPED_TRACE(::testing::Message() << "i:" << i << " next_N:" << next_N);
if(next_N != std::string::npos && next_N >= i && next_N < i + mer_dna::k())
continue;
ASSERT_NE(mer_iterator_type(), mit);
EXPECT_EQ(it->substr(i, mer_dna::k()), mit->to_str());
ASSERT_NE(mer_iterator_type(), mit_canonical);
ASSERT_EQ(*mit_canonical, mit_canonical->get_canonical());
EXPECT_TRUE((it->substr(i, mer_dna::k()) == mit_canonical->to_str()) ||
(it->substr(i, mer_dna::k()) == mit_canonical->get_reverse_complement().to_str()));
++mit;
++mit_canonical;
}
}
}
EXPECT_EQ(mer_iterator_type(), mit);
}
}
} // namespace {
|