1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
|
#include <fstream>
#include <iostream>
#include <string>
#include <random>
#include <seqan/arg_parse.h>
#include <seqan/store.h>
#include <seqan/parallel.h>
#ifdef _OPENMP
#include <omp.h>
#endif
using namespace seqan2;
template <typename TReads, typename TErrorDist, typename TStore>
void simulateReads(TReads & reads, TErrorDist const & errorDist, TStore const & store)
{
String<std::mt19937 > rng;
#ifdef _OPENMP
resize(rng, omp_get_max_threads());
#else
resize(rng, 1);
#endif
std::uniform_int_distribution<unsigned> distContig(0, length(store.contigStore) - 1);
std::uniform_int_distribution<unsigned> distOrientation(0, 1);
std::uniform_real_distribution<double> distFrac(0.0, 1.0);
std::uniform_int_distribution<unsigned> distSubstitute(1, 3);
unsigned readLen = length(errorDist);
SEQAN_OMP_PRAGMA(parallel for)
for (int i = 0; i < (int)length(reads); ++i)
{
unsigned id = 0;
#ifdef _OPENMP
id = omp_get_thread_num();
#endif
unsigned contigId = distContig(rng[id]);
unsigned pos;
bool ok;
do
{
std::uniform_int_distribution<unsigned> distPos(0, length(store.contigStore[contigId].seq) - readLen);
pos = distPos(rng[id]);
reads[i] = infix(store.contigStore[contigId].seq, pos, pos + readLen);
ok = true;
for (unsigned j = 0; j < readLen; ++j)
if (ordValue(reads[i][j]) == 4)
{
ok = false;
break;
}
}
while (!ok);
if (distOrientation(rng[id]) == 1)
reverseComplement(reads[i]);
for (unsigned j = 0; j < readLen; ++j)
{
if (distFrac(rng[id]) < errorDist[j])
{
reads[i][j] = (Dna5)((ordValue(reads[i][j]) + distSubstitute(rng[id])) & 3);
assignQualityValue(reads[i][j], 0);
}
else
assignQualityValue(reads[i][j], 40);
}
}
}
int main(int argc, const char * argv[])
{
//////////////////////////////////////////////////////////////////////////////
// Define options
ArgumentParser parser;
addUsageLine(parser, "[OPTION]... <reference.fasta> <error_dist file>");
CharString outputFilename = "reads.fa";
unsigned numReads = 1000000;
String<double> errorProb;
addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "REFERENCE FILE"));
addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "ERROR DIST FILE"));
addOption(parser, ArgParseOption("n", "num-reads", "number of reads", ArgParseOption::INTEGER));
addOption(parser, ArgParseOption("o", "output", "output read filename", ArgParseOption::STRING));
ArgumentParser::ParseResult res = parse(parser, argc, argv);
if (res != ArgumentParser::PARSE_OK)
return res == ArgumentParser::PARSE_ERROR;
//////////////////////////////////////////////////////////////////////////////
// Extract and check options
getOptionValue(numReads, parser, "num-reads");
getOptionValue(outputFilename, parser, "output");
CharString distFile;
getArgumentValue(distFile, parser, 1);
std::ifstream errorDistFile(toCString(distFile), std::ios_base::in | std::ios_base::binary);
while (true)
{
double error;
errorDistFile >> error;
if (!errorDistFile.good())
break;
appendValue(errorProb, error);
}
FragmentStore<> store;
CharString referenceFile;
getArgumentValue(distFile, parser, 0);
if (!loadContigs(store, toCString(referenceFile)))
{
std::cerr << "Failed to load genome." << std::endl;
return 1;
}
//////////////////////////////////////////////////////////////////////////////
// Simulate reads
String<String<Dna5Q> > reads;
resize(reads, numReads);
simulateReads(reads, errorProb, store);
//////////////////////////////////////////////////////////////////////////////
// Output losses to file
std::ofstream readFile(toCString(outputFilename));
for (unsigned i = 0; i < length(reads); ++i)
{
readFile << "@read" << i << std::endl;
readFile << reads[i] << std::endl;
readFile << "+" << std::endl;
for (unsigned j = 0; j < length(reads[i]); ++j)
readFile << (char)('!' + getQualityValue(reads[i][j]));
readFile << std::endl;
}
return 0;
}
|