File: filter_by_previous_contigs.cpp

package info (click to toggle)
minia 3.2.1%2Bgit20200522.4960a99-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,012 kB
  • sloc: cpp: 1,376; sh: 397; python: 208; makefile: 8
file content (149 lines) | stat: -rw-r--r-- 5,609 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// comments added a posteriori, 1 year later:
// script that keeps reads if they match at least one kmer from a contig file
// probably this was done for multi-k optimization? 

// to compile it is a bit tricky: go to dsk repository, put that script in dsk/utils folder and use that CMakeLists file: https://github.com/GATB/dsk/blob/12de3e1f49e04f372b80773e7c4bc829c28e5657/utils/CMakeLists.txt

#include <gatb/gatb_core.hpp>
#include <fstream>
#include <set>

// We use the required packages
using namespace std;


static /* probably important that it's static here too */
char revcomp (char s) {
	if (s == 'A') return 'T';
	else if (s == 'C') return 'G';
	else if (s == 'G') return 'C';
	else if (s == 'T') return 'A';
	else if (s == 'a') return 't';
	else if (s == 'c') return 'g';
	else if (s == 'g') return 'c';
	else if (s == 't') return 'a';
	return 'X';
}

static string revcomp (const string &s) {
	string rc;
	for (signed int i = s.length() - 1; i >= 0; i--) {rc += revcomp(s[i]);}
	return rc;
}

/********************************************************************************/
class FilterByPreviousContigs : public Tool
{
    public:

        /** */
        FilterByPreviousContigs () : Tool ("filter_by_previous_contigs")
    {
        getParser()->push_front (new OptionOneParam ("-previous-contigs",   "previous contigs"  , true));
        getParser()->push_front (new OptionOneParam ("-reads",              "reads"  , true));
        getParser()->push_front (new OptionOneParam (STR_KMER_SIZE,  "k-mer size used to generate previous contigs"  , true));
        getParser()->push_front (new OptionOneParam ("-out",                "output file that will contain filtered reads",           true));
    }

        /** */
        void execute ()
        {
            size_t kmerSize = getInput()->getInt (STR_KMER_SIZE);

            Integer::apply<Functor,Parameter> (kmerSize, Parameter(*this, kmerSize));
        }

        struct Parameter
        {
            Parameter (FilterByPreviousContigs& tool, size_t kmerSize) : tool(tool), kmerSize(kmerSize)  {}
            FilterByPreviousContigs& tool;
            size_t     kmerSize;
        };

        template<size_t span> struct Functor  {  void operator ()  (Parameter parameter)
            {
                FilterByPreviousContigs& tool     = parameter.tool;
                size_t                   kmerSize = parameter.kmerSize;
                unsigned int k = kmerSize;

                typedef typename Kmer<span>::Count Count;

                typename Kmer<span>::ModelCanonical model (kmerSize);

                BankFasta bank(tool.getInput()->getStr("-previous-contigs"));
                IBank* bankReads = Bank::open(tool.getInput()->getStr("-reads"));
                BankFasta bankOut(tool.getInput()->getStr("-out"));

                BankFasta::Iterator itCtg (bank);
                set<string> kmers;

                std::cout << "indexing previous contigs" << std::endl;

                for (itCtg.first(); !itCtg.isDone(); itCtg.next())
                {
                    string seq = itCtg->toString();
                    string kmerBegin = seq.substr(0, k );
                    string kmerEnd = seq.substr(seq.size() - k , k );

                    kmers.insert(kmerBegin);
                    kmers.insert(kmerEnd);
                    kmers.insert(revcomp(kmerBegin));
                    kmers.insert(revcomp(kmerEnd));
                }
                
                std::cout << "filtering reads" << std::endl;

                Iterator<Sequence>* itReads = bankReads->iterator();
                uint64_t nb_reads = 0, nb_kept = 0;

                for (itReads->first(); !itReads->isDone(); itReads->next())
                {
                    bool contains = false;
                    string seq = (*itReads)->toString();
                    nb_reads++;
                    int found_pos = 0;
                    for (int i = 0; i < seq.size() - k; i++)
                    {
                        string kmer = seq.substr(i, k );
                        contains = (kmers.find(kmer) != kmers.end());
                        if (contains) 
                        {
                            found_pos = i;
                            break;
                        }
                    }
                    if (contains)
                    {
                        Sequence s (Data::ASCII);
                        s.getData().setRef ((char*)seq.c_str(), seq.size());
                        s._comment = (*itReads)->getComment() + " contains previous-k kmer at pos " + to_string(found_pos);
                        bankOut.insert(s);
                        nb_kept++;
                    }
                }

                std::cout << "done filtering reads, kept " << nb_kept << "/" << nb_reads << std::endl;

                /** We gather some statistics. */
                tool.getInfo()->add (1, "stats");
                tool.getInfo()->add (2, "kmer_size", "%ld", kmerSize);
            }};
};

/********************************************************************************/
/*                       Dump solid kmers in ASCII format                       */
/********************************************************************************/
int main (int argc, char* argv[])
{
    /*    try
          {*/
    FilterByPreviousContigs().run (argc, argv);
    /*    }
          catch (Exception& e)
          {
          std::cout << "EXCEPTION: " << e.getMessage() << std::endl;
          return EXIT_FAILURE;
          }*/ 
    // gdb likes having clean exceptions
}