File: KissReadsGraph.cpp

package info (click to toggle)
mapsembler2 2.2.4%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 8,208 kB
  • sloc: cpp: 51,300; ansic: 13,434; sh: 483; makefile: 394; asm: 271; python: 28
file content (298 lines) | stat: -rw-r--r-- 11,660 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
//
//  KissReadsGraph.cpp
//  kissreads_g
//
//  Created by Pierre Peterlongo on 02/01/13.
//  Copyright (c) 2013 Pierre Peterlongo. All rights reserved.
//

#include"commons.h"
#include"Loader.h"
#include"ReadMapper.h"
#include <zlib.h> // Added by Pierre Peterlongo on 02/08/2012.


extern int size_seeds;
extern unsigned int nbits_nbseeds;
extern uint64_t  mask_nbseed ;
extern uint64_t  mask_offset_seed;

char * getVersion(){
	return (char *)"1.0.0 Beta - Copyright INRIA - CeCILL License";
}
//#define VERBOSE


void print_usage_and_exit(char * name){
	fprintf (stderr, "NAME\n%s, version %s\n", name, getVersion());
	fprintf (stderr, "\t -h prints this message and exit\n");
	fprintf (stderr, "\nUSAGE\n%s input_graph <readsC1.fasta/fastq[.gz]> [<readsC2.fasta/fastq[.gz]> [<readsC3.fasta/fastq[.gz] ...] -M -t type [-k value] [-c min_coverage] [-d max_substitutions] [-o name] [-h] \n", name);
	fprintf (stderr, "\nDESCRIPTION\n");
	fprintf (stderr, "%s maps the provided reads on the graph\n",name);
	fprintf (stderr, "With option \"-t coverage\": outputs an equivalent graph removing uncovered edges and adding:\n");
	fprintf (stderr, "\t for each node: the coverage per sample and per position\n");
    fprintf (stderr, "\t for each edge: the number of mapped reads per sample using this edge\n");
	fprintf (stderr, "With option \"-t modify\": outputs the same simplified graph:\n");
	fprintf (stderr, "\t removing low covered edges and nodes (less that min_coverage)\n");
    fprintf (stderr, "\t then phasing simple non branching paths\n");
    fprintf (stderr, "If -M option is specified, the input is considered as a Mapsembler output - thus composed of multiple independent graphes\n");
    
    
	fprintf (stderr, "\nMANDATORY\n");
	fprintf (stderr, "\t -t <type> \n");
	fprintf (stderr, "\t\t \"c\" \"coverage\" \n");
    fprintf (stderr, "\t\t \"m\" \"modify\" \n");
    
	fprintf (stderr, "\nOPTIONS\n");
    fprintf (stderr, "-M: the input is considered as a Mapsembler output\n");
	fprintf (stderr, "\t -o file_name: write obtained graph. Default: standard output \n");
	fprintf (stderr, "\t -k size_seed: will use seeds of length size_seed. Default: 25.\n");
	fprintf (stderr, "\t -c min_coverage: Will consider an edge as coherent if coverage (number of reads from all sets using this edge) is at least min_coverage. Default: 2 \n");
	fprintf (stderr, "\t -d max_substitutions: Will map a read on the graph with at most max_substitutions substitutions. Default: 1 \n");
    
    
	exit(0);
}




int main(int argc, char **argv) {
	setvbuf(stdout,(char*)NULL,_IONBF,0); // disable buffering for printf()
	size_seeds=25;
	int min_coverage=2;
    int max_substitutions = 1;
    bool modify=false;
    bool option_t_seen=false;
    bool input_mapsembler=false;
    
    if(argc<2) print_usage_and_exit(argv[0]);
    char * toCheck_file =strdup(argv[1]);
    char * output_file=NULL;
	// GET ALL THE READ FILES
	// find the number of read sets
    int number_of_read_sets=0;
	while(number_of_read_sets+2<argc && argv[number_of_read_sets+2][0]!='-') number_of_read_sets++;
	char ** reads_file_names = (char **) malloc(sizeof(char *)*number_of_read_sets);
    // copy the read set file names
	number_of_read_sets=0;
	while(number_of_read_sets+2<argc && argv[number_of_read_sets+2][0]!='-'){
		reads_file_names[number_of_read_sets]=strdup(argv[number_of_read_sets+2]);
		number_of_read_sets++;
	}
    
    if(number_of_read_sets==0)
        print_usage_and_exit(argv[0]);
    Bank *reads = new Bank(argv+2,number_of_read_sets);
    
	while (1)
	{
        int temoin = getopt (argc-number_of_read_sets-1, &argv[number_of_read_sets+1], "hd:c:k:o:t:M");
		if (temoin == -1){
			break;
		}
		switch (temoin)
		{
            case 'M':
                input_mapsembler=true;
                printf("-M option caught: The input graph is considered as generated by Mapsembler\n");
                break;
            case 'c':
                min_coverage=atoi(optarg);
                if(min_coverage<1) min_coverage=1;
                printf(" Will consider an edge as coherent if coverage (number of reads from all sets using this edge) is at least %d.\n", min_coverage);
                break;
            case 'd':
                max_substitutions=atoi(optarg);
                if(max_substitutions<0) max_substitutions=0;
                printf(" Will map a read on the graph with at most %d substitutions.\n", max_substitutions);
                break;
            case 'k':
                size_seeds=atoi(optarg);
                
                if(size_seeds<5){
                    fprintf(stderr,"%d too small to be used as seed length. kisSnpCheckReads continues with seeds of length 5\n",size_seeds);
                    size_seeds=10;
                }
                if(size_seeds%2==0){
                    size_seeds-=1;
                    printf("Need odd kmer size to avoid palindromes. I've set kmer size to %d.\n",size_seeds);
                }
                else
                    printf(" Will use seeds of length %d\n", size_seeds);
                break;
            case 'o':
                output_file=strdup(optarg);
                break;
            case 'h':
                print_usage_and_exit(argv[0]);
                break;
            case 't':
                if(strcmp(optarg,"m")==0 || strcmp(optarg,"modify")==0) {
                    printf("%s will modify the graph\n", argv[0]);
                    modify=true;
                }
                if(strcmp(optarg,"c")==0 || strcmp(optarg,"coverage")==0) {
                    printf("%s will compute the coverage of the graph\n", argv[0]);
                    modify=false;
                }
                option_t_seen=true;
                break;
            case '-':
                if(strcmp("version", optarg)==0){
                    printf("kissReads graph version %s\n", getVersion());
                    exit(0);
                }
                printf(" what next ? %s\n",optarg);
                break;
            default:
                //			printf ("Unknown option %c\n", temoin);
                print_usage_and_exit(argv[0]);
		}
	}
    
	if ( argc  - optind <2)
	{
		print_usage_and_exit(argv[0]);
	}
    
    if(!option_t_seen){
        fprintf(stderr,"Error, -t is mandatory, please chose between \"modify\" or \"coverage\"\n");
        print_usage_and_exit(argv[0]);
    }
    
    init_static_variables();
    nbits_nbseeds = 8*sizeof(uint64_t)- NBITS_OFFSET_SEED ;
    mask_nbseed  = ( 1ULL <<  (uint64_t) nbits_nbseeds  ) -1 ;
    mask_offset_seed = (1ULL << (NBITS_OFFSET_SEED)) -1 ;
    
    
    // LOAD and CHECK THE GRAPH
    printf("load graph(s)...");
    vector<DBG*> dbgs;
    if(input_mapsembler){
        dbgs = load_mapsembler_json(argv[1], number_of_read_sets);
    }
    else
        dbgs.push_back(load_json(argv[1], number_of_read_sets));
    
    
    for(int i=0;i<dbgs.size();i++){
        printf("graph %d...",i);
//        dbgs[i]->depth_first_print();
        dbgs[i]->alloc_coverage_quality(number_of_read_sets);
        dbgs[i]->threshold_substitutions=max_substitutions;
//        dbgs[i]->depth_first_print();
        
        printf(" checked...");
        dbgs[i]->depth_first_check();
        
//        dbgs[i]->init_boolean_vector();
        
        printf(" indexed\n");
        // INDEX kmers in the graph
        dbgs[i]->index_nodes();
        
        //    // Map the reads
        //    printf("\"Unpalindromize\" palindromic nodes... ");
        //    int nb_unp = dbgs[i]->unpalindromize();
        //    printf("done, %d nodes were \"Unpalindromized\"\n", nb_unp);
        
        //    exit(0);
        
        // DEBUG GET NEXT READ FROM FILE:
//        char * read;        int size_read;
//        for(int file_id=0;file_id<reads->nb_files;file_id++){
//            printf("file_id %d\n", file_id);
//            int nb=0;
//            reads->open_stream(file_id);
//            while(reads->get_next_seq_from_file(&read,&size_read, file_id)){ // each read
//                printf("%s\n", read);
//                if(nb%10==0)
//                    printf("\r %d reads mapped (file_id %d)", nb, file_id);
//                nb++;
//            }
//            reads->close_stream(file_id);
//            
//        }
        // END DEBUG GET NEXT READ FROM FILE:
            
        // Map the reads
        
        reads->rewind_all();
    }
    
    printf("Mapping the reads... \n");
    uint64_t nb_mapped_reads = map_reads(reads, dbgs);
    printf("done, %ld reads were mapped with a unique best match in at least one of the input graph(s)\n",(long int) nb_mapped_reads);
    
    for(int i=0;i<dbgs.size();i++){
        
        
        
        if(modify){
            
            printf("Remove uncovered edges... ");
            int number_edges_removed = dbgs[i]->remove_uncovered_edges(min_coverage, number_of_read_sets);
            printf("done, %d edge(s) covered with less than %d reads were removed\n", number_edges_removed, min_coverage);
            
            
            printf("Remove uncovered nodes... ");
            int number_nodes_removed = dbgs[i]->remove_uncovered_nodes(min_coverage, number_of_read_sets);
            printf("done, %d nodes(s) on which all positions are covered with less than %d reads %s removed\n", number_nodes_removed, min_coverage, number_nodes_removed>1?"were":"was");
            
            
            printf("Simplify simple paths... ");
            int number_simplified = dbgs[i]->simplify_simple_paths_couples ();
            printf("done, %d nodes couple%s merged: into [==]  \n", number_simplified, number_simplified>1?"s [=]->[=] were":" [=]->[=] was");
            
            
            
            printf("Remove unreachable nodes... ");
            number_nodes_removed = dbgs[i]->depth_first_clean();
            printf("done, %d unreachable nodes(s)  %s removed\n", number_nodes_removed, number_nodes_removed>1?"were":"was");
            
//            printf("Duplicate deaded paths... ");
//            int number_de_duplicated = dbgs[i]->duplicate_deadend_nodes();
//            printf("done, %d deadend path%s duplicated\n", number_de_duplicated, number_de_duplicated>1?"s were":" was");
//            
//            printf("Phase paths... ");
//            int number_duplicated = dbgs[i]->duplicate_simple_nodes();
//            printf("done, %d path%s detected and phased\n", number_duplicated, number_duplicated>1?"s were":" was");
            
            ////#define TRIO
            //#ifdef TRIO
            //        int number_simplified = dbgs[i]->simplify_simple_paths();
            //        printf("done, %d nodes trio ->[=]->[=]->[=]-> were merged: into ->[===]->  \n", number_simplified);
            //#else
            
            //#endif
        }
    } // ALL GRAPHS
    
    
    printf("Write the obtained graph%s... \n",(input_mapsembler&&dbgs.size()>1)?"s":"");
    
    FILE * res = stdout;
    if(output_file)
        res = fopen(output_file,"w");
    if(input_mapsembler){
        DBG::output_graphs(argv[1], res, number_of_read_sets, !modify, dbgs);
    }
    
    else{
        int edge_id=0;
        dbgs[0]->output_graph(res, number_of_read_sets, edge_id, !modify);
//        if(modify)
//            dbgs[0]->output_graph(res, number_of_read_sets, false);
//        else
//            dbgs[0]->output_graph(res, number_of_read_sets, true);
    }
    if(output_file) fclose(res);
    
    if(output_file)
        printf("All done, result graph is in file %s - good bye\n", output_file);
    else
        printf("All done, good bye\n");
    return 0;
}