File: coordMapper.cpp

package info (click to toggle)
libgenome 1.3.1-7
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,156 kB
  • ctags: 1,212
  • sloc: cpp: 10,910; sh: 8,264; makefile: 79
file content (353 lines) | stat: -rw-r--r-- 12,153 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "libGenome/gnSequence.h"
#include <iostream>
#include <fstream>
#include <algorithm>

struct ExMem{
	gnSeqI length;
	int64 ex_start;
	int64 in_start;

};

static boolean LocationLessthan(const gnLocation& a, const gnLocation& b);

static boolean LocationLessthan(const gnLocation& a, const gnLocation& b){
	return a.GetStart() < b.GetStart();
}

static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b);

static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b){
	return a.GetEnd() < b.GetStart();
}

static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b);

static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b){
	return a.GetEnd() - a.GetStart() < b.GetEnd() - b.GetStart();
}

static boolean ExMemLessthan(const ExMem& a, const ExMem& b){
	if(a.ex_start == b.ex_start){
		if(a.in_start == b.in_start){
			return a.length < b.length;
		}
		return a.in_start < b.in_start;
	}
	return a.ex_start < b.ex_start;
}

void print_usage(char* pname){
	cout << "Usage: " << pname << " <genbank file> <exon_list> <intron_list> <mem_list> <regulation_network> <minimum match size>\n";
}

void load_map_file(string& filename, vector<gnLocation>& loc_list){
	ifstream coord_file;
	coord_file.open(filename.c_str());
	if(!coord_file.is_open()){
		cout << "couldn't open file\n";
		return;
	}

	//read all the exon coordinates
	while(coord_file.good()){
		gnSeqI start_coord, end_coord;
		coord_file >> start_coord;
		coord_file >> end_coord;
		
		gnLocation new_location(start_coord, end_coord);
		loc_list.push_back(new_location);
	}
}

void map_coordinates(vector<gnLocation>& loc_list, vector<gnLocation>& map_list){
	gnSeqI curStart = 1;
	for(uint32 locationI = 0; locationI < loc_list.size(); locationI++){
		gnSeqI cur_len = loc_list[locationI].GetEnd() - loc_list[locationI].GetStart() + 1;
		gnLocation map_location(curStart, curStart+cur_len-1);
		map_list.push_back(map_location);
		curStart += cur_len;
	}
}

void map_list_coordinates(list<gnLocation>& loc_list, list<gnLocation>& map_list){
	gnSeqI curStart = 1;
	list<gnLocation>::iterator loc_iter = loc_list.begin();
	for(; loc_iter != loc_list.end(); loc_iter++){
		gnSeqI cur_len = loc_iter->GetEnd() - loc_iter->GetStart() + 1;
		gnLocation map_location(curStart, curStart+cur_len-1);
		map_list.push_back(map_location);
		curStart += cur_len;
	}
}

void print_feature(ostream& os, gnBaseFeature* cur_feat){
	os << cur_feat->GetName() << ": \n";
	for(uint32 i=0; i < cur_feat->GetQualifierListLength(); i++)
		os << cur_feat->GetQualifierName(i) << "\t" << cur_feat->GetQualifierValue(i) << "\n";
	os << "\n";

}

uint32 loc_binary_search(vector<gnLocation>& loc_list, uint32 startI, uint32 endI, gnLocation& query_loc){
	uint32 midI = ((endI - startI) / 2) + startI;
	if(startI == endI)
		return endI;

	if(loc_list[midI].Intersects(query_loc)){
		return midI;
	}else if(loc_list[midI].GetStart() < query_loc.GetStart())
		return loc_binary_search(loc_list, midI + 1, endI, query_loc);
	else
		return loc_binary_search(loc_list, startI, midI, query_loc);
}

int main(int argc, char* argv[]){
	
	boolean run_interactive = false;
	string seq_filename;
	string exon_list_filename;
	string intron_list_filename;
	string mem_list_filename;
	string reg_net_filename;
	vector<gnLocation> exon_list;
	vector<gnLocation> intron_list;
	vector<gnLocation> exon_map_list;
	vector<gnLocation> intron_map_list;
	vector<ExMem> mem_list;
	uint32 minimum_match_size;

	// check for correct calling semantics
	if(argc != 7){
		print_usage(argv[0]);
		return -1;
	}

	seq_filename = argv[1];
	exon_list_filename = argv[2];
	intron_list_filename = argv[3];
	mem_list_filename = argv[4];
	reg_net_filename = argv[5];
	minimum_match_size = atoi(argv[6]);
	
	ifstream mem_file(mem_list_filename.c_str());
	if(!mem_file.is_open()){
		cout << "Error opening " << mem_list_filename << "\n";
		return -1;
	}

	if(run_interactive){
		cout << "Give the name of the exon list to search\n";
		cin >> exon_list_filename;
		cout << "Give the name of the intron list to search\n";
		cin >> intron_list_filename;
		cout << "Give the name of the regulatory network to output\n";
		cin >> reg_net_filename;

	}
	ofstream net_file(reg_net_filename.c_str());
	
	if(!net_file.is_open()){
		cout << "Error opening regulatory network file: " << reg_net_filename << "\n";
		return -2;
	}
	
	load_map_file(exon_list_filename, exon_list);
	load_map_file(intron_list_filename, intron_list);
	
	cout << exon_list.size() << " unique exons loaded from file\n";
	cout << intron_list.size() << " unique introns loaded from file\n";
	
	//now load the genbank file
	gnSequence seq_file;
	if(run_interactive){
		cout << "Enter the name of the genbank sequence file you are using\n";
		cin >> seq_filename;
	}
	if(!seq_file.LoadSource(seq_filename)){
		cout << "Error loading file\n";
		return -1;
	}
	cout << "Sequence loaded successfully, " << seq_file.length() << " base pairs.\n";
	
	//construct a mapping between coordinates...
	map_coordinates(exon_list, exon_map_list);
	map_coordinates(intron_list, intron_map_list);
	
	//now read the mem file
	while(mem_file.good()){
		ExMem m;
		mem_file >> m.length;
		mem_file >> m.ex_start;
		mem_file >> m.in_start;
		if(m.length >= minimum_match_size)
			mem_list.push_back(m);
	}
	cout << mem_list.size() << " matches loaded.\n";
	//sort the mem list
	sort(mem_list.begin(), mem_list.end(), &ExMemLessthan);
	
	//now get the intersection for each mem in the list...
	uint32 exonmapI = 0;
	uint32 notify_percent = 10;
	uint32 notify_interval = mem_list.size() / notify_percent;
	uint32 percent_count = 0;
	cout << "Searching for complementary matches:\n";
	for(uint32 memI = 0; memI < mem_list.size(); memI++){
		if(memI % notify_interval == 0){
			cout << percent_count << "%.. ";
			percent_count += notify_percent;
		}
		//simple linear search for intersecting exon mapping
		gnLocation ex_map_loc(mem_list[memI].ex_start, mem_list[memI].ex_start + mem_list[memI].length - 1);
		for(; exonmapI < exon_map_list.size(); exonmapI++){
			if(exon_map_list[exonmapI].Intersects(ex_map_loc))
				break;
		}
		
		//continue to search for any other mappings that intersect
		uint32 mapEnd = exonmapI;
		for(; mapEnd < exon_map_list.size(); mapEnd++){
			if(!exon_map_list[exonmapI].Intersects(ex_map_loc))
				break;
		}
		mapEnd--;
		
		uint32 intronmapI = 0; // intronmapI will contain the index of the first intersecting intron
		//do a binary search for intersecting intron mappings
		int64 cur_in_start = mem_list[memI].in_start;
		if(cur_in_start < 0)
			cur_in_start = -cur_in_start;
		gnLocation in_map_loc(cur_in_start, cur_in_start + mem_list[memI].length - 1);
		uint32 search_mapI = loc_binary_search(intron_map_list, 0, intron_map_list.size()-1, in_map_loc);
		intronmapI = search_mapI;

		//search backwards for previous intersections
		for(; intronmapI >= 0; intronmapI--){
			if(!intron_map_list[intronmapI].Intersects(in_map_loc)){
				intronmapI++;
				break;
			}
			if(intronmapI == 0)
				break;
		}
		//continue to search for any other mappings that intersect
		uint32 intron_mapEnd = search_mapI;
		for(; intron_mapEnd < intron_map_list.size(); intron_mapEnd++){
			if(!intron_map_list[intronmapI].Intersects(in_map_loc))
				break;
		}
		intron_mapEnd--;
		
		//we have the mappings, now map the coordinates
		vector<uint32> ex_feat_index, in_feat_index;
		vector<gnBaseFeature*> ex_feat_list;
		vector<gnBaseFeature*> in_feat_list;
		gnSeqI cur_match_len = mem_list[memI].length;
		
		//find out how much of the first exon was matched
		//extra exon start has the number of unmatched bases at the beginning of the exon
		gnSeqI extra_exon_start = mem_list[memI].ex_start - exon_map_list[exonmapI].GetStart();
		gnSeqI cur_exon_len = exon_map_list[exonmapI].GetEnd() - exon_map_list[exonmapI].GetStart() + 1;
		gnSeqI max_exon_chunk = cur_exon_len - extra_exon_start;
		gnSeqI cur_exon_chunk = max_exon_chunk < cur_match_len ? max_exon_chunk : cur_match_len;
		
		//find out how much of the first intron was matched
		gnSeqI extra_intron_start, cur_intron_len, max_intron_chunk, cur_intron_chunk;
		boolean complement = false;
		if(mem_list[memI].in_start > 0){
			extra_intron_start = mem_list[memI].in_start - intron_map_list[intronmapI].GetStart();
			cur_intron_len = intron_map_list[intronmapI].GetEnd() - intron_map_list[intronmapI].GetStart() + 1;
			max_intron_chunk = cur_intron_len - extra_intron_start;
			cur_intron_chunk = max_intron_chunk < mem_list[memI].length ? max_intron_chunk : mem_list[memI].length;
		}else{
			//reverse complement, start at the end.
			if(cur_in_start >= intron_map_list[intron_mapEnd].GetStart())
				cur_intron_chunk = cur_match_len;
			else
				cur_intron_chunk = cur_in_start + cur_match_len - intron_map_list[intron_mapEnd].GetStart();
			complement = true;
			seq_file.getIntersectingFeatures(intron_list[intronmapI], in_feat_list, in_feat_index);
		}
		
		//the current chunk will be the smaller of the two mappings
		gnSeqI cur_chunk = cur_intron_chunk < cur_exon_chunk ? cur_intron_chunk : cur_exon_chunk;

		gnLocation cur_exon_loc(exon_list[exonmapI].GetStart() + extra_exon_start, exon_list[exonmapI].GetStart() + cur_chunk);
		seq_file.getIntersectingFeatures(cur_exon_loc, ex_feat_list, ex_feat_index);
		if(mem_list[memI].in_start > 0){
			gnLocation cur_intron_loc(intron_list[intronmapI].GetStart() - 1, intron_list[intronmapI].GetEnd() + 1);
			seq_file.getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index);
		}else{
			gnLocation cur_intron_loc(intron_list[intron_mapEnd].GetStart() - 1, intron_list[intron_mapEnd].GetEnd() + 1);
			seq_file.getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index);
		}
		vector<gnBaseFeature*> ex_forward;
		vector<gnBaseFeature*> ex_reverse;
		vector<gnBaseFeature*> in_forward;
		vector<gnBaseFeature*> in_reverse;

		for(uint32 featI = 0; featI < ex_feat_list.size(); featI++){
			string featName = ex_feat_list[featI]->GetName();
			if(featName == "mRNA" || featName == "CDS" || featName == "gene" )
				if(ex_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement)
					ex_reverse.push_back(ex_feat_list[featI]);
				else
					ex_forward.push_back(ex_feat_list[featI]);
		}
		for(uint32 featI = 0; featI < in_feat_list.size(); featI++){
			string featName = in_feat_list[featI]->GetName();
			if(featName == "mRNA" || featName == "CDS" || featName == "gene" )
				if(in_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement)
					in_reverse.push_back(in_feat_list[featI]);
				else
					in_forward.push_back(in_feat_list[featI]);
		}

		if(complement){
			vector<gnBaseFeature*> tmp_in = in_forward;
			in_forward = in_reverse;
			in_reverse = tmp_in;
		}

		//now print out all the complementary features
		if((ex_forward.size() > 0 && in_reverse.size() > 0) || (in_forward.size() > 0 && ex_reverse.size() > 0)){
			net_file << "================================\n";
			net_file << "Mem: " << mem_list[memI].length << "\n";
			net_file << "This exon/intron matching size: " << cur_chunk << "\n";
		}
		if(ex_forward.size() > 0 && in_reverse.size() > 0){
			net_file << "Forward Exons:\n";
			for(uint32 featI = 0; featI < ex_forward.size(); featI++)
				print_feature(net_file, ex_forward[featI]);
			net_file << "Matching introns:\n";
			for(uint32 featI = 0; featI < in_reverse.size(); featI++)
				print_feature(net_file, in_reverse[featI]);
		}
		if(in_forward.size() > 0 && ex_reverse.size() > 0){
			net_file << "Reverse Exons:\n";
			for(uint32 featI = 0; featI < ex_reverse.size(); featI++)
				print_feature(net_file, ex_reverse[featI]);
			net_file << "Matching introns:\n";
			for(uint32 featI = 0; featI < in_forward.size(); featI++)
				print_feature(net_file, in_forward[featI]);
		}
		
		//release memory
		for(uint32 featI = 0; featI < ex_feat_list.size(); featI++)
			delete ex_feat_list[featI];
		for(uint32 featI = 0; featI < in_feat_list.size(); featI++)
			delete in_feat_list[featI];
		
		//loop while there is stuff to match
//		while(cur_match_len > 0){
		
//		}
	}
}