File: gff_utils.cpp

package info (click to toggle)
gffread 0.12.1-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,412 kB
  • sloc: cpp: 2,563; sh: 96; makefile: 69
file content (1456 lines) | stat: -rw-r--r-- 48,084 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
#include "gff_utils.h"

GHash<GeneInfo> gene_ids;

bool verbose=false; //same with GffReader::showWarnings and GffLoader::beVserbose
bool debugMode=false;
bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3

FILE* ffasta=NULL;
FILE* f_in=NULL;
FILE* f_out=NULL;
FILE* f_w=NULL; //writing fasta with spliced exons (transcripts)
int wPadding = 0; //padding for -w option
FILE* f_x=NULL; //writing fasta with spliced CDS
FILE* f_y=NULL; //wrting fasta with translated CDS


int maxintron=999000000;

bool wCDSonly=false;
bool wNConly=false;
int minLen=0; //minimum transcript length
bool validCDSonly=false; // translation with no in-frame STOP
bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
bool altPhases=false; //if original phase fails translation validation,
                     //try the other 2 phases until one makes it
bool addCDSattrs=false;
bool add_hasCDS=false;
//bool streamIn=false; // --stream option
bool adjustStop=false; //automatic adjust the CDS stop coordinate
bool covInfo=false; // --cov-info : only report genome coverage
GStr tableFormat; //list of "attributes" to print in tab delimited format
bool spliceCheck=false; //only known splice-sites
bool decodeChars=false; //decode url-encoded chars in attrs (-D)
bool StarStop=false; //use * instead of . for stop codon translation
bool fullCDSonly=false; // starts with START, ends with STOP codon

bool multiExon=false;
bool writeExonSegs=false;
char* tracklabel=NULL;
char* rfltGSeq=NULL;
char rfltStrand=0;
uint rfltStart=0;
uint rfltEnd=MAX_UINT;
bool rfltWithin=false; //check for full containment within given range
bool addDescr=false;


bool fmtGFF3=true; //default output: GFF3
//other formats only make sense in transcriptOnly mode
bool fmtGTF=false;
bool fmtBED=false;
bool fmtTLF=false;
bool fmtTable=false;

GffPrintMode exonPrinting=pgffAny;

GFastaDb gfasta;

GHash<SeqInfo> seqinfo;
GVec<CTableField> tableCols;
GHash<RefTran> reftbl;

GHash<int> isoCounter; //counts the valid isoforms

void printFasta(FILE* f, GStr* defline, char* seq, int seqlen, bool useStar) {
 if (seq==NULL) return;
 int len=(seqlen>0)?seqlen:strlen(seq);
 if (len<=0) return;
 if (defline!=NULL)
     fprintf(f, ">%s\n",defline->chars());
 int ilen=0;
 for (int i=0; i < len; i++, ilen++) {
   if (ilen == 70) {
     fputc('\n', f);
     ilen = 0;
     }
   if (useStar && seq[i]=='.')
        putc('*', f);
   else putc(seq[i], f);
   } //for
 fputc('\n', f);
}

int qsearch_gloci(uint x, GList<GffLocus>& loci) {
  //binary search
  //do the simplest tests first:
  if (loci[0]->start>x) return 0;
  if (loci.Last()->start<x) return -1;
  uint istart=0;
  int i=0;
  int idx=-1;
  int maxh=loci.Count()-1;
  int l=0;
  int h = maxh;
  while (l <= h) {
     i = (l+h)>>1;
     istart=loci[i]->start;
     if (istart < x)  l = i + 1;
          else {
             if (istart == x) { //found matching coordinate here
                  idx=i;
                  while (idx<=maxh && loci[idx]->start==x) {
                     idx++;
                     }
                  return (idx>maxh) ? -1 : idx;
                  }
             h = i - 1;
             }
     } //while
 idx = l;
 while (idx<=maxh && loci[idx]->start<=x) {
    idx++;
    }
 return (idx>maxh) ? -1 : idx;
}

int qsearch_rnas(uint x, GList<GffObj>& rnas) {
  //binary search
  //do the simplest tests first:
  if (rnas[0]->start>x) return 0;
  if (rnas.Last()->start<x) return -1;
  uint istart=0;
  int i=0;
  int idx=-1;
  int maxh=rnas.Count()-1;
  int l=0;
  int h = maxh;
  while (l <= h) {
     i = (l+h)>>1;
     istart=rnas[i]->start;
     if (istart < x)  l = i + 1;
          else {
             if (istart == x) { //found matching coordinate here
                  idx=i;
                  while (idx<=maxh && rnas[idx]->start==x) {
                     idx++;
                     }
                  return (idx>maxh) ? -1 : idx;
                  }
             h = i - 1;
             }
     } //while
 idx = l;
 while (idx<=maxh && rnas[idx]->start<=x) {
    idx++;
    }
 return (idx>maxh) ? -1 : idx;
}

int cmpRedundant(GffObj& a, GffObj& b) {
  if (a.exons.Count()==b.exons.Count()) {
     if (a.covlen==b.covlen) {
       return strcmp(a.getID(), b.getID());
       }
     else return (a.covlen>b.covlen)? 1 : -1;
     }
   else return (a.exons.Count()>b.exons.Count())? 1: -1;
}


bool tMatch(GffObj& a, GffObj& b) {
  //strict intron chain match, or single-exon perfect match
  int imax=a.exons.Count()-1;
  int jmax=b.exons.Count()-1;
  int ovlen=0;
  if (imax!=jmax) return false; //different number of introns

  if (imax==0) { //single-exon mRNAs
    //if (equnspl) {
      //fuzz match for single-exon transfrags:
      // it's a match if they overlap at least 80% of max len
      ovlen=a.exons[0]->overlapLen(b.exons[0]);
      int maxlen=GMAX(a.covlen,b.covlen);
      return (ovlen>=maxlen*0.8);
    /*}
    else {
      //only exact match
      ovlen=a.covlen;
      return (a.exons[0]->start==b.exons[0]->start &&
          a.exons[0]->end==b.exons[0]->end);

       }*/
     }
  //check intron overlaps
  ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
  ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
  for (int i=1;i<=imax;i++) {
    if (i<imax) ovlen+=a.exons[i]->len();
    if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
      (a.exons[i]->start!=b.exons[i]->start)) {
            return false; //intron mismatch
    }
  }
  return true;
}


char* getSeqDescr(char* seqid) {
 static char charbuf[128];
 if (seqinfo.Count()==0) return NULL;
 char* suf=rstrchr(seqid, '.');
 if (suf!=NULL) *suf=0;
 SeqInfo* seqd=seqinfo.Find(seqid);
 if (suf!=NULL) *suf='.';
 if (seqd!=NULL) {
  GStr s(seqd->descr);
  //cleanup some Uniref gunk
  if (s[0]=='[') {
    int r=s.index(']');
    if (r>=0 && r<8 && isdigit(s[1]))
       s.remove(0,r+1);
    }
  if (s.length()>80) {
    int r=s.index(';');
    if (r>5) s.cut(r);
    }
  if (s.length()>127) {
   s.cut(127);
   int r=s.rindex(' ');
   if (r>0) s.cut(r);
   }
  strcpy(charbuf, s.chars());
  return charbuf;
  }
 else return NULL;
}

char* getSeqName(char* seqid) {
  static char charbuf[128];
  char* suf=rstrchr(seqid, '.');
  if (suf!=NULL) *suf=0;
  strcpy(charbuf, seqid);
  if (suf!=NULL) *suf='.';
  return charbuf;
}


int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst) {
  //adj>0, extend CDS to include a potential stop codon
  //when CDS is expanded, the terminal exon might have to be adjusted too
  int realadj=0;
  if (gffrec.strand=='-') {
       if ((int)gffrec.CDstart>adj) {
           gffrec.CDstart-=adj;
           realadj=adj;
           if (gffrec.exons.First()->start>gffrec.CDstart) {
                 gffrec.covlen+=gffrec.exons.First()->start - gffrec.CDstart;
                 gffrec.exons.First()->start=gffrec.CDstart;
                 gffrec.start=gffrec.CDstart;
                 }
             }
          }
        else { // forward strand
         //expand beyond
         realadj=adj;
         gffrec.CDend+=adj;
         if (adj<0) {//restore
           if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
                        gffrec.exons.Last()->end+=adj;
                        gffrec.end=gffrec.exons.Last()->end;
                        gffrec.covlen+=adj;
                        }
         }
         else if (gffrec.exons.Last()->end<gffrec.CDend) {
             gffrec.covlen+=gffrec.CDend-gffrec.exons.Last()->end;
             gffrec.exons.Last()->end=gffrec.CDend;
             gffrec.end=gffrec.CDend;
             }
         }
  if (seglst!=NULL) seglst->Last()->end+=realadj;
  return realadj;
 }

void printTableData(FILE* f, GffObj& g, bool inFasta) {
 //using attribute list in tableCols
	char* av=NULL;
	for(int i=0;i<tableCols.Count();i++) {
		if (i>0 || inFasta) {
     	   if (!inFasta || tableCols[i].type!=ctfGFF_ID)
     		   fprintf(f,"\t");
		}
		switch(tableCols[i].type) {
		case ctfGFF_Attr:
			av=g.getAttr(tableCols[i].name.chars());
			fprintf(f,"%s",av!=NULL? av : ".");
			break;
		case ctfGFF_chr:
			fprintf(f,"%s",g.getGSeqName());
			break;
		case ctfGFF_ID:
			if (!inFasta)
			  fprintf(f,"%s",g.getID());
			break;
		case ctfGFF_geneID:
			fprintf(f,"%s",g.getGeneID()!=NULL ? g.getGeneID() : ".");
			break;
		case ctfGFF_geneName:
			fprintf(f,"%s",g.getGeneName()!=NULL ? g.getGeneName() : ".");
			break;
		case ctfGFF_Parent:
			fprintf(f,"%s",g.parent!=NULL ? g.parent->getID() : ".");
			break;
		case ctfGFF_feature:
			fprintf(f,"%s",g.getFeatureName());
			break;
		case ctfGFF_start:
			fprintf(f,"%d",g.start);
			break;
		case ctfGFF_end:
			fprintf(f,"%d",g.end);
			break;
		case ctfGFF_strand:
			fprintf(f,"%c",g.strand);
			break;
		case ctfGFF_numexons:
			fprintf(f,"%d",g.exons.Count());
			break;
		case ctfGFF_exons:
			if (g.exons.Count()>0) {
				for (int x=0;x<g.exons.Count();x++) {
					if (x>0) fprintf(f,",");
					fprintf(f,"%d-%d",g.exons[x]->start, g.exons[x]->end);
				}
			} else fprintf(f,".");
			break;
		case ctfGFF_cds:
			if (g.hasCDS()) {
				GVec<GffExon> cds;
				g.getCDSegs(cds);
				for (int x=0;x<cds.Count();x++) {
					if (x>0) fprintf(f,",");
				    fprintf(f,"%d-%d",cds[x].start, cds[x].end);
				}
			}
			else fprintf(f,".");
			break;
		case ctfGFF_covlen:
			fprintf(f, "%d", g.covlen);
			break;
		case ctfGFF_cdslen:
			if (g.hasCDS()) {
				GVec<GffExon> cds;
				g.getCDSegs(cds);
				int clen=0;
				for (int x=0;x<cds.Count();x++)
				    clen+=cds[x].end-cds[x].start+1;
				fprintf(f, "%d", clen);
			}
			else fprintf(f, "0");
			break;
		} //switch
	}
	fprintf(f,"\n");
}
bool GffLoader::validateGffRec(GffObj* gffrec) {
	if (reftbl.Count()>0) { //check if we need to reject by ref seq filter
		GStr refname(gffrec->getRefName());
		RefTran* rt=reftbl.Find(refname.chars());
		if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
			//try removing the version suffix
			refname.cut(-2);
			//GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
			rt=reftbl.Find(refname.chars());
		}
		if (rt) {
			gffrec->setRefName(rt->new_name);
		}
		/* //no, do not discard non-matching entries, let them pass through!
		else {
			if (verbose)
				GMessage("Info: %s discarded due to reference %s not being mapped\n",
						gffrec->getID(), refname.chars());
			return false; //discard, ref seq not in the given translation table
		}*/
	}
	if (transcriptsOnly && gffrec->isDiscarded()) {
		//discard generic "locus" features with no other detailed subfeatures
		//GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",gffrec->getID());
		return false;
	}
	if (minLen>0 && gffrec->covlen<minLen) {
		if (verbose)
			GMessage("Info: %s discarded due to minimum length threshold %d\n",
					gffrec->getID(), minLen);
    	return false;
	}
	if (rfltGSeq!=NULL) { //filter by gseqName
		if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
			return false;
		}
	}
	if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
		return false;
	}
	//check coordinates
	if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
		if (rfltWithin) {
			if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
				return false; //not within query range
			}
		}
		else {
			if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
				return false;
			}
		}
	}
	if (multiExon && gffrec->exons.Count()<=1) {
		return false;
	}
	if (wCDSonly && gffrec->CDstart==0) {
		return false;
	}
	if (wNConly && gffrec->hasCDS()) return false;
	return true;
}

bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
 if (!gffrec.isTranscript()) return false; //shouldn't call this function unless it's a transcript
 //returns true if the transcript passed the filter
 char* gname=gffrec.getGeneName();
 if (gname==NULL) gname=gffrec.getGeneID();
 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
      const char* biotype=gffrec.getAttr("gene_biotype");
      if (biotype) {
         gffrec.addAttr("type", biotype);
         gffrec.removeAttr("gene_biotype");
         }
       else { //old Ensembl files lacking gene_biotype
         gffrec.addAttr("type", gffrec.getTrackName());
         }

      //bool is_gene=false;
      bool is_pseudo=false;
      if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
                gffrec.setFeatureName("mRNA");
       else {
          if (strcmp(biotype, "processed_transcript")==0)
              gffrec.setFeatureName("proc_RNA");
            else {
              //is_gene=endsWith(biotype, "gene");
              is_pseudo=strifind(biotype, "pseudo");
              if (is_pseudo) {
                   gffrec.setFeatureName("pseudo_RNA");
                   }
                else if (endsWith(biotype, "RNA")) {
                   gffrec.setFeatureName(biotype);
                   } else gffrec.setFeatureName("misc_RNA");
              }
          }
      }
 if (gname && strcmp(gname, gffrec.getID())!=0) {
   int* isonum=isoCounter.Find(gname);
   if  (isonum==NULL) {
       isonum=new int(1);
       isoCounter.Add(gname,isonum);
       }
      else (*isonum)++;
   //defline.appendfmt(" gene=%s", gname);
   }
  int seqlen=0;

  const char* tlabel=tracklabel;
  if (tlabel==NULL) tlabel=gffrec.getTrackName();
  //defline.appendfmt(" track:%s",tlabel);
  char* cdsnt = NULL;
  char* cdsaa = NULL;
  int aalen=0;
  for (int i=1;i<gffrec.exons.Count();i++) {
     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
     if (verbose && ilen>4000000)
            GMessage("Warning: very large intron (%d) for transcript %s\n",
                           ilen, gffrec.getID());
     if (ilen>maxintron) {
         return false;
     }
  }
  GMapSegments seglst(gffrec.strand);
  GFaSeqGet* faseq=NULL;
  if (f_x!=NULL || f_y!=NULL || f_w!=NULL || spliceCheck || validCDSonly || addCDSattrs) {
	  faseq=fastaSeqGet(gfasta, gffrec.getGSeqName());
      if (faseq==NULL)
	    	GError("Error: no genomic sequence available (check -g option!).\n");
  }
  if (spliceCheck && gffrec.exons.Count()>1) {
    //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
    int glen=gffrec.end-gffrec.start+1;
    const char* gseq=faseq->subseq(gffrec.start, glen);
    if (gseq==NULL) {
    	GMessage("Error at GFF ID %s : could not retrieve subsequence %s:%d-%d !\n",
    			  gffrec.getID(), gffrec.getRefName(), gffrec.start, gffrec.end);
    	return false;
    }
    bool revcompl=(gffrec.strand=='-');
    bool ssValid=true;
    for (int e=1;e<gffrec.exons.Count();e++) {
      const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
      int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
      GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
      GSpliceSite    donorSite(intron,intronlen, false, revcompl);
      //GMessage("%c intron %d-%d : %s .. %s\n",
      //           gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
      if (acceptorSite=="AG") { // GT-AG or GC-AG
         if (!donorSite.canonicalDonor()) {
            ssValid=false;break;
            }
         }
      else if (acceptorSite=="AC") { //AT-AC also accepted
         if (donorSite!="AT") { ssValid=false; break; }
         }
      else { ssValid=false; break; }
      }
    if (!ssValid) {
      if (verbose)
         GMessage("Unrecognized splice sites found for '%s'\n",gffrec.getID());
      return false; //don't print this one!
    }
  }
  bool trprint=true;
  bool inframeStop=false;
  //int stopCodonAdjust=0;
  int mCDphase=0;
  bool fullCDS=false;
  bool endStop=false;
  bool stopAdjusted=false;
  if (add_hasCDS && gffrec.hasCDS()) gffrec.addAttr("hasCDS", "true");
  if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
      mCDphase = gffrec.CDphase-'0';
  //CDS partialness only added when -y -x -V options are given
  if (gffrec.hasCDS() && (f_y!=NULL || f_x!=NULL || validCDSonly || addCDSattrs)) {
    int strandNum=0;
    int phaseNum=0;
  CDS_CHECK:
    uint cds_olen=0;
    cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, &cds_olen, &seglst, adjustStop);
    //if adjustStop, seqlen has the CDS+3'UTR length, but cds_olen still has the original CDS length
    if (cdsnt!=NULL && cdsnt[0]!='\0') { //has CDS
         cdsaa=translateDNA(cdsnt, aalen, seqlen);
         char* p=strchr(cdsaa,'.');
         int cds_aalen=aalen;
         if (adjustStop)
        	 cds_aalen=cds_olen/3; //originally stated CDS length
         endStop=false;
         if (p!=NULL) { //stop codon found
        	 if (p-cdsaa==cds_aalen-1) { //stop found as the stated last CDS codon
                  *p='\0';//remove it
                  endStop=true;
                  if (adjustStop) {
                	  seqlen=cds_aalen*3;
                	  aalen=cds_aalen;
                  }
                  cds_aalen--;
                  aalen--;
                  //no need to adjust stop codon
              }
              else {//stop found in a different position than the last codon
            	  if (p-cdsaa<cds_aalen-1 && !adjustStop) {
            		  inframeStop=true;
            	  }
            	  if (adjustStop) {
            		  *p='\0';
            		  cds_aalen=p-cdsaa+1; //adjusted CDS length
            		  seqlen=cds_aalen*3;
            		  aalen=cds_aalen;
            		  uint gc=seglst.gmap(seqlen);
            		  if (gffrec.strand=='-') gffrec.CDstart=gc;
            		  else gffrec.CDend=gc;
            		  endStop=true;
            		  stopAdjusted=true;
            	  }
              }
         }//stop codon found
         //if (trprint==false) { //failed CDS validity check
         if (inframeStop) {
           //in-frame stop codon found
           if (altPhases && phaseNum<3) {
              phaseNum++; //try a different phase
              gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
              GFREE(cdsaa);
              goto CDS_CHECK;
           }
           if (gffrec.exons.Count()==1 && bothStrands) {
              strandNum++;
              phaseNum=0;
              if (strandNum<2) {
                 GFREE(cdsaa);
                 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
                 goto CDS_CHECK; //repeat the CDS check for a different frame
              }
           }
           if (verbose) GMessage("Warning: In-frame STOP found for '%s'\n",gffrec.getID());
           if (addCDSattrs) gffrec.addAttr("InFrameStop", "true");
         } //has in-frame STOP
         if (stopAdjusted) {
      	   if (addCDSattrs) gffrec.addAttr("CDStopAdjusted", "true");
      	   inframeStop=false; //pretend it's OK now that we've adjusted it
         }
         if (!inframeStop) {
			 bool hasStart=(cdsaa[0]=='M'); //for the regular eukaryotic translation table
			 fullCDS=(endStop && hasStart);
			 if (!fullCDS) {
				 const char* partialness=NULL;
				 if (hasStart) partialness="3";
				 else {
					partialness = endStop ? "5" : "5_3";
				 }
				 if (addCDSattrs) gffrec.addAttr("partialness", partialness);
			 }
         }
         if (trprint && ((fullCDSonly && !fullCDS) || (validCDSonly && inframeStop)) )
        	 trprint=false;
         //} // Valid CDS only requested?
      } //has CDS
  } //translation or codon check was requested
  if (!trprint) {
    GFREE(cdsnt);
    GFREE(cdsaa);
    //if (adjstop!=NULL) delete adjstop;
    return false;
  }
  /*
  if (validCDSonly) {
     int stopCodonAdjust=adjstop->restore();
     if (stopCodonAdjust!=0 && !endStop) {
        //restore stop codon location
        //adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
	    if (seglst.Count()>0) seglst.Last()->end-=stopCodonAdjust;
        if (cdsnt!=NULL && seqlen>0) {
           seqlen-=stopCodonAdjust;
           cdsnt[seqlen]=0;
        }
        if (cdsaa!=NULL) aalen--;
     }
  }
  if (adjstop!=NULL) delete adjstop;
  */
  if (cdsnt!=NULL) { // && !inframeStop) {
	  if (f_y!=NULL) { //CDS translation fasta output requested
			 if (cdsaa==NULL) { //translate now if not done before
			   cdsaa=translateDNA(cdsnt, aalen, seqlen);
			 }
			 if (aalen>0) {
			   if (cdsaa[aalen-1]=='.' || cdsaa[aalen-1]=='\0') --aalen; //avoid printing the stop codon
 			   fprintf(f_y, ">%s", gffrec.getID());
 			   if (fmtTable) printTableData(f_y, gffrec, true);
 			   else fprintf(f_y, "\n");
			   printFasta(f_y, NULL, cdsaa, aalen, StarStop);
			 }
	  }
	  if (f_x!=NULL) { //CDS only
			 GStr defline(gffrec.getID(), 94);
			 if (writeExonSegs) {
				  defline.append(" loc:");
				  defline.append(gffrec.getGSeqName());
				  defline.appendfmt("(%c)",gffrec.strand);
				  //warning: not CDS coordinates are written here, but the exon ones
				  defline+=(int)gffrec.start;
				  defline+=(char)'-';
				  defline+=(int)gffrec.end;
				  // -- here these are CDS substring coordinates on the spliced sequence:
				  defline.append(" segs:");
				  for (int i=0;i<seglst.Count();i++) {
					  if (i>0) defline.append(",");
					  defline+=(int)seglst[i].start;
					  defline.append("-");
					  defline+=(int)seglst[i].end;
					  }
			 }
			 fprintf(f_x, ">%s", defline.chars());
			 if (fmtTable) printTableData(f_x, gffrec, true);
			 else fprintf(f_x, "\n");
			 printFasta(f_x, NULL, cdsnt, seqlen);
	  }
	  GFREE(cdsnt);
	  GFREE(cdsaa);
  } //writing CDS or its translation
  if (f_w!=NULL) { //write spliced exons
	  uint cds_start=0;
	  uint cds_end=0;
	  seglst.Clear();
	  int padLeft=0;
	  int padRight=0;
	  if (wPadding>0) {
		padLeft= (gffrec.start>(uint)wPadding) ? wPadding : gffrec.start - 1;
		int ediff=faseq->getseqlen()-gffrec.end;
	    padRight=(wPadding>ediff) ?  ediff : wPadding;
   	    gffrec.addPadding(padLeft, padRight);
	  }
	  char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
	  //restore exons to normal (remove padding)
	  if (wPadding>0)
		  gffrec.removePadding(padLeft, padRight);

	  GStr defline(gffrec.getID());
	  if (exont!=NULL) {
		  if (gffrec.CDstart>0) {
			  defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
		  }
		  if (writeExonSegs) {
			  defline.append(" loc:");
			  defline.append(gffrec.getGSeqName());
			  defline+=(char)'|';
			  defline+=(int)gffrec.start;
			  defline+=(char)'-';
			  defline+=(int)gffrec.end;
			  defline+=(char)'|';
			  defline+=(char)gffrec.strand;
			  defline.append(" exons:");
			  for (int i=0;i<gffrec.exons.Count();i++) {
				  if (i>0) defline.append(",");
				  defline+=(int)gffrec.exons[i]->start;
				  defline.append("-");
				  defline+=(int)gffrec.exons[i]->end;
			  }
			if (wPadding>0) {
				defline.append(" padding:");
				defline.append(padLeft);
				defline+=(char)'|';
				defline.append(padRight);
			}

			defline.append(" segs:");
			for (int i=0;i<seglst.Count();i++) {
				if (i>0) defline.append(",");
				defline+=(int)seglst[i].start;
				defline.append("-");
				defline+=(int)seglst[i].end;
				}
		  }

		  fprintf(f_w, ">%s", defline.chars());
		  if (fmtTable) printTableData(f_w, gffrec, true);
		    else fprintf(f_w, "\n");
		  printFasta(f_w, NULL, exont, seqlen);
		  GFREE(exont);
	  }
  } //writing f_w (spliced exons)
  return true;
}


GTData::GTData(GffObj* t, GenomicSeqData* gd):rna(t),gdata(gd), locus(NULL), replaced_by(NULL), geneinfo(NULL) {
    if (rna!=NULL) {
        //geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there
        rna->uptr=this;
    }
    if (gdata!=NULL)
 	   gdata->tdata.Add(this);
}



bool GffLoader::unsplContained(GffObj& ti, GffObj&  tj) {
 //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
 //but it does not cross any intron-exon boundary of tj
  int imax=ti.exons.Count()-1;
  int jmax=tj.exons.Count()-1;
  if (imax>0) GError("Error: bad unsplContained() call, 1st parameter must be single-exon transcript!\n");
  if (fuzzSpan) {
    int maxIntronOvl=dOvlSET ? 25 : 0;
    //int minovl = dOvlSET ? 5 : (int)(0.8 * ti.len()); //minimum overlap to declare "redundancy"
    for (int j=0;j<=jmax;j++) {
       bool exonOverlap=false;
       if (dOvlSET) {
    	   exonOverlap= (tj.exons[j]->overlapLen(ti.start-1, ti.end+1) > 0);
       } else {
    	   exonOverlap=(ti.overlapLen(tj.exons[j])>=0.8 * ti.len());
       }
       if (exonOverlap) {
          //must not overlap the introns
          if ((j>0 && ti.start+maxIntronOvl<tj.exons[j]->start)
             || (j<jmax && ti.end>tj.exons[j]->end+maxIntronOvl))
             return false;
          return true;
       }
    } //for each exon
  } else { // not fuzzSpan, strict containment required
    for (int j=0;j<=jmax;j++) {
        if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
          return true;
    }
 }
 return false;
}

GffObj* GffLoader::redundantTranscripts(GffObj& ti, GffObj&  tj) {
  // matchAllIntrons==true:  transcripts are considered "redundant" only if
  //                   they have the exact same number of introns and same splice sites (or none)
  //                   (single-exon transcripts should be also fully contained to be considered matching)
  // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
  //                   as long as no intron-exon boundaries are violated; also, a single-exon
  //                   transcript will be collapsed if it's contained in one of the exons of the another transcript
  // fuzzSpan==false: the genomic span of one transcript MUST BE contained in or equal with the genomic
  //                  span of the other
  //
  // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
  //                 (i.e. they may extend each-other in opposite directions)

  //if redundancy is detected, the "bigger" transcript is returned (otherwise NULL is returned)
 int adj=dOvlSET ? 1 : 0;
 if (ti.start>tj.end+adj || tj.start>ti.end+adj ||
		 (tj.strand!='.' && ti.strand!='.' && tj.strand!=ti.strand)) return NULL; //no span overlap
 int imax=ti.exons.Count()-1;
 int jmax=tj.exons.Count()-1;
 GffObj* bigger=NULL;
 GffObj* smaller=NULL;
 if (matchAllIntrons) { //full intron chain match expected, or full containment for SET
   if (imax!=jmax) return NULL; //must have the same number of exons!
   if (ti.covlen>tj.covlen) {
      bigger=&ti;
      if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end))
        return NULL; //no containment
   }
   else { //ti.covlen<=tj.covlen
      bigger=&tj;
      if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end))
         return NULL; //no containment
   }
   //check that all introns really match
   for (int i=0;i<imax;i++) {
     if (ti.exons[i]->end!=tj.exons[i]->end ||
         ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
     }
   return bigger;
 }
 //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
 int minlen=0;
 if (ti.covlen>tj.covlen) {
      if (tj.exons.Count()>ti.exons.Count()) {
          //exon count override
          bigger=&tj;
          smaller=&ti;
      } else {
          bigger=&ti;
          smaller=&tj;
      }
      //maxlen=ti.covlen;
      minlen=tj.covlen;
 } else { //tj has more bases covered
      if (ti.exons.Count()>tj.exons.Count()) {
          //exon count override
          bigger=&ti;
          smaller=&tj;
      } else {
          bigger=&tj;
          smaller=&ti;
      }
      //maxlen=tj.covlen;
      minlen=ti.covlen;
 }
 if (imax==0 && jmax==0) {
     //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
     if (fuzzSpan) {
       if (dOvlSET) {
           return (ti.exons[0]->overlapLen(tj.exons[0]->start-1, tj.exons[0]->end+1)>0) ? bigger : NULL;
       } else {
          return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
       }
     } else { //boundary containment required
       return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
     }
 }
 //containment is also considered redundancy
 if (smaller->exons.Count()==1) {
   //check if this single exon is contained in any of tj exons
   //without violating any intron-exon boundaries
   return (unsplContained(*smaller, *bigger) ? bigger : NULL);
 }

 //--- from here on: both are multi-exon transcripts: imax>0 && jmax>0
  if (ti.exons[imax]->start<tj.exons[0]->end ||
     tj.exons[jmax]->start<ti.exons[0]->end )
         return NULL; //intron chains do not overlap at all
 //checking full intron chain containment
 uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
 int i=1; //exon idx to the right of the current intron of ti
 int j=1; //exon idx to the right of the current intron of tj
 //find the first intron overlap:
 while (i<=imax && j<=jmax) {
    eistart=ti.exons[i-1]->end;
    eiend=ti.exons[i]->start;
    ejstart=tj.exons[j-1]->end;
    ejend=tj.exons[j]->start;
    if (ejend<eistart) { j++; continue; }
    if (eiend<ejstart) { i++; continue; }
    //we found an intron overlap
    break;
 }
 if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
 if ((i>1 && j>1) || i>imax || j>jmax) {
     return NULL; //either no intron overlaps found at all
                  //or it's not the first intron for at least one of the transcripts
 }
 if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
 int maxIntronOvl=dOvlSET ? 25 : 0;
 if (j>i) {
   //i==1, ti's start must not conflict with the previous intron of tj
   if (ti.start+maxIntronOvl<tj.exons[j-1]->start) return NULL;
   //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
   //so i's first intron starts AFTER j's first intron
   // then j must contain i, so i's last intron must end with or before j's last intron
   if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
 }
 else if (i>j) {
   //j==1, tj's start must not conflict with the previous intron of ti
   if (tj.start+maxIntronOvl<ti.exons[i-1]->start) return NULL;
   //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
   //so j's intron chain starts AFTER i's
   // then i must contain j, so j's last intron must end with or before j's last intron
   if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
 }
 //now check if the rest of the introns overlap, in the same sequence
 i++;
 j++;
 while (i<=imax && j<=jmax) {
   if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
      ti.exons[i]->start!=tj.exons[j]->start) return NULL;
   i++;
   j++;
 }
 i--;
 j--;
 if (i==imax && j<jmax) {
   // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
   if (ti.end>tj.exons[j]->end+maxIntronOvl) return NULL;
   }
 else if (j==jmax && i<imax) {
   if (tj.end>ti.exons[i]->end+maxIntronOvl) return NULL;
   }
 return bigger;
}


int gseqCmpName(const pointer p1, const pointer p2) {
 return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
}


void printLocus(GffLocus* loc, const char* pre) {
  if (pre!=NULL) fprintf(stderr, "%s", pre);
  GMessage(" [%d-%d] : ", loc->start, loc->end);
  GMessage("%s",loc->rnas[0]->getID());
  for (int i=1;i<loc->rnas.Count();i++) {
    GMessage(",%s",loc->rnas[i]->getID());
    }
  GMessage("\n");
}

void preserveContainedCDS(GffObj* tcontainer, GffObj* t) {
 //transfer contained CDS info to the container if t has a CDS but container does not
 if (!t->hasCDS()) return;
  if (!tcontainer->hasCDS())//no CDS info on container, just copy it from the contained
	 tcontainer->setCDS(t);
}

bool exonOverlap2Gene(GffObj* t, GffObj& g) {
	if (t->exons.Count()>0) {
		return t->exonOverlap(g.start, g.end);
	}
	else return g.overlap(*t);
}
bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata) {
  bool keep=false;
  GTData* tdata=NULL;
  //int tidx=-1;
  /*
  if (debug) {
     GMessage(">>Placing transcript %s\n", t->getID());
     debugState=true;
     }
    else debugState=false;
   */
  //dumb TRNA case for RefSeq: gene parent link missing
  //try to restore it here; BUT this only works if gene feature comes first
  ////DEBUG ONLY:
  //if (strcmp(t->getID(),"id24448")==0) { //&& t->start==309180) {
  //	 GMessage("placeGf %s (%d, %d) (%d exons)\n", t->getID(),t->start, t->end, t->exons.Count());
  //}
  //GMessage("DBG>>Placing transcript %s(%d-%d, %d exons)\n", t->getID(), t->start, t->end, t->exons.Count());

  if (t->parent==NULL && t->isTranscript() && trAdoption) {
  	int gidx=gdata->gfs.Count()-1;
  	while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) {
  		GffObj& g = *(gdata->gfs[gidx]);
  		//try to find a container gene object for this transcript
  		//if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) {
  		if (g.isGene() && (t->strand=='.' || t->strand==g.strand) && g.exons.Count()==0
  				  && t->start>=g.start && t->end<=g.end) {
  			if (g.children.IndexOf(t)<0)
  				g.children.Add(t);
  			keep=true;
  			if (tdata==NULL) {
  		       tdata=new GTData(t, gdata); //additional transcript data
  			}
  			t->parent=&g;
  			//disable printing of gene if transcriptsOnly and --keep-genes wasn't given
  			if (transcriptsOnly && !keepGenes) {
  				T_NO_PRINT(g.udata); //tag it as non-printable
  				//keep gene ID and Name into transcript, when we don't print genes
  	  			const char* geneName=g.getAttr("Name");
  	  			if (t->getAttr("Name")==NULL && geneName) {
  	  				t->addAttr("Name", geneName);
  	  				if (t->getAttr("gene_name")==NULL)
  	  					t->addAttr("gene_name", geneName);
  	  			}
  	  			t->addAttr("geneID", g.getID());
  			}
  			break;
  		}
  		--gidx;
  	}
  }
  bool noexon_gfs=false;
  if (t->exons.Count()>0) { //treating this entry as a transcript
	gdata->rnas.Add(t); //added it in sorted order
	if (tdata==NULL) {
	   tdata=new GTData(t, gdata); //additional transcript data
	   //gdata->tdata.Add(tdata);
	}
	keep=true;
  }
   else {
    if (t->isGene() || !this->transcriptsOnly) {
	   gdata->gfs.Add(t);
	   keep=true;
	   if (tdata==NULL) {
		   tdata=new GTData(t, gdata); //additional transcript data
		   //gdata->tdata.Add(tdata);
	   }
	   noexon_gfs=true; //gene-like record, no exons defined
	   keep=true;
    } else {
       return false; //nothing to do with these non-transcript objects
    }
  }
  //keeping track of genes in special cases
	char* geneid=t->getGeneID();
	bool trackGenes=!t->isGene() && ( (keepGenes && t->parent==NULL) ||
			    (ensembl_convert && startsWith(t->getID(), "ENS") ) ) ;
	if (trackGenes) {
		GTData* tdata=(GTData*)(t->uptr);
		//keep track of chr|gene_id data and coordinate range
		if (geneid!=NULL) {
			GeneInfo* ginfo=gene_ids.Find(geneid);
			if (ginfo==NULL) {//first time seeing this gene ID
				GeneInfo* geneinfo=new GeneInfo(t, tdata->gdata, ensembl_convert);
				gene_ids.Add(geneid, geneinfo);
				//if (gfnew!=NULL) //new gene features
				//  gfnew->Add(geneinfo->gf);
			}
			else ginfo->update(t);
		}
	}



  if (!doCluster) return keep;

  if (!keep) return false;

  //---- place into a locus
  if (dOvlSET && t->exons.Count()==1) {
	  //for single exon transcripts temporarily set the strand to '.'
	  //so we can check both strands for overlap/locus
      T_SET_OSTRAND(t->udata, t->strand);
      t->strand='.';
  }
  if (gdata->loci.Count()==0) {
       gdata->loci.Add(new GffLocus(t));
       return true; //new locus on this ref seq
  }
  //--- look for any existing loci overlapping t
  uint t_end=t->end;
  uint t_start=t->start;
  if (dOvlSET) {
	  t_end++;
	  t_start--;
  }
  int nidx=qsearch_gloci(t_end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
  //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
  if (nidx==0) {
     //cannot have any overlapping loci
     //if (debug) GMessage("  <<no ovls possible, create locus %d-%d \n",t->start, t->end);
     gdata->loci.Add(new GffLocus(t));
     return true;
  }
  if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
  int lfound=0; //count of parent loci
  GArray<int> mrgloci(false);
  GList<GffLocus> tloci(true); //candidate parent loci to adopt this
  //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1);
  for (int l=nidx-1;l>=0;l--) {
      GffLocus& loc=*(gdata->loci[l]);
      if ((loc.strand=='+' || loc.strand=='-') && t->strand!='.'&& loc.strand!=t->strand) continue;
      if (t_start>loc.end) {
           if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
           continue;
      }
      if (loc.start>t_end) {
               //this should never be the case if nidx was found correctly
               GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
               continue;
      }

      if (loc.add_gfobj(t, dOvlSET)) {
         //will add this transcript to loc
         lfound++;
         mrgloci.Add(l);
         if (collapseRedundant && !noexon_gfs) {
           //compare to every single transcript in this locus
           for (int ti=0;ti<loc.rnas.Count();ti++) {
                 if (loc.rnas[ti]==t) continue;
                 GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
                 //GMessage("  ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
                 GffObj* container=NULL;
                 if (odata->replaced_by==NULL &&
                      (container=redundantTranscripts(*t, *(loc.rnas[ti])))!=NULL) {
                     if (container==t) {
                        odata->replaced_by=t;
                        preserveContainedCDS(t, loc.rnas[ti]);
                     }
                     else {// t is being replaced by previously defined transcript
                        tdata->replaced_by=loc.rnas[ti];
                        preserveContainedCDS(loc.rnas[ti], t);
                     }
                 }
           }//for each transcript in the exon-overlapping locus
         } //if doCollapseRedundant
      } //overlapping locus
  } //for each existing locus
  if (lfound==0) {
      //overlapping loci not found, create a locus with only this mRNA
      int addidx=gdata->loci.Add(new GffLocus(t));
      if (addidx<0) {
         //should never be the case!
         GMessage("  WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
      }
   }
   else { //found at least one overlapping locus
     lfound--;
     int locidx=mrgloci[lfound];
     GffLocus& loc=*(gdata->loci[locidx]);
     //last locus index found is also the smallest index
     if (lfound>0) {
       //more than one loci found parenting this mRNA, merge loci
       /* if (debug)
          GMessage(" merging %d loci \n",lfound);
       */
       for (int l=0;l<lfound;l++) {
          int mlidx=mrgloci[l];
          loc.addMerge(*(gdata->loci[mlidx]), t);
          gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
       }
     }
     int i=locidx;
     while (i>0 && loc<*(gdata->loci[i-1])) {
       //bubble down until it's in the proper order
       i--;
       gdata->loci.Swap(i,i+1);
     }
  }//found at least one overlapping locus
  return true;
}

void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
	int locus_num=0;
	for (int g=0;g<ref_data.Count();g++) {
		GenomicSeqData* gdata=ref_data[g];
		for (int l=0;l<gdata->loci.Count();l++) {
			GffLocus& loc=*(gdata->loci[l]);
			GHash<int> gnames(true); //gene names in this locus
			//GHash<int> geneids(true); //Entrez GeneID: numbers
			GHash<int> geneids(true);
			int fstrand=0,rstrand=0,ustrand=0;
			for (int i=0;i<loc.rnas.Count();i++) {
				GffObj& t=*(loc.rnas[i]);
				char tstrand=(char) T_OSTRAND(t.udata);
				if (tstrand==0) tstrand=t.strand;
				if (tstrand=='+') fstrand++;
				 else if (tstrand=='-') rstrand++;
				   else ustrand++;
				GStr gname(t.getGeneName());
				if (!gname.is_empty()) {
					gname.upper();
					int* prevg=gnames.Find(gname.chars());
					if (prevg!=NULL) (*prevg)++;
					else gnames.Add(gname, new int(1));
				}
				GStr geneid(t.getGeneID());
				if (!geneid.is_empty())
					geneids.Add(geneid.chars());
				//parse GeneID xrefs, if any (RefSeq):
				/*
				GStr xrefs(t.getAttr("xrefs"));
				if (!xrefs.is_empty()) {
					xrefs.startTokenize(",");
					GStr token;
					while (xrefs.nextToken(token)) {
						token.upper();
						if (token.startsWith("GENEID:")) {
							token.cut(0,token.index(':')+1);
							int* prevg=geneids.Find(token.chars());
							if (prevg!=NULL) (*prevg)++;
							else geneids.Add(token, new int(1));
						}
					} //for each xref
				} //xrefs parsing
				*/
			}//for each transcript
            if ((fstrand>0 && rstrand>0) ||
            		 (fstrand==0 && rstrand==0)) loc.strand='.';
            else if (fstrand==0 && rstrand>0) loc.strand='-';
            else loc.strand='+';
			for (int i=0;i<loc.gfs.Count();i++) {
				GffObj& nt=*(loc.gfs[i]);
				if (nt.isGene()) {
					GStr gname(nt.getGeneName());
					if (!gname.is_empty()) {
						gname.upper();
						int* prevg=gnames.Find(gname.chars());
						if (prevg!=NULL) (*prevg)++;
						else gnames.Add(gname, new int(1));
					}
					GStr geneid(nt.getID());
					if (!geneid.is_empty()) {
						geneids.Add(geneid.chars(), new int(1));
					}
				}
				//parse GeneID xrefs, if any (RefSeq):
				/*
				GStr xrefs(nt.getAttr("xrefs"));
				if (!xrefs.is_empty()) {
					xrefs.startTokenize(",");
					GStr token;
					while (xrefs.nextToken(token)) {
						token.upper();
						if (token.startsWith("GENEID:")) {
							token.cut(0,token.index(':')+1);
							int* prevg=geneids.Find(token.chars());
							if (prevg!=NULL) (*prevg)++;
							else geneids.Add(token, new int(1));
						}
					} //for each xref
				} //xrefs parsing
				*/
			}//for each non-transcript (genes?)
			if (covInfo) {
				for (int m=0;m<loc.mexons.Count();m++) {
					if (loc.strand=='+')
						gdata->f_bases+=loc.mexons[m].len();
					else if (loc.strand=='-')
						gdata->r_bases+=loc.mexons[m].len();
					else gdata->u_bases+=loc.mexons[m].len();
				}
			}
			locus_num++;
			loc.locus_num=locus_num;
			if (gnames.Count()>0) { //collect all gene names associated to this locus
				gnames.startIterate();
				int* gfreq=NULL;
				char* key=NULL;
				while ((gfreq=gnames.NextData(key))!=NULL) {
					loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
				}
			} //added collected gene_names
			if (geneids.Count()>0) { //collect all GeneIDs names associated to this locus
				geneids.startIterate();
				int* gfreq=NULL;
				char* key=NULL;
				while ((gfreq=geneids.NextData(key))!=NULL) {
					loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
				}
			}
		} //for each locus
	}//for each genomic sequence
}

void GffLoader::loadRefNames(GStr& flst) {
 //load the whole file and split by (' \t\n\r,'
	int64_t fsize=fileSize(flst.chars());
	if (fsize<0) GError("Error: could not get file size for %s !\n",
			flst.chars());
	GStr slurp("", fsize+1);
	//sanity check for file size?
	FILE* f=fopen(flst.chars(), "r");
	if (f==NULL)
		GError("Error: could not open file %s !\n", flst.chars());
	slurp.read(f, NULL);
	fclose(f);
	slurp.startTokenize(" ,;\t\r\n", tkCharSet);
	GStr refname;
	while (slurp.nextToken(refname)) {
		if (refname.is_empty()) continue;
		names->gseqs.addName(refname.chars());
	}
}

GenomicSeqData* getGSeqData(GList<GenomicSeqData>& seqdata, int gseq_id) {
	int i=-1;
	GenomicSeqData f(gseq_id);
	GenomicSeqData* gdata=NULL;
	if (seqdata.Found(&f,i)) gdata=seqdata[i];
	else { //entry not created yet for this genomic seq
		gdata=new GenomicSeqData(gseq_id);
		seqdata.Add(gdata);
	}
	return gdata;
}

void warnPseudo(GffObj& m) {
	GMessage("Info: pseudo gene/transcript record with ID=%s discarded.\n",m.getID());
}

void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsecomment) {
	if (f==NULL) GError("Error: GffLoader::load() cannot be called before ::openFile()!\n");
	GffReader* gffr=new GffReader(f, this->transcriptsOnly, true); //not only mRNA features, sorted
	clearHeaderLines();
	gffr->showWarnings(verbose);
	//           keepAttrs   mergeCloseExons  noExonAttr
	gffr->gene2Exon(gene2exon);
	if (BEDinput) gffr->isBED(true);
	//if (TLFinput) gffr->isTLF(true);
	gffr->mergeCloseExons(mergeCloseExons);
	gffr->keepAttrs(fullAttributes, gatherExonAttrs, keep_AllExonAttrs);
	gffr->keepGenes(keepGenes);
	gffr->setIgnoreLocus(ignoreLocus);
	gffr->setRefAlphaSorted(this->sortRefsAlpha);
	if (keepGff3Comments && gf_parsecomment!=NULL) gffr->setCommentParser(gf_parsecomment);
    int outcounter=0;
	if (streamIn) { //this will ignore any clustering options
		GffObj* t=NULL;
		while ((t=gffr->readNext())!=NULL) {
			if (!validateGffRec(t)) {
				delete t;
				continue;
			}
			if (process_transcript(gfasta, *t)) {
				outcounter++;
				if (f_out) {
				  if (fmtTable)
						printTableData(f_out, *t);
				  else //GFF3, GTF, BED, TLF
					t->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
				}
			}
			delete t;
		}
		delete gffr;
		return;
	}


	gffr->readAll();
	GVec<int> pseudoFeatureIds; //feature type: pseudo*
	GVec<int> pseudoAttrIds;  // attribute: [is]pseudo*=true/yes/1
	GVec<int> pseudoTypeAttrIds;  // attribute: *_type=pseudo*

	if (this->noPseudo) {
		GffNameList& fnames = GffObj::names->feats; //gffr->names->feats;
		for (int i=0;i<fnames.Count();i++) {
			char* n=fnames[i]->name;
			if (startsWith(n, "pseudo")) {
				pseudoFeatureIds.Add(fnames[i]->idx);
			}
		}
		GffNameList& attrnames = GffObj::names->attrs;//gffr->names->attrs;
		for (int i=0;i<attrnames.Count();i++) {
			char* n=attrnames[i]->name;
			if (endsiWith(n, "type")) {
				pseudoTypeAttrIds.Add(attrnames[i]->idx);
			}// else {
			char* p=strifind(n, "pseudo");
			if (p==n || (p==n+2 && tolower(n[0])=='i' && tolower(n[1])=='s') ||
					(p==n+3 && startsiWith(n, "is_")) ) {
				pseudoAttrIds.Add(attrnames[i]->idx);
			}
			//}
		}
	}

	if (verbose) GMessage("   .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
	//add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
	for (int k=0;k<gffr->gflst.Count();k++) {
		GffObj* m=gffr->gflst[k];
		if (strcmp(m->getFeatureName(), "locus")==0 &&
				m->getAttr("transcripts")!=NULL) {
			continue; //discard locus meta-features
		}
		if (this->noPseudo) {
			bool is_pseudo=false;
			for (int i=0;i<pseudoFeatureIds.Count();++i) {
				if (pseudoFeatureIds[i]==m->ftype_id) {
					is_pseudo=true;
					break;
				}
			}
			if (is_pseudo) {
				if (verbose) warnPseudo(*m);
				continue;
			}
			for (int i=0;i<pseudoAttrIds.Count();++i) {
				char* attrv=NULL;
				if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoAttrIds[i]);
				if (attrv!=NULL) {
					char fc=tolower(attrv[0]);
					if (fc=='t' || fc=='y' || fc=='1') {
						is_pseudo=true;
						break;
					}
				}
			}
			if (is_pseudo) {
				if (verbose) warnPseudo(*m);
				continue;
			}
			//  *type=*_pseudogene
            //find all attributes ending with _type and have value like: *_pseudogene
			for (int i=0;i<pseudoTypeAttrIds.Count();++i) {
				char* attrv=NULL;
				if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoTypeAttrIds[i]);
				if (attrv!=NULL &&
						(startsWith(attrv, "pseudogene") || endsWith(attrv, "_pseudogene")) ) {
					is_pseudo=true;
					break;
				}
			}
			if (is_pseudo) {
				if (verbose) warnPseudo(*m);
				continue;
			}
		} //pseudogene detection requested
		char* rloc=m->getAttr("locus");
		if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
			m->removeAttr("locus", rloc);
		}
		if (forceExons) {
			m->subftype_id=gff_fid_exon;
		}
		//GList<GffObj> gfadd(false,false); -- for gf_validate()?
		if (!validateGffRec(m)) {
			continue;
		}
		m->isUsed(true); //so the gffreader won't destroy it
		GenomicSeqData* gdata=getGSeqData(seqdata, m->gseq_id);
		bool keep=placeGf(m, gdata);
		if (!keep) {
			m->isUsed(false);
			//DEBUG
			//GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end);
		}
	} //for each read gffObj
	//if (verbose) GMessage("  .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
	//if (f && f!=stdin) { fclose(f); f=NULL; }
	delete gffr;
}