File: readreps.cpp

package info (click to toggle)
piler 0~20140707-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 360 kB
  • sloc: cpp: 5,369; makefile: 39
file content (126 lines) | stat: -rwxr-xr-x 3,050 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include "piler2.h"

// Destructive: pokes Rec.Attrs
static char *GetRepeat(GFFRecord &Rec)
	{
	char *Attrs = Rec.Attrs;
	char *Start = strstr(Attrs, "Repeat");
	if (0 == Start)
		Quit("GFF line %d, repeat record does not have Repeat attribute", GFFLineNr);

	char *SemiColon = strchr(Start, ';');
	if (0 != SemiColon)
		*SemiColon = 0;
	char *Space = strchr(Start, ' ');
	if (0 == Space)
		Quit("GFF repeat record line %d, missing space in attrs", GFFLineNr);
	return Space + 1;
	}

// "Repeat HETRP_DM Satellite 1518 1669 0"
//            0         1       2    3  4
void ParseRepeat(char *Str, RepData &Rep)
	{
	char *Fields[5];
	int FieldCount = GetFields(Str, Fields, 5, ' ');
	if (FieldCount != 5)
		Quit("GFF line %d, Repeat attribute does not have 5 fields");

	const char *RepeatName = Fields[0];
	const char *RepeatClass = Fields[1];
	const char *RepeatFrom = Fields[2];
	const char *RepeatTo = Fields[3];
	const char *RepeatLeft = Fields[4];

	Rep.RepeatName = strsave(RepeatName);
	Rep.RepeatClass = strsave(RepeatClass);
	if (0 == strcmp(RepeatFrom, "."))
		{
		Rep.RepeatFrom = -1;
		Rep.RepeatTo = -1;
		Rep.RepeatLeft = -1;
		}
	else
		{
		Rep.RepeatFrom = atoi(RepeatFrom) - 1;
		Rep.RepeatTo = atoi(RepeatTo) - 1;
		Rep.RepeatLeft = atoi(RepeatLeft);
		}
	}

static int ReadRepsPass1(FILE *f)
	{
	GFFLineNr = 0;
	int RepCount = 0;
	GFFRecord Rec;
	while (GetNextGFFRecord(f, Rec))
		{
		if (0 != strcmp(Rec.Feature, "repeat"))
			{
			static bool WarningIssued = false;
			if (!WarningIssued)
				{
				Warning("GFF record feature '%s' is not a repeat", Rec.Feature);
				WarningIssued = true;
				}
			continue;
			}
		if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
			Warning("GFF line %d: invalid start %d / end %d",
			  GFFLineNr, Rec.Start, Rec.End);
		++RepCount;
		}
	return RepCount;
	}

static int ReadRepsPass2(FILE *f, RepData *Reps)
	{
	rewind(f);

	GFFLineNr = 0;
	int RepCount = 0;
	GFFRecord Rec;
	while (GetNextGFFRecord(f, Rec))
		{
		if (0 != strcmp(Rec.Feature, "repeat"))
			continue;

		static char *Repeat = GetRepeat(Rec);

		RepData &Rep = Reps[RepCount];
		ParseRepeat(Repeat, Rep);

		if (Rec.Start <= 0 || Rec.End <= 0 || Rec.Start > Rec.End)
			Warning("GFF line %d: invalid start %d / end %d",
			  GFFLineNr, Rec.Start, Rec.End);

		const int Length = Rec.End - Rec.Start + 1;

		Rep.ContigLabel = strsave(Rec.SeqName);
		Rep.ContigFrom = Rec.Start - 1;
		Rep.ContigTo = Rep.ContigFrom + Length - 1;

		if (Rec.Strand == '+')
			Rep.Rev = false;
		else if (Rec.Strand == '-')
			Rep.Rev = true;
		else
			Quit("GFF line %d, Invalid strand %c", GFFLineNr, Rec.Strand);

		++RepCount;
		}
	return RepCount;
	}

RepData *ReadReps(const char *FileName, int *ptrRepCount)
	{
	FILE *f = OpenStdioFile(FileName);

	int RepCount = ReadRepsPass1(f);
	RepData *Reps = all(RepData, RepCount);
	ReadRepsPass2(f, Reps);
	fclose(f);

	*ptrRepCount = RepCount;
	return Reps;
	}