File: fasta2.cpp

package info (click to toggle)
libmuscle 3.7%2B4565-7
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 1,912 kB
  • sloc: cpp: 27,959; makefile: 58; sh: 26
file content (137 lines) | stat: -rw-r--r-- 2,997 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#include "libMUSCLE/muscle.h"
#include <stdio.h>
#include <errno.h>

namespace muscle {

//const int BUFFER_BYTES = 16*1024;
const int BUFFER_BYTES = 128;
const int CR = '\r';
const int NL = '\n';

#define ADD(c)															\
		{																\
		if (Pos >= BufferLength)										\
			{															\
			const int NewBufferLength = BufferLength + BUFFER_BYTES;	\
			char *NewBuffer	= new char[NewBufferLength];				\
			memcpy(NewBuffer, Buffer, BufferLength);					\
			delete[] Buffer;											\
			Buffer = NewBuffer;											\
			BufferLength = NewBufferLength;								\
			}															\
		Buffer[Pos++] = c;												\
		}

// Get next sequence from file.
char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps)
	{
	unsigned BufferLength = 0;
	unsigned Pos = 0;
	char *Buffer = 0;

	int c = fgetc(f);
	if (EOF == c)
		return 0;
	// = on a line by itself indicates the end of a Multi-FastA entry in an eXtended Multi-FastA file
	if ('=' == c)
	{
		int c = fgetc(f);
		if (NL == c)
			return 0;
		if (CR == c)
		{
			c = fgetc(f);
			if (NL == c)
				return 0;
		}
		Quit("Invalid file format, '=' may only be used on a line by itself to indicate the end of a MFA entry in an XMFA");
	}

	if ('>' != c)
		Quit("Invalid file format, expected '>' to start FASTA label");

	for (;;)
		{
		int c = fgetc(f);
		if (EOF == c)
			Quit("End-of-file or input error in FASTA label");

	// NL or CR terminates label
		if (NL == c || CR == c)
			break;

	// All other characters added to label
		ADD(c)
		}

// Nul-terminate label
	ADD(0)
	*ptrLabel = Buffer;

	BufferLength = 0;
	Pos = 0;
	Buffer = 0;
	int PreviousChar = NL;
	for (;;)
		{
		int c = fgetc(f);
		if( '=' == c && NL == PreviousChar )
		{
			ungetc(c, f);
			break;	// use = on a line by itself to signify the end of a multi-fasta entry
		}
		if (EOF == c)
			{
			if (feof(f))
				break;
			else if (ferror(f))
				Quit("Error reading FASTA file, ferror=TRUE feof=FALSE errno=%d %s",
				  errno, strerror(errno));
			else
				Quit("Error reading FASTA file, fgetc=EOF feof=FALSE ferror=FALSE errno=%d %s",
				  errno, strerror(errno));
			}

		if ('>' == c)
			{
			if (NL == PreviousChar || CR == PreviousChar)
				{
				ungetc(c, f);
				break;
				}
			else
				Quit("Unexpected '>' in FASTA sequence data");
			}
		else if (isspace(c))
			;
		else if (IsGapChar(c))
			{
			if (!DeleteGaps)
				ADD(c)
			}
		else if (isalpha(c))
			{
			c = toupper(c);
			ADD(c)
			}
		else if (isprint(c))
			{
			Warning("Invalid character '%c' in FASTA sequence data, ignored", c);
			continue;
			}
		else
			{
			Warning("Invalid byte hex %02x in FASTA sequence data, ignored", (unsigned char) c);
			continue;
			}
		PreviousChar = c;
		}

	if (0 == Pos)
		return GetFastaSeq(f, ptrSeqLength, ptrLabel, DeleteGaps);

	*ptrSeqLength = Pos;
	return Buffer;
	}
}