1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
|
#include "libMUSCLE/muscle.h"
#include <stdio.h>
#include <errno.h>
namespace muscle {
// Minimum capacity, and minimum growth step, of the read buffers in bytes.
//const int BUFFER_BYTES = 16*1024;
const int BUFFER_BYTES = 128;

// Line-ending characters recognised by the FASTA reader.
const int CR = '\r';
const int NL = '\n';

// ADD(c): append one character to a growable, new[]-allocated char array.
//
// The macro operates on three variables that must be in scope at the point
// of use:
//   char *Buffer            current array (0 when nothing is allocated yet)
//   unsigned BufferLength   allocated capacity of Buffer
//   unsigned Pos            number of characters stored so far
//
// When the array is full it is replaced by a larger one and the old
// contents are copied across. Capacity grows geometrically (it doubles,
// with BUFFER_BYTES as the smallest step) so that appending n characters
// copies O(n) bytes in total rather than O(n^2).
//
// The copy is skipped while Buffer is still 0, because passing a null
// pointer to memcpy is undefined behaviour even for a zero byte count.
//
// The expansion is a plain brace block (not do/while) because existing call
// sites invoke ADD(x) without a trailing semicolon.
#define ADD(c) \
	{ \
	if (Pos >= BufferLength) \
		{ \
		const unsigned GrowBy = (BufferLength > (unsigned) BUFFER_BYTES) ? \
			BufferLength : (unsigned) BUFFER_BYTES; \
		const unsigned NewBufferLength = BufferLength + GrowBy; \
		char *NewBuffer = new char[NewBufferLength]; \
		if (0 != Buffer) \
			memcpy(NewBuffer, Buffer, BufferLength); \
		delete[] Buffer; \
		Buffer = NewBuffer; \
		BufferLength = NewBufferLength; \
		} \
	Buffer[Pos++] = (c); \
	}
// Get next sequence from file.
//
// Reads one FASTA record from f, starting at the current stream position.
//
//   f             stdio stream; the next character must be '>' (start of a
//                 record), '=' (XMFA entry terminator line) or end-of-file.
//   ptrSeqLength  on success receives the number of characters stored in
//                 the returned array.
//   ptrLabel      on success receives a new[]-allocated, nul-terminated
//                 copy of the label line (everything after '>' up to, but
//                 not including, the first CR or NL).
//   DeleteGaps    when true, characters for which IsGapChar() is true are
//                 dropped instead of being stored.
//
// Returns a new[]-allocated array holding the sequence characters (letters
// upper-cased, white space removed, NOT nul-terminated), or 0 when the
// stream is at end-of-file or at an '=' terminator line. On a 0 return
// neither *ptrSeqLength nor *ptrLabel holds a usable value. The caller owns
// both returned arrays and releases them with delete[].
//
// Records that contain no sequence characters are skipped. Format and read
// errors are reported through Quit() (presumably fatal -- confirm against
// its definition); unexpected characters in the sequence data are reported
// through Warning() and ignored.
char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps)
{
	// State used by the ADD macro.
	unsigned BufferLength = 0;
	unsigned Pos = 0;
	char *Buffer = 0;

	int c = fgetc(f);
	if (EOF == c)
		return 0;

	// = on a line by itself indicates the end of a Multi-FastA entry in an eXtended Multi-FastA file
	if ('=' == c)
	{
		// Look ahead in a separate variable so that c keeps the value '='.
		int Next = fgetc(f);
		if (NL == Next)
			return 0;
		if (CR == Next)
		{
			Next = fgetc(f);
			if (NL == Next)
				return 0;
		}
		Quit("Invalid file format, '=' may only be used on a line by itself to indicate the end of a MFA entry in an XMFA");
	}

	if ('>' != c)
		Quit("Invalid file format, expected '>' to start FASTA label");

	// Read the label line.
	for (;;)
	{
		c = fgetc(f);
		if (EOF == c)
			Quit("End-of-file or input error in FASTA label");

		// NL or CR terminates label
		if (NL == c || CR == c)
			break;

		// All other characters added to label
		ADD(c);
	}

	// Nul-terminate label
	ADD(0);
	*ptrLabel = Buffer;

	// The label array now belongs to the caller; start a fresh, empty
	// buffer for the sequence data.
	BufferLength = 0;
	Pos = 0;
	Buffer = 0;

	// Last character that was not rejected as invalid; used to recognise
	// '=' and '>' at the start of a line.
	int PreviousChar = NL;
	for (;;)
	{
		c = fgetc(f);
		if ('=' == c && NL == PreviousChar)
		{
			// use = on a line by itself to signify the end of a multi-fasta entry
			// Push it back so that the next call sees the terminator.
			ungetc(c, f);
			break;
		}

		if (EOF == c)
		{
			if (feof(f))
				break;
			else if (ferror(f))
				Quit("Error reading FASTA file, ferror=TRUE feof=FALSE errno=%d %s",
				  errno, strerror(errno));
			else
				Quit("Error reading FASTA file, fgetc=EOF feof=FALSE ferror=FALSE errno=%d %s",
				  errno, strerror(errno));
		}

		if ('>' == c)
		{
			if (NL == PreviousChar || CR == PreviousChar)
			{
				// Start of the next record: leave it for the next call.
				ungetc(c, f);
				break;
			}
			else
				Quit("Unexpected '>' in FASTA sequence data");
		}
		else if (isspace(c))
			;	// White space is skipped but still recorded in PreviousChar.
		else if (IsGapChar(c))
		{
			if (!DeleteGaps)
			{
				ADD(c);
			}
		}
		else if (isalpha(c))
		{
			c = toupper(c);
			ADD(c);
		}
		else if (isprint(c))
		{
			Warning("Invalid character '%c' in FASTA sequence data, ignored", c);
			continue;
		}
		else
		{
			Warning("Invalid byte hex %02x in FASTA sequence data, ignored", (unsigned char) c);
			continue;
		}
		PreviousChar = c;
	}

	if (0 == Pos)
	{
		// Empty record: skip it and try the next one. The label read above
		// would otherwise be leaked, because the nested call either
		// overwrites *ptrLabel or returns 0 without touching it. No sequence
		// array exists here (Buffer is still 0 when nothing was stored).
		delete[] *ptrLabel;
		*ptrLabel = 0;
		return GetFastaSeq(f, ptrSeqLength, ptrLabel, DeleteGaps);
	}

	*ptrSeqLength = Pos;
	return Buffer;
}
}
|