File: seq.c

package info (click to toggle)
maq 0.7.1-10
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,500 kB
  • sloc: cpp: 5,025; ansic: 3,333; sh: 3,303; perl: 2,547; makefile: 31
file content (109 lines) | stat: -rw-r--r-- 2,947 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include "seq.h"

static int SEQ_BLOCK_SIZE = 512;

void seq_set_block_size(int size)
{
	SEQ_BLOCK_SIZE = size;
}

/* Read sequences from file "fp" in FASTA format. Sequence will be saved
 * in "seq", sequence ID in "locus", and comment saved in "comment",
 * provided "comment != 0". Sequence length will be returned. If -1 is
 * returned, no sequence is left in the file. */
int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
{
	int c, l, max;
	char *p;
	
	c = 0;
	while (!feof(fp) && fgetc(fp) != '>');
	if (feof(fp)) return -1;
	p = locus;
	while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
		if (c != '\r') *p++ = c;
	*p = '\0';
	if (comment) {
		p = comment;
		if (c != '\n') {
			while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
			if (c != '\n') {
				*p++ = c;
				while (!feof(fp) && (c = fgetc(fp)) != '\n')
					if (c != '\r') *p++ = c;
			}
		}
		*p = '\0';
	} else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
	l = 0; max = seq->m;
	while (!feof(fp) && (c = fgetc(fp)) != '>') {
		if (isalpha(c) || c == '-' || c == '.') {
			if (l + 1 >= max) {
				max += SEQ_BLOCK_SIZE;
				seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
			}
			seq->s[l++] = (unsigned char)c;
		}
	}
	if (c == '>') ungetc(c,fp);
	seq->s[l] = 0;
	seq->m = max; seq->l = l;
	return l;
}
int seq_read_fastq(FILE *fp, seq_t *seq, seq_t *qual, char *name)
{
	int c, l, max;
	char *p;
	
	c = 0;
	/* first read the sequence */
	while (!feof(fp) && fgetc(fp) != '@');
	if (feof(fp)) return -1;
	p = name;
	while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
		if (c != '\r') *p++ = c;
	*p = '\0';
	if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
	l = 0; max = seq->m;
	while (!feof(fp) && (c = fgetc(fp)) != '+') {
		if (isalpha(c) || c == '-' || c == '.') {
			if (l + 1 >= max) {
				max += SEQ_BLOCK_SIZE;
				seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
			}
			seq->s[l++] = (unsigned char)c;
		}
	}
	if (c == '+') ungetc(c, fp);
	seq->s[l] = 0;
	seq->m = max; seq->l = l;
	/* now come to the qualities */
	while (!feof(fp) && fgetc(fp) != '+');
	p = name;
	while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
		if (c != '\r' && *p++ != c) {
			fprintf(stderr, "[seq_read_fastq] Inconsistent sequence name: %s. Continue anyway.\n", name);
			return seq->l;
		}
	l = 0; max = qual->m;
	while (!feof(fp) && l < seq->l) {
		c = fgetc(fp);
		if (c >= 33 && c < 127) {
			if (l + 1 >= max) {
				max += SEQ_BLOCK_SIZE;
				qual->s = (unsigned char*)realloc(qual->s, sizeof(char) * max);
			}
			qual->s[l++] = (unsigned char)(c - 33);
		}
	}
	c = fgetc(fp);
	if (c == '@') ungetc(c, fp);
	qual->s[l] = 0;
	qual->m = max; qual->l = l;
	if (seq->l != qual->l)
		fprintf(stderr, "[seq_read_fastq] Inconsistent length: %d!=%d. Continue anyway.", seq->l, qual->l);
	return seq->l;
}