File: gzFasta.cc

package info (click to toggle)
stacks 2.55%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,420 kB
  • sloc: cpp: 38,596; perl: 1,337; sh: 539; python: 497; makefile: 144
file content (145 lines) | stat: -rw-r--r-- 4,172 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
// Copyright 2013-2017, Julian Catchen <jcatchen@illinois.edu>
//
// This file is part of Stacks.
//
// Stacks is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Stacks is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
//

#include "gzFasta.h"

#ifdef HAVE_LIBZ

GzFasta::GzFasta(const char *path) : Input(), gz_fh(NULL)
{
    this->open(path);
}

void
GzFasta::open(const char *path)
{
    this->gz_fh = gzopen(path, "rb");
    if (!this->gz_fh) {
        cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
        exit(EXIT_FAILURE);
    }
    #if ZLIB_VERNUM >= 0x1240
    gzbuffer(this->gz_fh, libz_buffer_size);
    #endif
    int first = gzgetc(gz_fh);
    if (first == -1) {
        int errnum;
        const char* errstr = gzerror(gz_fh, &errnum);
        cerr << "Error: Failed to read any content from '" << path
            << "' (gzerror: " << errstr << ").\n";
        exit(EXIT_FAILURE);
    } else if (first != '>') {
        cerr << "Error: '" << path << "': not in fasta format (expected '>', got 0x"
             << std::hex << first << " '" << flush << (char) first << "')."
             << " (note: using zlib version " << zlibVersion()
             << "; compiled with zlib version " << ZLIB_VERSION << ")\n" << flush;
        exit(EXIT_FAILURE);
    }
    gzrewind(gz_fh);
}

Seq *
GzFasta::next_seq()
{
    #ifdef DEBUG
    DOES_NOT_HAPPEN; // As this function isn't efficient.
    #endif
    Seq* s = new Seq();
    s->id = new char[id_len];
    if(!next_seq(*s)) {
        delete s;
        return NULL;
    }
    return s;
}

int
GzFasta::next_seq(Seq &s)
{
    this->buf.clear();

    //
    // Check the contents of the line buffer. When we finish reading a FASTA record
    // the buffer will either contain whitespace or the header of the next FAST
    // record.
    //
    while (this->line[0] != '>' && !gzeof(this->gz_fh)) {
        gzgets(this->gz_fh, this->line, max_len);
    }

    if (gzeof(this->gz_fh)) {
        return false;
    }

    //
    // Check if there is a carraige return in the buffer
    //
    uint len = strlen(this->line);
    if (len >= 1 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
    if (len >= 2 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';

    //
    // Check if the ID line of the FASTA file has a comment after the ID.
    //
    char* q = this->line + 1;
    ++q;
    while (*q != '\0' && *q != ' ' && *q != '\t')
        ++q;
    if (*q != '\0') {
        // Comment present.
        *q = '\0';
        ++q;
        s.comment.assign(q);
    }
    assert(s.id != NULL);
    strcpy(s.id, this->line + 1);

    //
    // Read the sequence from the file -- keep reading lines until we reach the next
    // record or the end of file.
    //
    gzgets(this->gz_fh, this->line, max_len);

    while (this->line[0] != '>' && !gzeof(this->gz_fh)) {
        len = strlen(this->line);
        if (len >= 1 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
        if (len >= 2 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';

        this->buf    += this->line;
        this->line[0] = '\0';
        gzgets(this->gz_fh, this->line, max_len);
    }

    if (gzeof(this->gz_fh)) {
        len = strlen(this->line);
        if (len >= 1 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
        if (len >= 2 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';

        this->buf += this->line;
        this->line[0] = '\0';
    }

    s.reserve(buf.length(), false);
    strcpy(s.seq, this->buf.c_str());

    return true;
}

#endif // HAVE_LIBZ