File: fasta-file.hpp

package info (click to toggle)
sra-sdk 3.0.3%2Bdfsg-6~deb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 165,852 kB
  • sloc: ansic: 374,775; cpp: 232,734; perl: 8,959; java: 6,253; sh: 6,032; python: 3,890; makefile: 1,046; yacc: 703; xml: 310; lex: 235
file content (63 lines) | stat: -rw-r--r-- 2,171 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/*==============================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 */

#include <string>
#include <vector>
#include <iostream>

/*
 * Fasta files:
 *  Fasta file consists of one of more sequences.  A sequence in a fasta file
 *  consists of a seqid line followed by lines containing the bases of the
 *  sequence.  A seqid line starts with '>' and the next word (whitespace
 *  delimited) is the seqid.
 */

class FastaFile {
    FastaFile() : data(NULL) {}
    FastaFile(std::istream &is);

    void *data;
public:
    struct Sequence {
        std::string SEQID;
        std::string SEQID_LINE;
        char const *data;
        unsigned length;
        bool hadErrors; // erroneous base values are replaced with N
    };

    std::vector<Sequence const> sequences;

    ~FastaFile() {
        free(data);
        data = nullptr;
    }

    static FastaFile load(std::istream &is) {
        return FastaFile(is);
    }
    static FastaFile load(std::string const filename);
};