File: readFasta.h

package info (click to toggle)
librostlab 1.0.20-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 1,616 kB
  • sloc: sh: 10,131; cpp: 826; makefile: 109
file content (107 lines) | stat: -rw-r--r-- 3,443 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
    Copyright (C) 2011 Laszlo Kajan, Technical University of Munich, Germany

    This file is part of librostlab.

    librostlab is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef ROSTLAB_READFASTA
#define ROSTLAB_READFASTA 1

#include <boost/regex.hpp>
#include <iostream>
#include <fstream>
#include "rostlab/rostlab_stdexcept.h"

namespace bo = boost;

namespace rostlab {
namespace bio {

  /// Format tag types used as the template argument of bio::seq.
  namespace fmt {

    /// Empty tag class standing for the FASTA format.
    class fasta {};

  } // namespace fmt

/**
 * A sequence record: description, display identifier and sequence string.
 *
 * @tparam _FmtT  Tag type naming the file format of the record (for example
 *                bio::fmt::fasta).  The class itself does not use the tag; it
 *                only makes records of different formats distinct types, so
 *                that stream operators can be overloaded per format.
 */
template<typename _FmtT>
class seq {
  private:
    std::string        _desc;        // description text of the record
    std::string        _display_id;  // identifier of the record
    std::string        _seqstr;      // the sequence itself
  public:
                  /// Create a record whose three strings are all empty.
                  seq(){}
                  /// Create a record from copies of the given description, identifier and sequence.
                  seq( const std::string& __desc, const std::string& __display_id, const std::string& __seqstr ) : _desc(__desc), _display_id(__display_id), _seqstr(__seqstr) {}
    virtual       ~seq(){}

    /// Mutable access to the sequence string.
    std::string&       seqstr(){ return _seqstr; }
    /// Read-only access to the sequence string (usable on const records).
    const std::string& seqstr() const { return _seqstr; }
    /// Read-only access to the description.
    const std::string& desc() const { return _desc; }
    /// Read-only access to the display identifier.
    const std::string& display_id() const { return _display_id; }
};

/*template<> // could specialize it...
class seq<bio::fmt::fasta>
{
  private:
  public:
};*/

/*template<typename _FmtT>
std::istream&          operator>>( std::istream& __is, bio::seq<_FmtT>& __n )
{
  return __is;
}*/

inline std::istream&   operator>>( std::istream& __is, bio::seq<bio::fmt::fasta>& __seq )
{
  // based on Bio/SeqIO/fasta.pm
  std::string rec; rec.reserve(1024);
  while( __is.peek() != std::istream::traits_type::eof() )
  {
    if(rec.capacity() == rec.size()) rec.reserve(rec.capacity() * 2);
    if( rec.size() && __is.peek() == '>' && *rec.rbegin() == '\n' ) break;
    else rec += __is.get();
  }

  if( !rec.size() || *rec.begin() != '>' ) throw runtime_error( std::string("FASTA syntax error in record '") + rec + "': no leading '>'" );

  rec = bo::regex_replace( rec, bo::regex("^>"), "" ); // $entry =~ s/^>//;

  bo::sregex_token_iterator i(rec.begin(), rec.end(), bo::regex("\n"), -1); // split(/\n/,$entry,2);

  if( i == boost::sregex_token_iterator() ) throw runtime_error( std::string("FASTA syntax error in record '") + rec + "': only one line" );

  std::string top = *i++;
  std::string sequence( i->first, static_cast<std::string::const_iterator>( rec.end() ) );

  sequence = bo::regex_replace( sequence, bo::regex(">"), "" ); // $sequence =~ s/>//g;

  bo::match_results<std::string::const_iterator> what;
  std::string id, fulldesc;
  if( bo::regex_search( top, what, bo::regex("^[[:space:]]*([^[:space:]]+)[:space:]*(.*)") ) )
  { id = std::string( what[1].first, what[1].second ); fulldesc = std::string( what[2].first, what[2].second ); }

  if( id.empty() ) id = fulldesc;

  sequence = bo::regex_replace( sequence, bo::regex("[ \t\n\r]"), "" );

  // alphabet? would be good to have this

  __seq = bio::seq<bio::fmt::fasta>( fulldesc, id, sequence );

  return __is;
}

}; // namespace bio
}; // namespace rostlab

#endif /* ROSTLAB_READFASTA */
// vim:et:ts=2:ai: