File: tokenizer.hpp

package info (click to toggle)
aspell 0.60.8.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 15,336 kB
  • sloc: cpp: 24,378; sh: 12,340; perl: 1,924; ansic: 1,661; makefile: 852; sed: 16
file content (72 lines) | stat: -rw-r--r-- 1,909 bytes parent folder | download | duplicates (13)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
// This file is part of The New Aspell
// Copyright (C) 2001 by Kevin Atkinson under the GNU LGPL license
// version 2.0 or 2.1.  You should have received a copy of the LGPL
// license along with this library if you did not you can find
// it at http://www.gnu.org/.

#ifndef ACOMMON_TOKENIZER__HPP
#define ACOMMON_TOKENIZER__HPP

#include "char_vector.hpp"
#include "filter_char.hpp"
#include "filter_char_vector.hpp"

namespace acommon {

  // Forward declarations.  Within this header Convert and Speller are
  // only used through pointers (Tokenizer::conv_ and the parameter of
  // new_tokenizer), so their full definitions are not needed here.
  // Config is not referenced anywhere else in this header.
  class Convert;
  class Speller;
  class Config;

  // Abstract tokenizer interface.  advance() is pure virtual, so a
  // concrete subclass has to provide it.  The class also carries a
  // 256-entry character classification table (char_type_) that the
  // is_begin/is_middle/is_end/is_word predicates look up.
  class Tokenizer {

  public:
    Tokenizer();
    virtual ~Tokenizer();

    // NOTE(review): presumably the bounds of the current word and the
    // end of the input -- confirm against the implementation.
    FilterChar * word_begin;
    FilterChar * word_end;
    FilterChar * end;

    // The current word, in its final encoded form.
    CharVector word;

    // Positions that refer back to the original word.
    unsigned int begin_pos;
    unsigned int end_pos;

    // The string passed in _must_ have a null character at stop - 1
    // (i.e. stop must be one past the end).
    void reset (FilterChar * in, FilterChar * stop);

    // True when word_begin and word_end are equal.
    bool at_end() const
    {
      return word_end == word_begin;
    }

    // Returns false if there is nothing left.
    virtual bool advance() = 0;

    // Each predicate below reads one flag of the char_type_ entry
    // selected by the character value c.

    bool is_begin(unsigned char c) const
    {
      const CharType & info = char_type_[c];
      return info.begin;
    }

    bool is_middle(unsigned char c) const
    {
      const CharType & info = char_type_[c];
      return info.middle;
    }

    bool is_end(unsigned char c) const
    {
      const CharType & info = char_type_[c];
      return info.end;
    }

    bool is_word(unsigned char c) const
    {
      const CharType & info = char_type_[c];
      return info.word;
    }

  public: // but don't use
    // The speller class is expected to fill these members in.

    // Classification flags for a single character value; every flag
    // starts out false.
    struct CharType {
      bool begin;
      bool middle;
      bool end;
      bool word;

      CharType()
        : begin(false)
        , middle(false)
        , end(false)
        , word(false)
      {}
    };

    // One entry per possible unsigned char value.
    CharType char_type_[256];
    Convert * conv_;
    FilterCharVector buf_;
  };

  // Returns a new tokenizer and sets it up with the given speller
  // class.  The result is wrapped in PosibErr<>; presumably a setup
  // failure is reported through it -- confirm against the definition.
  // NOTE(review): ownership of the returned pointer is not visible
  // from this header; verify whether the caller must delete it.

  PosibErr<Tokenizer *> new_tokenizer(Speller *);

}

#endif