File: normalizer.hh

package info (click to toggle)
ruby-unf-ext 0.0.6-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,128 kB
  • ctags: 147
  • sloc: cpp: 14,043; ruby: 79; makefile: 2
file content (139 lines) | stat: -rw-r--r-- 4,432 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#ifndef UNF_NORMALIZER_HH
#define UNF_NORMALIZER_HH

#include <vector>
#include <string>
#include <algorithm>
#include <cstring>
#include "trie/searcher.hh"
#include "trie/char_stream.hh"
#include "table.hh"
#include "util.hh"

namespace UNF {
  /**
   * Unicode normalizer operating on NUL-terminated byte strings
   * (presumably UTF-8, given the Util::nearest_utf8_char_start_point helper).
   *
   * All public entry points return either:
   *   - `src` itself, when no character needing normalization was found, or
   *   - a pointer into an internal std::string buffer owned by this object.
   *
   * Consequences for callers:
   *   - The returned pointer is only valid until the next normalization call
   *     on the same instance (the internal buffer is reassigned each time).
   *   - `src` must not point into a buffer previously returned by this
   *     instance: the internal buffer is overwritten while `src` is still
   *     being read.
   *   - The object keeps mutable scratch state, so one instance must not be
   *     used from several threads at once without external synchronization.
   */
  class Normalizer {
  public:
    /** Normalization forms selectable through normalize(). */
    enum Form { FORM_NFD, FORM_NFC, FORM_NFKD, FORM_NFKC };

  public:
    /** Binds every lookup structure to its root inside the shared TABLE data. */
    Normalizer()
      : nf_d(TABLE::NODES, TABLE::CANONICAL_DECOM_ROOT, TABLE::STRINGS),
        nf_kd(TABLE::NODES, TABLE::COMPATIBILITY_DECOM_ROOT, TABLE::STRINGS),
        nf_c(TABLE::NODES, TABLE::CANONICAL_COM_ROOT, TABLE::STRINGS),
        nf_c_qc(TABLE::NODES, TABLE::NFC_ILLEGAL_ROOT),
        nf_kc_qc(TABLE::NODES, TABLE::NFKC_ILLEGAL_ROOT),
        ccc(TABLE::NODES, TABLE::CANONICAL_CLASS_ROOT)
    {}

    /**
     * Normalizes `src` to the requested form.
     *
     * @param src  NUL-terminated input string.
     * @param form One of the Form enumerators.
     * @return The normalized string (see class comment for lifetime rules).
     *         An unrecognized `form` value returns `src` unchanged.
     */
    const char* normalize(const char* src, Form form) {
      switch(form) {
      case FORM_NFD:  return nfd(src);
      case FORM_NFC:  return nfc(src);
      case FORM_NFKD: return nfkd(src);
      case FORM_NFKC: return nfkc(src);
      default:        return src;
      }
    }

    /** Decomposes `src` using the canonical decomposition table. */
    const char* nfd(const char* src)  { return decompose(src, nf_d); }
    /** Decomposes `src` using the compatibility decomposition table. */
    const char* nfkd(const char* src) { return decompose(src, nf_kd); }
    /** Composes `src`: NFC quick-check trie for scanning, canonical decomposition before composing. */
    const char* nfc(const char* src)  { return compose(src, nf_c_qc, nf_d); }
    /** Composes `src`: NFKC quick-check trie for scanning, compatibility decomposition before composing. */
    const char* nfkc(const char* src) { return compose(src, nf_kc_qc, nf_kd); }

  private:
    /**
     * Decomposition driver.
     *
     * Scans for the first segment that needs work; if there is none, `src` is
     * returned as-is without copying. Otherwise the already-valid prefix is
     * copied into `buffer`, and the rest is processed as alternating
     * "decompose this segment" / "copy this valid run verbatim" steps.
     */
    const char* decompose(const char* src, const Trie::NormalizationForm& nf) {
      const char* beg = next_invalid_char(src, nf);
      if(*beg=='\0')
        return src;

      buffer.assign(src, beg);
      do {
        // [beg, end) is the segment to decompose; [end, next beg) is copied verbatim.
        const char* end = next_valid_starter(beg, nf);
        decompose_one(beg, end, nf, buffer);
        beg = next_invalid_char(end, nf);
        buffer.append(end, beg);
      } while(*beg!='\0');

      return buffer.c_str();
    }

    /**
     * Appends the decomposition of [beg, end) to `buf`, then applies canonical
     * combining class ordering to the bytes that were just appended.
     * As a side effect `canonical_classes` is refilled for that appended range.
     */
    void decompose_one(const char* beg, const char* end, const Trie::NormalizationForm& nf, std::string& buf) {
      // size_type rather than unsigned: avoids truncating the offset on LP64 targets.
      const std::string::size_type last = buf.size();
      nf.decompose(Trie::RangeCharStream(beg,end), buf);
      // &buf[0] yields a mutable pointer to the contiguous storage; writing through
      // a const_cast of the const data() overload is undefined behaviour.
      char* bufbeg = &buf[0];
      canonical_combining_class_ordering(bufbeg+last, bufbeg+buf.size());
    }

    /**
     * Composition driver.
     *
     * @param nf        Trie used for the quick check while scanning.
     * @param nf_decomp Trie used to decompose each segment before composing it.
     *
     * Like decompose(), returns `src` untouched when nothing needs work. Each
     * segment is first decomposed into `buffer2`, then composed into `buffer`.
     */
    const char* compose(const char* src, const Trie::NormalizationForm& nf, const Trie::NormalizationForm& nf_decomp) {
      const char* beg = next_invalid_char(src, nf);
      if(*beg=='\0')
        return src;

      buffer.assign(src, beg);
      while(*beg!='\0') {
        const char* end = next_valid_starter(beg, nf);
        buffer2.clear();
        decompose_one(beg, end, nf_decomp, buffer2);
        // compose_one reports where the input stream stopped; scanning resumes there.
        end = compose_one(buffer2.c_str(), end, buffer);
        beg = next_invalid_char(end, nf);
        buffer.append(end, beg);
      }

      return buffer.c_str();
    }

    /**
     * Composes the decomposed segment `starter`, with `rest_starter` as the
     * continuation of the original input, appending the result to `buf`.
     * Relies on `canonical_classes` as left by the preceding decompose_one()
     * call and uses `buffer3` as scratch space for the composition stream.
     *
     * @return The stream's current position once it has left the first range
     *         (NOTE(review): presumably a pointer at or after `rest_starter`
     *         in the original input - confirm against CharStreamForComposition).
     */
    const char* compose_one(const char* starter, const char* rest_starter, std::string& buf) {
      Trie::CharStreamForComposition in(starter, rest_starter, canonical_classes, buffer3);
      while(in.within_first())
        nf_c.compose(in, buf);
      return in.cur();
    }

    /**
     * Resets `canonical_classes` to (end-beg+1) zeroes and lets `ccc` sort
     * the bytes starting at `beg`, filling in the class values.
     */
    void canonical_combining_class_ordering(char* beg, const char* end) {
      canonical_classes.assign(end-beg+1, 0); // +1 is for sentinel value
      ccc.sort(beg, canonical_classes);
    }

    /**
     * Finds the start of the next segment that needs normalization.
     *
     * Walks character by character from `src`. On meeting a character that
     * either fails `nf.quick_check` or has a non-zero combining class lower
     * than its predecessor's (i.e. is out of canonical order), returns the
     * most recent class-0 character seen (or the scan start if none).
     *
     * @return Pointer to that starter, or to the terminating NUL when the
     *         whole remainder is already valid.
     */
    const char* next_invalid_char(const char* src, const Trie::NormalizationForm& nf) const {
      int last_canonical_class = 0;
      const char* cur = Util::nearest_utf8_char_start_point(src);
      const char* starter = cur;

      for(; *cur != '\0'; cur = Util::nearest_utf8_char_start_point(cur+1)) {
        int canonical_class = ccc.get_class(cur);
        if(last_canonical_class > canonical_class && canonical_class != 0)
          return starter;

        if(nf.quick_check(cur)==false)
          return starter;

        if(canonical_class==0)
          starter=cur;

        last_canonical_class = canonical_class;
      }
      return cur;
    }

    /**
     * Skips the character at `src` and returns the next character that has
     * combining class 0 and passes `nf.quick_check`.
     *
     * Precondition: `*src != '\0'` (the scan starts at src+1).
     * NOTE(review): there is no explicit NUL test in the loop; termination
     * presumably relies on the NUL byte having class 0 and passing the quick
     * check - confirm against the trie implementation.
     */
    const char* next_valid_starter(const char* src, const Trie::NormalizationForm& nf) const {
      const char* cur = Util::nearest_utf8_char_start_point(src+1);
      while(ccc.get_class(cur)!=0 || nf.quick_check(cur)==false)
        cur = Util::nearest_utf8_char_start_point(cur+1);
      return cur;
    }

  private:
    // Lookup structures, all views over the shared TABLE data.
    const Trie::NormalizationForm nf_d;      // canonical decomposition
    const Trie::NormalizationForm nf_kd;     // compatibility decomposition
    const Trie::NormalizationForm nf_c;      // canonical composition
    const Trie::NormalizationForm nf_c_qc;   // quick check built from NFC_ILLEGAL_ROOT
    const Trie::NormalizationForm nf_kc_qc;  // quick check built from NFKC_ILLEGAL_ROOT
    const Trie::CanonicalCombiningClass ccc; // combining class lookup / ordering

    std::string buffer;   // result buffer returned to callers
    std::string buffer2;  // per-segment decomposition scratch used by compose()
    std::string buffer3;  // scratch handed to CharStreamForComposition
    std::vector<unsigned char> canonical_classes; // classes for the last decomposed segment
  };
}

#endif