1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
|
#ifndef UNF_NORMALIZER_HH
#define UNF_NORMALIZER_HH
#include <vector>
#include <string>
#include <algorithm>
#include <cstring>
#include "trie/searcher.hh"
#include "trie/char_stream.hh"
#include "table.hh"
#include "util.hh"
namespace UNF {
class Normalizer {
public:
enum Form { FORM_NFD, FORM_NFC, FORM_NFKD, FORM_NFKC };
public:
Normalizer()
: nf_d(TABLE::NODES, TABLE::CANONICAL_DECOM_ROOT, TABLE::STRINGS),
nf_kd(TABLE::NODES, TABLE::COMPATIBILITY_DECOM_ROOT, TABLE::STRINGS),
nf_c(TABLE::NODES, TABLE::CANONICAL_COM_ROOT, TABLE::STRINGS),
nf_c_qc(TABLE::NODES, TABLE::NFC_ILLEGAL_ROOT),
nf_kc_qc(TABLE::NODES, TABLE::NFKC_ILLEGAL_ROOT),
ccc(TABLE::NODES, TABLE::CANONICAL_CLASS_ROOT)
{}
const char* normalize(const char* src, Form form) {
switch(form) {
case FORM_NFD: return nfd(src);
case FORM_NFC: return nfc(src);
case FORM_NFKD: return nfkd(src);
case FORM_NFKC: return nfkc(src);
default: return src;
}
}
const char* nfd(const char* src) { return decompose(src, nf_d); }
const char* nfkd(const char* src) { return decompose(src, nf_kd); }
const char* nfc(const char* src) { return compose(src, nf_c_qc, nf_d); }
const char* nfkc(const char* src) { return compose(src, nf_kc_qc, nf_kd); }
private:
const char* decompose(const char* src, const Trie::NormalizationForm& nf) {
const char* beg = next_invalid_char(src, nf);
if(*beg=='\0')
return src;
buffer.assign(src, beg);
do {
const char* end = next_valid_starter(beg, nf);
decompose_one(beg, end, nf, buffer);
beg = next_invalid_char(end, nf);
buffer.append(end, beg);
} while(*beg!='\0');
return buffer.c_str();
}
void decompose_one(const char* beg, const char* end, const Trie::NormalizationForm& nf, std::string& buf) {
unsigned last = buf.size();
nf.decompose(Trie::RangeCharStream(beg,end), buf);
char* bufbeg = const_cast<char*>(buf.data());
canonical_combining_class_ordering(bufbeg+last, bufbeg+buf.size());
}
const char* compose(const char* src, const Trie::NormalizationForm& nf, const Trie::NormalizationForm& nf_decomp) {
const char* beg = next_invalid_char(src, nf);
if(*beg=='\0')
return src;
buffer.assign(src, beg);
while(*beg!='\0') {
const char* end = next_valid_starter(beg, nf);
buffer2.clear();
decompose_one(beg, end, nf_decomp, buffer2);
end = compose_one(buffer2.c_str(), end, buffer);
beg = next_invalid_char(end, nf);
buffer.append(end, beg);
}
return buffer.c_str();
}
const char* compose_one(const char* starter, const char* rest_starter, std::string& buf) {
Trie::CharStreamForComposition in(starter, rest_starter, canonical_classes, buffer3);
while(in.within_first())
nf_c.compose(in, buf);
return in.cur();
}
void canonical_combining_class_ordering(char* beg, const char* end) {
canonical_classes.assign(end-beg+1, 0); // +1 is for sentinel value
ccc.sort(beg, canonical_classes);
}
const char* next_invalid_char(const char* src, const Trie::NormalizationForm& nf) const {
int last_canonical_class = 0;
const char* cur = Util::nearest_utf8_char_start_point(src);
const char* starter = cur;
for(; *cur != '\0'; cur = Util::nearest_utf8_char_start_point(cur+1)) {
int canonical_class = ccc.get_class(cur);
if(last_canonical_class > canonical_class && canonical_class != 0)
return starter;
if(nf.quick_check(cur)==false)
return starter;
if(canonical_class==0)
starter=cur;
last_canonical_class = canonical_class;
}
return cur;
}
const char* next_valid_starter(const char* src, const Trie::NormalizationForm& nf) const {
const char* cur = Util::nearest_utf8_char_start_point(src+1);
while(ccc.get_class(cur)!=0 || nf.quick_check(cur)==false)
cur = Util::nearest_utf8_char_start_point(cur+1);
return cur;
}
private:
const Trie::NormalizationForm nf_d;
const Trie::NormalizationForm nf_kd;
const Trie::NormalizationForm nf_c;
const Trie::NormalizationForm nf_c_qc;
const Trie::NormalizationForm nf_kc_qc;
const Trie::CanonicalCombiningClass ccc;
std::string buffer;
std::string buffer2;
std::string buffer3;
std::vector<unsigned char> canonical_classes;
};
}
#endif
|