File: cjk-tokenizer.cc

package info (click to toggle)
xapian-core 1.4.3-2%2Bdeb9u3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 21,412 kB
  • sloc: cpp: 113,868; ansic: 8,723; sh: 4,433; perl: 836; makefile: 566; tcl: 317; python: 40
file content (126 lines) | stat: -rw-r--r-- 3,750 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/** @file cjk-tokenizer.cc
 * @brief Tokenise CJK text as n-grams
 */
/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
 * Copyright (c) 2011 Olly Betts
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <config.h>

#include "cjk-tokenizer.h"

#include "omassert.h"
#include "xapian/unicode.h"

#include <cstdlib>
#include <string>

using namespace std;

/** Maximum number of characters combined into a single n-gram token.
 *
 *  CJKTokenIterator::operator++() keeps extending the current token while its
 *  length is below this value.  Nothing in this file modifies it, so it is
 *  declared const to make that explicit.
 */
static const unsigned NGRAM_SIZE = 2;

/** Check the environment for the CJK n-gram switch.
 *
 *  @return true if XAPIAN_CJK_NGRAM is set to a non-empty value.
 */
static bool
cjk_ngram_env_is_set()
{
    const char * value = getenv("XAPIAN_CJK_NGRAM");
    return value != NULL && *value != '\0';
}

/** Report whether CJK n-gram tokenisation has been requested.
 *
 *  The environment is consulted only the first time this is called; the
 *  answer is kept in a function-local static and returned on later calls.
 */
bool
CJK::is_cjk_enabled()
{
    static bool enabled = cjk_ngram_env_is_set();
    return enabled;
}

// Unicode blocks treated as CJK:
//
// 2E80..2EFF; CJK Radicals Supplement
// 3000..303F; CJK Symbols and Punctuation
// 3040..309F; Hiragana
// 30A0..30FF; Katakana
// 3100..312F; Bopomofo
// 3130..318F; Hangul Compatibility Jamo
// 3190..319F; Kanbun
// 31A0..31BF; Bopomofo Extended
// 31C0..31EF; CJK Strokes
// 31F0..31FF; Katakana Phonetic Extensions
// 3200..32FF; Enclosed CJK Letters and Months
// 3300..33FF; CJK Compatibility
// 3400..4DBF; CJK Unified Ideographs Extension A
// 4DC0..4DFF; Yijing Hexagram Symbols
// 4E00..9FFF; CJK Unified Ideographs
// A700..A71F; Modifier Tone Letters
// AC00..D7AF; Hangul Syllables
// F900..FAFF; CJK Compatibility Ideographs
// FE30..FE4F; CJK Compatibility Forms
// FF00..FFEF; Halfwidth and Fullwidth Forms
// 20000..2A6DF; CJK Unified Ideographs Extension B
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement

/** Test whether a Unicode codepoint lies in one of the blocks listed above.
 *
 *  @param p	The codepoint to classify.
 *
 *  @return true if @a p falls inside any of the CJK ranges.
 */
bool
CJK::codepoint_is_cjk(unsigned p)
{
    // Inclusive { first, last } pairs in ascending order.  The blocks from
    // 3000 to 9FFF are contiguous, so they are merged into a single entry.
    static const unsigned cjk_ranges[][2] = {
	{ 0x2E80, 0x2EFF },
	{ 0x3000, 0x9FFF },
	{ 0xA700, 0xA71F },
	{ 0xAC00, 0xD7AF },
	{ 0xF900, 0xFAFF },
	{ 0xFE30, 0xFE4F },
	{ 0xFF00, 0xFFEF },
	{ 0x20000, 0x2A6DF },
	{ 0x2F800, 0x2FA1F }
    };
    const unsigned n_ranges = sizeof(cjk_ranges) / sizeof(cjk_ranges[0]);

    for (unsigned i = 0; i != n_ranges; ++i) {
	// The table is sorted, so once p is below the start of a range it
	// cannot be in that range or any later one.  On the first entry this
	// also rejects everything below 0x2E80 straight away.
	if (p < cjk_ranges[i][0]) return false;
	if (p <= cjk_ranges[i][1]) return true;
    }
    return false;
}

/** Collect a run of CJK word characters from a UTF-8 iterator.
 *
 *  Characters are consumed from @a it for as long as they are both CJK
 *  codepoints (per codepoint_is_cjk()) and word characters (per
 *  Xapian::Unicode::is_wordchar()).
 *
 *  @param it	Iterator to read from.  On return it is positioned at the
 *		first character which was not consumed, or at the end.
 *
 *  @return The consumed characters, encoded as UTF-8 (possibly empty).
 */
string
CJK::get_cjk(Xapian::Utf8Iterator &it)
{
    const Xapian::Utf8Iterator end;
    string cjk_run;
    for ( ; it != end; ++it) {
	const unsigned ch = *it;
	if (!codepoint_is_cjk(ch) || !Xapian::Unicode::is_wordchar(ch)) {
	    // Leave the iterator on the character which ended the run.
	    break;
	}
	Xapian::Unicode::append_utf8(cjk_run, ch);
    }
    return cjk_run;
}

/** Return the current token, building it first if necessary.
 *
 *  An empty current_token means no token has been started at the current
 *  position yet.  In that case a one-character token is created from the
 *  character at `it`, `p` is left pointing just past that character, and
 *  `len` is set to 1.
 *
 *  This method is const but updates current_token, p and len, so those
 *  members are presumably declared mutable in the class definition.
 *
 *  @return Reference to the current token held by this iterator.
 */
const string &
CJKTokenIterator::operator*() const
{
    // Token already built: nothing to do.
    if (!current_token.empty()) return current_token;

    Assert(it != Xapian::Utf8Iterator());
    len = 1;
    p = it;
    Xapian::Unicode::append_utf8(current_token, *p);
    // p now marks the next character available for extending the token.
    ++p;
    return current_token;
}

/** Advance to the next token.
 *
 *  If the current token is shorter than NGRAM_SIZE and there is another
 *  character available at `p`, the token is extended by that character.
 *  Otherwise the start position `it` moves forward one character and the
 *  current token is emptied, so the next dereference starts a new token.
 *
 *  @return Reference to this iterator.
 */
CJKTokenIterator &
CJKTokenIterator::operator++()
{
    const bool can_extend = len < NGRAM_SIZE && p != Xapian::Utf8Iterator();
    if (!can_extend) {
	// Finished with tokens starting at `it`; move on to the next start.
	Assert(it != Xapian::Utf8Iterator());
	++it;
	current_token.clear();
	return *this;
    }

    // Grow the current token by one character.
    Xapian::Unicode::append_utf8(current_token, *p);
    ++p;
    ++len;
    return *this;
}