1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
|
/*
* Jeffrey Friedl
* Omron Corporation ʳ
* Nagaokakyoshi, Japan 617Ĺ
*
* jfriedl@nff.ncl.omron.co.jp
*
* This work is placed under the terms of the GNU General Public License
* (the "GNU Copyleft").
*
* Oct 1993
*
* Feb 1995: Added fuzziness.
* See "fuzzkana.h" for overall comments.
*/
#include "config.h"
#include "assert.h"
#include "output.h"
#include "fuzzkana.h"
#include "kanaid.h"
/*
* Given
*
* IN -- some regex pattern string whose kana is to be fuzzified.
* OUT -- where to stick the new pattern string
* OUT_SIZE -- size of area pointed to by OUT.
*
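* FLAGS --
* If FUZZ_VOICED is set, kana whose id carries KID_DUAL (kana that
* share a sound with another kana) are written as a bracket
* expression rather than as the literal character.
*
* If FUZZ_LONG_VOWELS is set, the pattern will be written such
* that longness of vowels doesn't matter.
*
* If FUZZ_SMALL_TSU is set, the pattern will be written such
* that small TSUs won't matter.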
*
* The number of bytes written to the output, including the terminating
* NUL, is returned, or zero if the output buffer was overflowed or the
* input is null or empty.
*
* If OUT is zero, nothing is written (obviously)... the number of bytes
* that would be required is returned (OUT_SIZE is ignored in this case)
*
* An example with both flags set
* in ֤Ȥ
* out ֤[]*?[]*
*
*/
/* Inclusive range of high bytes treated as kanji (0xB0 == 0260, 0xF4 == 0364). */
#define LEAST_KANJI_HI_BYTE    0xB0
#define GREATEST_KANJI_HI_BYTE 0xF4
/* Yields 0x80 when the byte has its top bit set (i.e. is not 7-bit ASCII), else 0. */
#define IS_EUC(byte) (0x80 & (byte))
/*
 * fuzzkana() -- copy the regex pattern IN to OUT, replacing kana with
 * "fuzzy" sub-patterns as selected by FLAGS (FUZZ_VOICED,
 * FUZZ_LONG_VOWELS, FUZZ_SMALL_TSU).
 *
 * Returns the number of bytes produced, counting the terminating NUL, or
 * zero if IN is null/empty or OUT (of OUT_SIZE bytes) is too small.  If
 * OUT is null nothing is stored and OUT_SIZE is ignored; the return value
 * is then the number of bytes that would be needed.
 *
 * A two-byte character cut short by the end of IN is passed through as
 * its lone lead byte and ends the scan.
 *
 * NOTE(review): the kana inside the bracket string literals below appear
 * to have been lost or mangled in this copy of the file (several read as
 * an empty "[]").  They are reproduced as found; restore them from a
 * pristine copy of the source before relying on the output.
 */
unsigned fuzzkana(const unsigned char *in,
                  unsigned char *out,
                  unsigned out_size,
                  unsigned flags)
{
    /*
     * Bytes emitted (or merely counted) so far.  An index is used rather
     * than advancing OUT so that no arithmetic is ever done on a null
     * pointer when the caller only wants the required size.
     */
    unsigned written = 0;
    unsigned char hi, lo;
    int just_want_size = (out == 0);
    int do_voiced = (flags & FUZZ_VOICED);
    int do_vowels = (flags & FUZZ_LONG_VOWELS);
    int do_tsu = (flags & FUZZ_SMALL_TSU);
#ifdef FUZZ_REPEATER
    /* only consumed by the disabled FUZZ_REPEATER_no_no_no code below */
    int do_repeat = (flags & FUZZ_REPEATER);
#endif

    if (in == 0 || *in == 0)
        return 0;

    /*
     * Shove the given character to the 'out' buffer,
     * aborting the function (returning 0) if the buffer would overflow.
     * However, if just requesting how much buffer is required
     * (by supplying no out buffer), just bump up the counter.
     * The argument is evaluated exactly once in either mode.
     */
    #define SENDOUT(c)                                                   \
    macro_start {                                                        \
        unsigned char value = (c);                                       \
        if (!just_want_size)                                             \
        {                                                                \
            if (written >= out_size)                                     \
                return 0;                                                \
            out[written] = value;                                        \
        }                                                                \
        written++;                                                       \
    } macro_end

    /* to tack on the given NUL-terminated string to the output */
    #define add(STR)                                                     \
    macro_start {                                                        \
        const unsigned char *str = (const unsigned char *)(STR);         \
        while (*str)                                                     \
            SENDOUT(*str++);                                             \
    } macro_end

    /* run through the line */
    while (hi = *in++, hi != 0)
    {
        unsigned next_id;
        unsigned vsound;
        unsigned id;

        /* Just pass through ASCII characters */
        if (!IS_EUC(hi))
        {
            SENDOUT(hi);
            continue;
        }

        /*
         * A lead byte directly followed by the terminator is a truncated
         * character.  Pass the lone byte through and stop here; consuming
         * the NUL as the second byte would leave IN pointing past the end
         * of the string and the loop would read out of bounds.
         */
        if (*in == 0)
        {
            SENDOUT(hi);
            break;
        }

        lo = *in++; /* get the next byte of the EUC char. */

        /* if this char is not kana, just pass it through and continue */
        if (!IS_KANA(hi,lo))
        {
            SENDOUT(hi);
            SENDOUT(lo);
#ifdef FUZZ_REPEATER_no_no_no /* now done in jregex.c */
            #define REPEATER_HI_BYTE 0241 /* high byte of the repeater character */
            #define REPEATER_LO_BYTE 0271 /* low byte of the repeater character */
            if (do_repeat &&
                hi >= LEAST_KANJI_HI_BYTE && hi <= GREATEST_KANJI_HI_BYTE &&
                ((in[0] == hi && in[1] == lo) ||
                 (in[0] == REPEATER_HI_BYTE && in[1] == REPEATER_LO_BYTE)))
            {
                SENDOUT('[');
                SENDOUT(REPEATER_HI_BYTE);
                SENDOUT(REPEATER_LO_BYTE);
                SENDOUT(hi);
                SENDOUT(lo);
                SENDOUT(']');
                in += 2;
            }
#endif /* FUZZ_REPEATER_no_no_no */
            continue;
        }

        id = KANA_ID(hi,lo); /* get the id flags for this kana */

        /* if we're doing voiced fuzz, fuzz dual-character'd sounds */
        if (do_voiced && (id & KID_DUAL))
        {
            switch(id)
            {
              default:
                die("oops, %02x %02x -> id is %x\n", hi, lo, id);
                break;

              case KID_E | KID_VOWEL | KID_DUAL: /* E vowel */
              case KID_ARCHAIC | KID_DUAL:       /* archaic kana paired with it */
                add(IS_HIRAGANA(hi,lo) ? "[]" : "[]");
                break;

              case KID_O | KID_VOWEL | KID_DUAL: /* O vowel */
              case KID_o | KID_DUAL:             /* presumably WO -- confirm in kanaid.h */
                add(IS_HIRAGANA(hi,lo) ? "[]" : "[]");
                break;

              case KID_I | KID_Z | KID_DUAL:     /* Z-row I sound */
              case KID_I | KID_D | KID_DUAL:     /* D-row I sound */
                add(IS_HIRAGANA(hi,lo) ? "[]" : "[]");
                break;

              case KID_U | KID_D | KID_DUAL:     /* D-row U sound */
              case KID_U | KID_Z | KID_DUAL:     /* Z-row U sound */
                add(IS_HIRAGANA(hi,lo) ? "[Ť]" : "[ť]");
                break;
            }
        } else {
            /* otherwise, just pass through */
            SENDOUT(hi);
            SENDOUT(lo);
        }

        vsound = id & KID_VSOUND;

        /*
         * Set next_id to the id flags of the character at PTR: zero for
         * anything that is not kana, and "a vowel with the current vowel
         * sound" for a dash.  Only PTR[0] and PTR[1] are examined.
         */
        #define GET_NEXT_ID(ptr)                                         \
        macro_start {                                                    \
            if (!IS_EUC((ptr)[0]))                                       \
                next_id = 0;                                             \
            else if (IS_DASH((ptr)[0], (ptr)[1]))                        \
                next_id = vsound | KID_VOWEL;                            \
            else if (!IS_KANA((ptr)[0], (ptr)[1]))                       \
                next_id = 0;                                             \
            else                                                         \
                next_id = KANA_ID((ptr)[0], (ptr)[1]);                   \
        } macro_end

        GET_NEXT_ID(in);

#if 0
        /* consider the O and U sound to be the same */
        if (vsound & (KID_O|KID_U))
            vsound = KID_O|KID_U;
#endif

        if (do_vowels)
        {
            /*
             * If current character has a vowel sound and is not followed
             * by a small y-consonant sound, allow to be doubled.
             */
            if (vsound && (next_id & (KID_SMALL|KID_Y)) != (KID_SMALL|KID_Y))
            {
                /*
                 * Unless they have case folding off, it won't matter which
                 * of these two we use, but since it would matter if they
                 * had it off, we'll separate them....
                 */
                if (IS_HIRAGANA(hi,lo))
                    switch (vsound) {
                      default: assert(0); break;
                      case KID_A: add("[]*"); break;
                      case KID_I: add("[]*"); break;
                      case KID_U: add("[]*"); break;
                      /* after an O sound a following U is also skipped below */
                      case KID_O: add("[]*"); vsound |= KID_U; break;
                      case KID_E: add("[]*"); break;
                    }
                else
                    switch (vsound) {
                      default: assert(0); break;
                      case KID_A: add("[]*"); break;
                      case KID_I: add("[]*"); break;
                      case KID_U: add("[]*"); break;
                      /* after an O sound a following U is also skipped below */
                      case KID_O: add("[]*"); vsound |= KID_U; break;
                      case KID_E: add("[]*"); break;
                    }

                /*
                 * If the next char is the simple vowel we've just
                 * allowed to be doubled, skip it.
                 *
                 * Because this is a WHILE, a whole run of such vowels
                 * is collapsed into the single optional-vowel pattern
                 * emitted above.  Were it an IF, only one following
                 * vowel would be absorbed per kana.  Either method has
                 * its benefits.
                 */
                while ((next_id & KID_VOWEL) && (next_id & vsound))
                {
                    in += 2; /* skip the vowel we've just replaced */
                    GET_NEXT_ID(in);
                }
            }
        }

        if (do_tsu)
        {
            unsigned next_is_small_tsu = (next_id & (KID_T|KID_U|KID_SMALL)) ==
                                         (KID_T|KID_U|KID_SMALL);
            /*
             * if the next thing is a hard-coded small-TSU, or all of
             *   + currently have a vowel sound, and
             *   + the next character has an "appropriate" consonant sound,
             *   + and the next character isn't small.
             * then we'll add a possible small tsu.
             */
            #ifndef SMALL_TSU_OK
            #define SMALL_TSU_OK (KID_K|KID_S|KID_T|KID_D|KID_P|KID_W|KID_M)
            #endif
            if (next_is_small_tsu ||
                (vsound && (next_id & SMALL_TSU_OK) && !(next_id & KID_SMALL)))
            {
                /* add a possibility for a small TSU */
                add("?");
                if (next_is_small_tsu)
                    in += 2; /* skip small tsu that's there */
            }
        }
    }

    SENDOUT(0); /* tack on a null */
    return written;
}
|