File: fuzzkana.c

package info (click to toggle)
lookup 1.08b-9
links: PTS
area: main
in suites: sarge
size: 1,112 kB
ctags: 1,308
sloc: ansic: 12,637; makefile: 245; perl: 174; sh: 53
file content (280 lines) | stat: -rw-r--r-- 8,939 bytes
parent folder | download | duplicates (9)
/*
 * Jeffrey Friedl
 * Omron Corporation			ʳ
 * Nagaokakyoshi, Japan			617Ĺ
 *
 * jfriedl@nff.ncl.omron.co.jp
 *
 * This work is placed under the terms of the GNU General Purpose License
 * (the "GNU Copyleft").
 *
 * Oct 1993
 *
 * Feb 1995: Added  fuzziness.
 * See "fuzzkana.h" for overall comments.
 */

#include "config.h"
#include "assert.h"
#include "output.h"
#include "fuzzkana.h"
#include "kanaid.h"

/*
 * Given
 *
 *   IN       -- some regex pattern string whose kana is to be fuzzified.
 *   OUT      -- where to stick the new pattern string
 *   OUT_SIZE -- size of area pointed to by OUT.
 *
 *   FLAGS    --
 *       If FUZZ_LONG_VOWELS is set, the pattern will be written such
 *       that longness of vowels doesn't matter.
 *
 *       If FUZZ_SMALL_TSU is set, the pattern will be written such
 *       that small TSUs won't matter.
 *
 * The number of bytes written to the output is returned, or zero if the
 * output buffer was overflowed (or, I suppose, if the input is empty)
 *
 * If OUT is zero, nothing is written (obviously)... the number of bytes
 * that would be required is returned (OUT_SIZE is ignored in this case)
 *
 * An example with both flags set
 *   in  ֤Ȥ
 *   out ֤[]*?[]*
 *
 */
/*
 * Bounds used for the high byte of a two-byte character that is to be
 * treated as kanji (only referenced by the disabled repeater code below).
 */
#define LEAST_KANJI_HI_BYTE     0xB0
#define GREATEST_KANJI_HI_BYTE  0xF4

/*
 * Yields 0x80 when the top bit of the byte is set (i.e. the byte is not
 * plain ASCII and starts a two-byte character), zero otherwise.
 */
#define IS_EUC(HighByte)    (0x80 & (HighByte))

/*
 * fuzzkana() -- implementation notes
 *
 *   - Besides FUZZ_LONG_VOWELS and FUZZ_SMALL_TSU, FLAGS may also carry
 *     FUZZ_VOICED: a kana whose id has KID_DUAL set is then replaced by a
 *     bracketed character class instead of being copied through.
 *
 *   - The returned byte count includes the terminating NUL appended to the
 *     output.  Zero is returned when IN is null or empty, or when OUT is
 *     too small to hold the result.
 *
 *   - When OUT is null nothing is stored and the bytes are only counted.
 *     The count is kept in an integer, so no arithmetic is ever performed
 *     on a null pointer.
 *
 *   - A two-byte character cut short by the end of IN is never read past:
 *     its lone high byte is copied through and processing stops.
 *
 * NOTE(review): several string literals and comments in this copy of the
 * file look as if their EUC-JP kana bytes were lost (e.g. "[]" and "[]*").
 * They are reproduced here exactly as found; restore them from a pristine
 * copy of the source before relying on the generated patterns.
 */
unsigned fuzzkana(const unsigned char *in,
                  unsigned char *out,
                  unsigned out_size,
                  unsigned flags)
{
    int just_want_size = (out == 0);
    /* one past the last writable byte; unused when only counting */
    const unsigned char *out_end = just_want_size ? 0 : out + out_size;
    unsigned count = 0; /* bytes emitted so far (or that would be emitted) */
    unsigned char hi, lo;
    int do_voiced = (flags & FUZZ_VOICED);
    int do_vowels = (flags & FUZZ_LONG_VOWELS);
    int do_tsu    = (flags & FUZZ_SMALL_TSU);
    #ifdef FUZZ_REPEATER
    int do_repeat = (flags & FUZZ_REPEATER);
    #endif

    /* nothing to do for a missing or empty pattern */
    if (in == 0 || *in == 0)
        return 0;

    /*
     * Shove the given character to the 'out' buffer,
     * aborting the function (returning 0) if the buffer would overflow.
     * If the caller only wants to know how much buffer is required
     * (by supplying no out buffer), nothing is stored.
     * Either way the byte is counted.
     */
    #define SENDOUT(c)                                                       \
    macro_start {                                                            \
        unsigned char value = (c);                                           \
        if (!just_want_size)                                                 \
        {                                                                    \
            if (out >= out_end)                                              \
                return 0;                                                    \
            *out++ = value;                                                  \
        }                                                                    \
        count++;                                                             \
    } macro_end

    /* to tack on the given NUL-terminated string to the output */
    #define add(STR)                                                         \
    macro_start {                                                            \
        const unsigned char *str = (const unsigned char *)(STR);             \
        while (*str)                                                         \
            SENDOUT(*str++);                                                 \
    } macro_end

    /* run through the line */
    while ((hi = *in++) != 0)
    {
        unsigned next_id;
        unsigned vsound;
        unsigned id;

        /* Just pass through ASCII characters */
        if (!IS_EUC(hi))
        {
            SENDOUT(hi);
            continue;
        }

        lo = *in++;  /* get the next byte of the EUC char. */

        /*
         * The input ended in the middle of a two-byte character.
         * 'in' now points one past the terminating NUL, so it must not be
         * dereferenced again: copy the stray byte through and stop.
         */
        if (lo == 0)
        {
            SENDOUT(hi);
            break;
        }

        /* if this char is not kana, just pass through and continue */
        if (!IS_KANA(hi,lo))
        {

            SENDOUT(hi);
            SENDOUT(lo);
#ifdef FUZZ_REPEATER_no_no_no /* now done in jregex.c */
            /* presumably the two bytes of the kanji repeater mark */
            #define REPEATER_HI_BYTE        0241
            #define REPEATER_LO_BYTE        0271
            if (do_repeat &&
                hi >= LEAST_KANJI_HI_BYTE && hi <= GREATEST_KANJI_HI_BYTE &&
                ((in[0] == hi               && in[1] == lo) ||
                 (in[0] == REPEATER_HI_BYTE && in[1] == REPEATER_LO_BYTE)))
            {
                SENDOUT('[');
                SENDOUT(REPEATER_HI_BYTE);
                SENDOUT(REPEATER_LO_BYTE);
                SENDOUT(hi);
                SENDOUT(lo);
                SENDOUT(']');
                in += 2;
            }
#endif /* FUZZ_REPEATER */
            continue;
        }

        id = KANA_ID(hi,lo); /* get the id flags for this kana */

        /* if we're doing voiced fuzz, fuzz dual-character'd sounds */
        if (do_voiced && (id & KID_DUAL))
        {
            /*
             * Each pair of cases below names two ids that are answered
             * with the same character class; the hiragana or katakana
             * form of the class is chosen to match the input character.
             */
            switch(id)
            {
              default:
                die("oops, %02x %02x -> id is %x\n", hi, lo, id);
                break;

              case KID_E | KID_VOWEL | KID_DUAL:
              case KID_ARCHAIC | KID_DUAL:
                add(IS_HIRAGANA(hi,lo) ? "[]" : "[]");
                break;
              case KID_O | KID_VOWEL | KID_DUAL:
              case KID_o | KID_DUAL:
                add(IS_HIRAGANA(hi,lo) ? "[]" : "[]");
                break;

              case KID_I | KID_Z | KID_DUAL:
              case KID_I | KID_D | KID_DUAL:
                add(IS_HIRAGANA(hi,lo) ? "[]" : "[]");
                break;

              case KID_U | KID_D | KID_DUAL:
              case KID_U | KID_Z | KID_DUAL:
                add(IS_HIRAGANA(hi,lo) ? "[Ť]" : "[ť]");
                break;
            }
        } else {
            /* otherwise, just pass through */
            SENDOUT(hi);
            SENDOUT(lo);
        }

        vsound = id & KID_VSOUND;

        /*
         * Look at the character starting at PTR (without consuming it) and
         * store its id in next_id.  ASCII, non-kana and truncated two-byte
         * characters yield 0; a dash is treated as a plain vowel carrying
         * the current vowel sound.
         */
        #define GET_NEXT_ID(ptr)                                             \
        macro_start {                                                        \
            if (!IS_EUC((ptr)[0]) || (ptr)[1] == 0)                          \
                next_id = 0;                                                 \
            else if (IS_DASH((ptr)[0], (ptr)[1]))                            \
                next_id = vsound | KID_VOWEL;                                \
            else if (!IS_KANA((ptr)[0], (ptr)[1]))                           \
                next_id = 0;                                                 \
            else                                                             \
                next_id = KANA_ID((ptr)[0], (ptr)[1]);                       \
        } macro_end

        GET_NEXT_ID(in);

#if 0
        /* consider the O and U sound to be the same */
        if (vsound & (KID_O|KID_U))
            vsound = KID_O|KID_U;
#endif


        if (do_vowels)
        {
            /*
             * If current character has a vowel sound and is not followed
             * by a small y-consonant sound, allow to be doubled.
             */
            if (vsound && (next_id & (KID_SMALL|KID_Y)) != (KID_SMALL|KID_Y))
            {
                /*
                 * Unless they have case folding off, it won't matter which
                 * of these two we use, but since it would matter if they
                 * had it off, we'll separate them....
                 */
                if (IS_HIRAGANA(hi,lo))
                    switch (vsound) {
                      default: assert(0); break;
                      case KID_A:       add("[]*");     break;
                      case KID_I:       add("[]*");     break;
                      case KID_U:       add("[]*");     break;
                      /* after an O sound a following U vowel is skipped too */
                      case KID_O: add("[]*"); vsound |= KID_U; break;
                      case KID_E:       add("[]*");     break;
                    }
                else
                    switch (vsound) {
                      default: assert(0); break;
                      case KID_A:       add("[]*");     break;
                      case KID_I:       add("[]*");     break;
                      case KID_U:       add("[]*");     break;
                      /* after an O sound a following U vowel is skipped too */
                      case KID_O: add("[]*"); vsound |= KID_U; break;
                      case KID_E:       add("[]*");     break;
                    }

                /*
                 * Skip every following simple vowel that the class just
                 * emitted already allows for.
                 *
                 * Because this is a WHILE, a whole run of such vowels in
                 * the input collapses into the single pattern above; with
                 * an IF only the first one would be folded in and the rest
                 * would be processed as characters of their own.
                 * Either method has its benefits.
                 */
                while ((next_id & KID_VOWEL) && (next_id & vsound))
                {
                    in += 2; /* skip the vowel we've just replaced */
                    GET_NEXT_ID(in);
                }
            }
        }

        if (do_tsu)
        {
            unsigned next_is_small_tsu = (next_id & (KID_T|KID_U|KID_SMALL)) ==
                                                    (KID_T|KID_U|KID_SMALL);

            /*
             * if the next thing is a hard-coded small-TSU, or all of
             *   +  currently have a vowel sound, and
             *   +  the next character has an "appropriate" consonant sound,
             *   +  and the next character isn't small.
             * then we'll add a possible small tsu.
             */

            #ifndef SMALL_TSU_OK
            #define SMALL_TSU_OK (KID_K|KID_S|KID_T|KID_D|KID_P|KID_W|KID_M)
            #endif

            if (next_is_small_tsu ||
                (vsound && (next_id & SMALL_TSU_OK) && !(next_id & KID_SMALL)))
            {
                /* add a possibility for a small TSU */
                add("?");
                if (next_is_small_tsu)
                    in += 2; /* skip small tsu that's there */
            }
        }
    }
    SENDOUT(0); /* tack on a null (it is included in the returned count) */
    return count;
}

/* the helper macros are private to fuzzkana(); don't let them leak */
#undef GET_NEXT_ID
#undef add
#undef SENDOUT