File: stemmer.c

package info (click to toggle)
yaz 5.27.1-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 16,184 kB
  • sloc: xml: 123,414; ansic: 72,530; sh: 5,007; tcl: 2,169; makefile: 1,321; yacc: 382
file content (133 lines) | stat: -rw-r--r-- 4,000 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* This file is part of the YAZ toolkit.
 * Copyright (C) Index Data
 * See the file LICENSE for details.
 */

/**
 * \file stemmer.c
 * \brief Implements stemmer wrapper
 */

#if HAVE_CONFIG_H
#include <config.h>
#endif

#if YAZ_HAVE_ICU

#include <yaz/stemmer.h>

#include <yaz/xmalloc.h>

#include <libstemmer.h>

#include <unicode/ustring.h>  /* some more string fcns*/
#include <unicode/uchar.h>    /* char names           */

/** Identifies which stemming backend a yaz_stemmer_t instance uses. */
enum stemmer_implementation {
    yaz_no_operation = 0,   /* no stemming backend */
    yaz_snowball = 1        /* backed by a Snowball (libstemmer) stemmer */
};
/** Stemmer handle: the selected backend plus the arguments it was created from. */
struct yaz_stemmer_t {
    /* Backend selector; holds one of the stemmer_implementation values. */
    int implementation;

    /* Copies of the creation arguments, kept so the stemmer can be cloned. */
    char *locale;
    char *rule;

    /* Snowball stemmer handle, used by the yaz_snowball backend. */
    struct sb_stemmer *sb_stemmer;
};

/**
 * \brief Selects the character encoding name given to the Snowball stemmer.
 * \param charenc ignored at present
 * \param rule    ignored at present
 * \return always the constant string "UTF_8"
 */
const char *yaz_stemmer_lookup_charenc(const char *charenc, const char *rule)
{
    const char *encoding_name = "UTF_8";

    /* Neither argument influences the result yet. */
    (void) charenc;
    (void) rule;
    return encoding_name;
}

/**
 * \brief Selects the Snowball algorithm name for a locale.
 * \param locale locale name; returned unchanged as the algorithm name
 * \param rule   ignored at present
 * \return the \a locale pointer itself (no copy is made)
 */
const char *yaz_stemmer_lookup_algorithm(const char *locale, const char *rule)
{
    const char *algorithm_name = locale;

    (void) rule;
    return algorithm_name;
}

/**
 * \brief Creates a stemmer backed by Snowball (libstemmer).
 *
 * The algorithm and character encoding names are obtained from
 * yaz_stemmer_lookup_algorithm() and yaz_stemmer_lookup_charenc() and
 * handed to sb_stemmer_new().
 *
 * \param locale locale used to pick the Snowball algorithm; a copy is stored
 *               in the returned stemmer
 * \param rule   stemming rule; a copy is stored in the returned stemmer
 * \param status set to U_ILLEGAL_ARGUMENT_ERROR when sb_stemmer_new() fails;
 *               left untouched on success
 * \return the new stemmer (release it with yaz_stemmer_destroy()), or 0 when
 *         the Snowball stemmer could not be created
 */
yaz_stemmer_p yaz_stemmer_snowball_create(const char *locale, const char *rule, UErrorCode *status)
{
    const char *charenc = yaz_stemmer_lookup_charenc(locale, rule);
    const char *algorithm = yaz_stemmer_lookup_algorithm(locale, rule);
    struct sb_stemmer *sb = sb_stemmer_new(algorithm, charenc);
    yaz_stemmer_p yaz_stemmer;

    if (sb == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        yaz_log(YLOG_FATAL, "yaz_stemmer: Failed to create snowball stemmer from locale %s rule %s. Snowball: charenc %s algorithm %s ",
                locale, rule, charenc, algorithm);
        return 0;
    }

    yaz_stemmer = xmalloc(sizeof(*yaz_stemmer));
    yaz_stemmer->implementation = yaz_snowball;
    /* Keep private copies of the arguments so the stemmer can be cloned. */
    yaz_stemmer->locale = xstrdup(locale);
    yaz_stemmer->rule = xstrdup(rule);
    yaz_stemmer->sb_stemmer = sb;

    /* Logged once, after the wrapper is fully initialised. */
    yaz_log(YLOG_DEBUG, "created snowball stemmer: algorithm %s charenc %s ", algorithm, charenc);
    return yaz_stemmer;
}

/**
 * \brief Creates a stemmer for the given locale and rule.
 *
 * Resets \a *status to U_ZERO_ERROR, logs the request and delegates to
 * yaz_stemmer_snowball_create(), the only backend this function uses.
 *
 * \param locale forwarded to the Snowball factory
 * \param rule   forwarded to the Snowball factory
 * \param status reset on entry; afterwards as left by the factory
 * \return whatever yaz_stemmer_snowball_create() returns
 */
yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status)
{
    yaz_stemmer_p created;

    *status = U_ZERO_ERROR;
    yaz_log(YLOG_DEBUG, "create stemmer: locale %s rule %s ", locale, rule);

    /* Dispatch logic belongs here once more algorithms are implemented. */
    created = yaz_stemmer_snowball_create(locale, rule, status);
    return created;
}

/**
 * \brief Creates a new stemmer from the locale and rule stored in \a stemmer.
 * \param stemmer stemmer to clone; may be 0
 * \return 0 when \a stemmer is 0, otherwise the result of yaz_stemmer_create()
 *
 * The ICU status produced while creating the clone is not reported to the
 * caller.
 */
yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer)
{
    UErrorCode discarded_status = U_ZERO_ERROR;

    if (!stemmer) {
        return 0;
    }
    return yaz_stemmer_create(stemmer->locale, stemmer->rule, &discarded_status);
}

/**
 * \brief Stems the text in \a src and stores the result in \a dst.
 *
 * For the Snowball backend the input is converted to UTF-8, passed to
 * sb_stemmer_stem() and the result converted back into \a dst. When
 * sb_stemmer_stem() returns 0, \a src is copied to \a dst unchanged. For any
 * other backend, and for a 0 stemmer, \a src is copied to \a dst unchanged.
 *
 * \param stemmer stemmer to use; 0 is treated as "no operation"
 * \param dst     receives the stemmed text
 * \param src     text to stem
 * \param status  ICU status, updated by the conversion helpers. If the
 *                UTF-16 to UTF-8 conversion leaves a failure code here,
 *                \a dst is not written by this function.
 */
void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16* src, UErrorCode *status)
{
    /* yaz_stemmer_clone() and yaz_stemmer_destroy() accept a 0 stemmer;
       do the same here instead of dereferencing it. */
    if (stemmer == 0) {
        icu_buf_utf16_copy(dst, src);
        return;
    }

    switch (stemmer->implementation) {
    case yaz_snowball: {
        struct icu_buf_utf8 *utf8_buf = icu_buf_utf8_create(0);

        icu_utf16_to_utf8(utf8_buf, src, status);
        /* U_SUCCESS() accepts ICU warning codes as well as U_ZERO_ERROR.
           Comparing against U_ZERO_ERROR alone would skip stemming on a
           mere warning and leave dst unwritten. */
        if (U_SUCCESS(*status)) {
            const sb_symbol *word = (const sb_symbol *) icu_buf_utf8_to_cstr(utf8_buf);
            const sb_symbol *stemmed = sb_stemmer_stem(stemmer->sb_stemmer, word, utf8_buf->utf8_len);

            if (stemmed == 0) {
                /* No stem produced: hand the input back unchanged. */
                icu_buf_utf16_copy(dst, src);
            }
            else {
                icu_utf16_from_utf8_cstr(dst, (const char *) stemmed, status);
            }
        }
        icu_buf_utf8_destroy(utf8_buf);
        return;
    }
    case yaz_no_operation:
        yaz_log(YLOG_DEBUG, "Stemmer (No operation) called");
        /* fallthrough */
    default:
        /* Default: return the same as given. */
        icu_buf_utf16_copy(dst, src);
        break;
    }
}

/**
 * \brief Releases a stemmer and everything it holds.
 * \param stemmer stemmer to destroy; 0 is accepted and ignored
 *
 * Deletes the Snowball stemmer when the Snowball backend is in use, then
 * frees the stored locale and rule strings and the stemmer itself.
 */
void yaz_stemmer_destroy(yaz_stemmer_p stemmer)
{
    /* Nothing to release when no stemmer was given. */
    if (!stemmer) {
        return;
    }

    if (stemmer->implementation == yaz_snowball) {
        sb_stemmer_delete(stemmer->sb_stemmer);
    }

    xfree(stemmer->locale);
    xfree(stemmer->rule);
    xfree(stemmer);
}

#endif /* YAZ_HAVE_ICU */