1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
|
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
*
* Copyright (C) 2016 Richard Hughes <richard@hughsie.com>
*
* SPDX-License-Identifier: LGPL-2.1+
*/
#include "config.h"
#include <glib/gi18n.h>
#ifdef HAVE_LIBSTEMMER
#include "libstemmer.h"
#endif
#include "as-stemmer.h"
#include "as-ref-string.h"
struct _AsStemmer
{
GObject parent_instance;
gboolean enabled;
GHashTable *hash;
struct sb_stemmer *ctx;
GMutex ctx_mutex;
};
G_DEFINE_TYPE (AsStemmer, as_stemmer, G_TYPE_OBJECT)
/**
* as_stemmer_process:
* @stemmer: A #AsStemmer
* @value: The input string
*
* Stems a string using the Porter algorithm.
*
* Since: 0.2.2
*
* Returns: A new refcounted string
**/
AsRefString *
as_stemmer_process (AsStemmer *stemmer, const gchar *value)
{
#ifdef HAVE_LIBSTEMMER
AsRefString *new;
const gchar *tmp;
gsize value_len;
g_autofree gchar *value_casefold = NULL;
g_autoptr(GMutexLocker) locker = g_mutex_locker_new (&stemmer->ctx_mutex);
/* look for word in the cache */
new = g_hash_table_lookup (stemmer->hash, value);
if (new != NULL)
return as_ref_string_ref (new);
/* not enabled */
value_casefold = g_utf8_casefold (value, -1);
if (stemmer->ctx == NULL || !stemmer->enabled)
return as_ref_string_new (value_casefold);
/* stem, then add to the cache */
value_len = strlen (value_casefold);
tmp = (const gchar *) sb_stemmer_stem (stemmer->ctx,
(guchar *) value_casefold,
(gint) value_len);
if (value_len == (gsize) sb_stemmer_length (stemmer->ctx)) {
new = as_ref_string_new_with_length (value_casefold, value_len);
} else {
new = as_ref_string_new (tmp);
}
g_hash_table_insert (stemmer->hash,
as_ref_string_new (value_casefold),
as_ref_string_ref (new));
return new;
#else
g_autofree gchar *value_casefold = NULL;
value_casefold = g_utf8_casefold (value, -1);
return as_ref_string_new (value_casefold);
#endif
}
static void
as_stemmer_finalize (GObject *object)
{
AsStemmer *stemmer = AS_STEMMER (object);
#ifdef HAVE_LIBSTEMMER
sb_stemmer_delete (stemmer->ctx);
g_mutex_clear (&stemmer->ctx_mutex);
#endif
g_hash_table_unref (stemmer->hash);
G_OBJECT_CLASS (as_stemmer_parent_class)->finalize (object);
}
static void
as_stemmer_class_init (AsStemmerClass *klass)
{
GObjectClass *object_class = G_OBJECT_CLASS (klass);
object_class->finalize = as_stemmer_finalize;
}
static void
as_stemmer_init (AsStemmer *stemmer)
{
/* FIXME: use as_utils_locale_to_language()? */
#ifdef HAVE_LIBSTEMMER
stemmer->ctx = sb_stemmer_new ("en", NULL);
g_mutex_init (&stemmer->ctx_mutex);
#endif
stemmer->enabled = g_getenv ("APPSTREAM_GLIB_DISABLE_STEMMER") == NULL;
stemmer->hash = g_hash_table_new_full (g_str_hash, g_str_equal,
(GDestroyNotify) as_ref_string_unref,
(GDestroyNotify) as_ref_string_unref);
}
/**
* as_stemmer_new:
*
* Creates a new #AsStemmer.
*
* Returns: (transfer full): a #AsStemmer
*
* Since: 0.2.2
**/
AsStemmer *
as_stemmer_new (void)
{
AsStemmer *stemmer = g_object_new (AS_TYPE_STEMMER, NULL);
return AS_STEMMER (stemmer);
}
|