File: stemdb.cpp

package info (click to toggle)
recoll 1.43.13-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 16,956 kB
  • sloc: cpp: 104,864; python: 9,923; xml: 7,324; ansic: 6,447; sh: 1,252; perl: 166; makefile: 73
file content (87 lines) | stat: -rw-r--r-- 2,778 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/* Copyright (C) 2005-2022 J.F.Dockes 
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

/**
 * Management of the auxiliary databases listing stems and their expansion terms
 */

#include "autoconfig.h"

#include "safeunistd.h"

#include <algorithm>
#include <map>
#include <iostream>
#include <string>
using namespace std;

#include <xapian.h>

#include "stemdb.h"
#include "log.h"
#include "smallut.h"
#include "synfamily.h"
#include "unacpp.h"
#include "rclconfig.h"

namespace Rcl {

/**
 * Expand for one or several languages
 */
bool StemDb::stemExpand(const std::string& langs, const std::string& _term, vector<string>& result)
{
    vector<string> llangs;
    stringToStrings(langs, llangs);
    
    // The stemdb keys may have kept their diacritics or not but they are always lower-case. It
    // would be more logical for the term transformers to perform before doing the stemming, but
    // this would be inefficient when there are several stemming languages
    string term;
    unacmaybefold(_term, term, UNACOP_FOLD);

    for (const auto& lang : llangs) {
        SynTermTransStem stemmer(lang);
        XapComputableSynFamMember expander(getdb(), synFamStem, lang, &stemmer);
        (void)expander.synExpand(term, result);
    }

    if (!o_index_stripchars) {
        string unac;
        unacmaybefold(term, unac, UNACOP_UNAC);
        // Expand the unaccented stem, using the unaccented stem db. Because it's a different db, We
        // need to do it even if the input has no accent (unac == term)
        for (const auto& lang : llangs) {
            SynTermTransStem stemmer(lang);
            XapComputableSynFamMember expander(getdb(), synFamStemUnac, lang, &stemmer);
            (void)expander.synExpand(unac, result);
        }
    }

    if (result.empty())
        result.push_back(term);

    sort(result.begin(), result.end());
    auto uit = unique(result.begin(), result.end());
    result.resize(uit - result.begin());
    LOGDEB1("stemExpand:" << langs << ": " << term << " ->  " << stringsToString(result) << "\n");
    return true;
}


}