File: stemdb.cpp

package info (click to toggle)

recoll 1.43.13-1

links: PTS, VCS
area: main
in suites: forky, sid
size: 16,956 kB
sloc: cpp: 104,864; python: 9,923; xml: 7,324; ansic: 6,447; sh: 1,252; perl: 166; makefile: 73

file content (87 lines) | stat: -rw-r--r-- 2,778 bytes

parent folder | download | duplicates (3)

/* Copyright (C) 2005-2022 J.F.Dockes 
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

/**
 * Management of the auxiliary databases listing stems and their expansion terms
 */

#include "autoconfig.h"

#include "safeunistd.h"

#include <algorithm>
#include <map>
#include <iostream>
#include <string>
using namespace std;

#include <xapian.h>

#include "stemdb.h"
#include "log.h"
#include "smallut.h"
#include "synfamily.h"
#include "unacpp.h"
#include "rclconfig.h"

namespace Rcl {

/**
 * Expand for one or several languages
 */
bool StemDb::stemExpand(const std::string& langs, const std::string& _term, vector<string>& result)
{
    vector<string> llangs;
    stringToStrings(langs, llangs);
    
    // The stemdb keys may have kept their diacritics or not but they are always lower-case. It
    // would be more logical for the term transformers to perform before doing the stemming, but
    // this would be inefficient when there are several stemming languages
    string term;
    unacmaybefold(_term, term, UNACOP_FOLD);

    for (const auto& lang : llangs) {
        SynTermTransStem stemmer(lang);
        XapComputableSynFamMember expander(getdb(), synFamStem, lang, &stemmer);
        (void)expander.synExpand(term, result);
    }

    if (!o_index_stripchars) {
        string unac;
        unacmaybefold(term, unac, UNACOP_UNAC);
        // Expand the unaccented stem, using the unaccented stem db. Because it's a different db, We
        // need to do it even if the input has no accent (unac == term)
        for (const auto& lang : llangs) {
            SynTermTransStem stemmer(lang);
            XapComputableSynFamMember expander(getdb(), synFamStemUnac, lang, &stemmer);
            (void)expander.synExpand(unac, result);
        }
    }

    if (result.empty())
        result.push_back(term);

    sort(result.begin(), result.end());
    auto uit = unique(result.begin(), result.end());
    result.resize(uit - result.begin());
    LOGDEB1("stemExpand:" << langs << ": " << term << " ->  " << stringsToString(result) << "\n");
    return true;
}


}