File: qresultstore.cpp

package info (click to toggle)
recoll 1.43.7-1
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 16,512 kB
  • sloc: cpp: 104,170; python: 9,500; xml: 7,248; ansic: 6,447; sh: 1,212; perl: 130; makefile: 72
file content (197 lines) | stat: -rw-r--r-- 6,494 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/* Copyright (C) 2017-2020 J.F.Dockes
 *
 * License: GPL 2.1
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "qresultstore.h"

#include <string>
#include <iostream>
#include <map>
#include <vector>

#include <string.h>

#include "rcldoc.h"
#include "rclquery.h"
#include "log.h"
#include "smallut.h"

namespace Rcl {

// Private implementation data for QResultStore: the field name to slot index map, and the
// per-document storage.
class QResultStore::Internal {
public:
    // Tell whether a document metadata entry (field name, field value) must be stored.
    // @param fieldspec set of field names to be interpreted according to isinc.
    // @param isinc if true, fieldspec lists the fields to keep, else it lists the fields to skip.
    // @param entry the (name, value) pair to test.
    // @return true if the value is not empty and the name is selected by fieldspec/isinc.
    //
    // The entry type has a const key, like the value_type of the standard string to string
    // maps, so that iterating such a map binds directly to the element instead of building a
    // temporary copy of both strings for each call.
    static bool fieldneeded(const std::set<std::string>& fieldspec, bool isinc,
                            const std::pair<const std::string, std::string>& entry) {
        if (entry.second.empty()) {
            return false;
        }
        const bool listed = fieldspec.find(entry.first) != fieldspec.end();
        return isinc ? listed : !listed;
    }

    // Store correspondence between field name and index in offsets array.
    std::map<std::string, int> keyidx;

    // Storage for one doc. Uses a char array (base) for the data and an std::vector for the field
    // offsets. Notes:
    // - offsets[0] is always 0, not really useful, simpler this way.
    // - This could be made more efficient by going C: make the storage more linear: one big char
    //   array for all the docs data and one big vector of Nxm offsets with a bit of index
    //   computations. Or, not:
    //     *** Actually tried it, but not significantly better. See resultstore_bigarrays branch for
    //         more details. ***
    // - The struct owns the base buffer and releases it with free(). It is move-only: a
    //   memberwise copy would leave two objects freeing the same buffer, for example when the
    //   docs vector reallocates its storage.
    struct docoffs {
        docoffs() = default;
        ~docoffs() {
            free(base);
        }
        docoffs(const docoffs&) = delete;
        docoffs& operator=(const docoffs&) = delete;
        // Steal the buffer and the offsets, leaving the source empty (null base).
        docoffs(docoffs&& other) noexcept : base(other.base) {
            other.base = nullptr;
            offsets.swap(other.offsets);
        }
        // Exchange contents: our previous buffer is released when 'other' is destroyed.
        docoffs& operator=(docoffs&& other) noexcept {
            if (this != &other) {
                char *previous = base;
                base = other.base;
                other.base = previous;
                offsets.swap(other.offsets);
            }
            return *this;
        }
        char *base{nullptr};
        std::vector<int> offsets;
    };

    // One entry with data and offsets for each result doc.
    std::vector<struct docoffs> docs;
};

// Create an empty store: allocate the private implementation object.
QResultStore::QResultStore()
    : m(new Internal)
{
}
// Release the private implementation object allocated by the constructor. Destroying it
// destroys the docs vector, and each stored doc frees its own data buffer.
QResultStore::~QResultStore()
{
    delete m;
}

// Walk the results of a query and store the selected fields of each document in compact form:
// one character buffer per document, holding the null-terminated field values, and an array of
// offsets into the buffer, indexed by field slot.
//
// @param query the query to fetch the documents from. Documents are fetched with getDoc() for
//     increasing indexes until it returns false.
// @param fldspec set of metadata field names, interpreted according to isinc.
// @param isinc if true, fldspec lists the metadata fields to store, else it lists the ones to
//     skip. The url, mimetype, fmtime, dmtime, fbytes and dbytes fields are always stored.
// @return always true. The process is aborted if a data buffer can't be allocated.
//
// Any data stored by a previous call is released and replaced.
bool QResultStore::storeQuery(Rcl::Query& query, const std::set<std::string>& fldspec, bool isinc)
{
    LOGDEB1("QResultStore::storeQuery: fldspec " << stringsToString(fldspec)  << " isinc " <<
           isinc << '\n');

    // Number of fields which are always stored. They use the first slots of the offsets array.
    constexpr int nstaticfields = 6;

    // Forget the results of a previous call, if any. Without this, the buffers of the existing
    // entries would be leaked when their base pointers are overwritten below.
    m->docs.clear();

    /////////////
    // Enumerate all existing keys and assign array indexes for them. Count documents while we are
    // at it.

    // The fields we always include
    m->keyidx = {{"url", 0},
                 {"mimetype", 1},
                 {"fmtime", 2},
                 {"dmtime", 3},
                 {"fbytes", 4},
                 {"dbytes", 5}
    };

    // Walk the docs and assign a keyidx slot to any field which both is included by fldspec and
    // exists in at least one doc.
    int count = 0;
    for (;;count++) {
        Rcl::Doc doc;
        if (!query.getDoc(count, doc, false)) {
            break;
        }
        for (const auto& entry : doc.meta) {
            if (Internal::fieldneeded(fldspec, isinc, entry)) {
                // The next free slot is the current map size. No effect if the key exists.
                m->keyidx.try_emplace(entry.first, static_cast<int>(m->keyidx.size()));
            }
        }
    }

    ///////
    // Populate the main array with doc-equivalent structures.

    m->docs.resize(count);

    for (int i = 0; i < count; i++) {
        Rcl::Doc doc;
        if (!query.getDoc(i, doc, false)) {
            // The remaining entries keep a null base and no offsets: fieldValue() returns
            // nullptr for them.
            break;
        }
        auto& vdoc = m->docs[i];
        vdoc.offsets.resize(m->keyidx.size());

        // Compute the buffer size: each stored value is followed by a null byte.
        auto nbytes =
            doc.url.size() + 1 +
            doc.mimetype.size() + 1 +
            doc.fmtime.size() + 1 +
            doc.dmtime.size() + 1 +
            doc.fbytes.size() + 1 +
            doc.dbytes.size() + 1;
        for (const auto& entry : doc.meta) {
            if (Internal::fieldneeded(fldspec, isinc, entry)) {
                nbytes += entry.second.size() + 1;
            }
        }

        char *cp = static_cast<char*>(malloc(nbytes));
        if (nullptr == cp) {
            abort();
        }

        // Copy a string and its terminating null byte at the current position, then advance
        // the position past the copied data.
        auto append = [&cp](const std::string& s) {
            memcpy(cp, s.c_str(), s.size() + 1);
            cp += s.size() + 1;
        };

        // Copy to storage and take note of offsets for all static (6 first) fields.
        vdoc.base = cp;
        vdoc.offsets[0] = static_cast<int>(cp - vdoc.base);
        append(doc.url);
        vdoc.offsets[1] = static_cast<int>(cp - vdoc.base);
        // Offset of the null byte which terminates the url: used as the value of all the
        // absent or empty variable fields.
        const auto firstzero = vdoc.offsets[1] - 1;
        append(doc.mimetype);
        vdoc.offsets[2] = static_cast<int>(cp - vdoc.base);
        append(doc.fmtime);
        vdoc.offsets[3] = static_cast<int>(cp - vdoc.base);
        append(doc.dmtime);
        vdoc.offsets[4] = static_cast<int>(cp - vdoc.base);
        append(doc.fbytes);
        vdoc.offsets[5] = static_cast<int>(cp - vdoc.base);
        append(doc.dbytes);

        // Walk our variable field list. If doc.meta[fld] is absent or empty, store a pointer to a
        // zero byte, else copy the data and store a pointer to it. Walking the field list and not
        // the doc meta allows setting all the needed empty pointers while we are at it.
        for (const auto& [fldname, slot] : m->keyidx) {
            if (slot < nstaticfields) {
                // Static field: already stored above.
                continue;
            }
            auto it = doc.meta.find(fldname);
            if (it == doc.meta.end() || it->second.empty()) {
                vdoc.offsets[slot] = firstzero;
            } else {
                vdoc.offsets[slot] = static_cast<int>(cp - vdoc.base);
                append(it->second);
            }
        }
    }
    return true;
}

// Return the number of entries in the document store.
int QResultStore::getCount()
{
    const auto ndocs = m->docs.size();
    return static_cast<int>(ndocs);
}

// Return the stored value of a field for one document.
// @param docindex index of the document in the store.
// @param fldname name of the field.
// @return a pointer to the null-terminated value inside the document buffer, or nullptr if
//     docindex is out of range, if the field name has no assigned slot, or if the slot is
//     outside of the document offsets array.
const char *QResultStore::fieldValue(int docindex, const std::string& fldname)
{
    const int ndocs = static_cast<int>(m->docs.size());
    if (docindex >= 0 && docindex < ndocs) {
        const auto& stored = m->docs[docindex];
        const auto found = m->keyidx.find(fldname);
        if (found != m->keyidx.end()) {
            const int slot = found->second;
            const int nslots = static_cast<int>(stored.offsets.size());
            if (slot >= 0 && slot < nslots) {
                return stored.base + stored.offsets[slot];
            }
        }
    }
    return nullptr;
}

} // namespace Rcl