File: dbdimp_tokenizer.inc

package info (click to toggle)
libdbd-sqlite3-perl 1.62-3
  • links: PTS, VCS
  • area: main
  • in suites: buster, sid
  • size: 9,708 kB
  • sloc: ansic: 140,930; perl: 8,458; pascal: 286; makefile: 7
file content (296 lines) | stat: -rw-r--r-- 8,526 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
/*
** A tokenizer instance as seen by this module: the generic
** sqlite3_tokenizer header followed by the Perl callback.
** perl_tokenizer_Create hands out &base, and perl_tokenizer_Destroy /
** perl_tokenizer_Open cast that pointer back to perl_tokenizer*.
*/
typedef struct perl_tokenizer {
    sqlite3_tokenizer base;      /* generic part, handed to the caller as *ppTokenizer */
    SV *coderef;                 /* the perl tokenizer is a coderef that takes
                                    a string and returns a cursor coderef;
                                    copied with newSVsv() in perl_tokenizer_Create,
                                    released with sv_free() in perl_tokenizer_Destroy */
} perl_tokenizer;

/*
** State of one tokenization pass over one input string.
** Allocated and zero-filled by perl_tokenizer_Open, advanced by
** perl_tokenizer_Next, released by perl_tokenizer_Close.
*/
typedef struct perl_tokenizer_cursor {
    sqlite3_tokenizer_cursor base; /* generic part, handed to the caller as *ppCursor */
    SV *coderef;                 /* ref to the closure that returns terms */
    char *pToken;                /* storage for a copy of the last token */
    int nTokenAllocated;         /* space allocated to pToken buffer */

    /* members below are only used if the input string is in utf8;
       perl_tokenizer_Next tests pInput != NULL to detect that mode */
    const char *pInput;          /* input we are tokenizing */
    const char *lastByteOffset;  /* pointer into pInput, updated after each token */
    int lastCharOffset;          /* char offset corresponding to lastByteOffset */
} perl_tokenizer_cursor;

/*
** Build a new tokenizer instance.
** Runs whenever a FTS3 table is created with
**   CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** where qualified::function::name is a fully qualified perl function.
**
** argv[0] is that function name. The function is called in scalar
** context with no arguments, and a copy of what it returns is stored
** as the tokenizer coderef.
**
** Returns SQLITE_ERROR when no argument was supplied, SQLITE_NOMEM when
** the instance cannot be allocated, SQLITE_OK otherwise.
*/
static int perl_tokenizer_Create(
    int argc, const char * const *argv,
    sqlite3_tokenizer **ppTokenizer
){
    dTHX;
    dSP;
    perl_tokenizer *tok;
    int count;

    /* the perl function name is mandatory */
    if (argc == 0) {
        return SQLITE_ERROR;
    }

    tok = (perl_tokenizer *) sqlite3_malloc(sizeof(perl_tokenizer));
    if (tok == NULL) {
        return SQLITE_NOMEM;
    }
    memset(tok, 0, sizeof(perl_tokenizer));

    ENTER;
    SAVETMPS;

    /* invoke qualified::function::name with an empty argument list */
    PUSHMARK(SP);
    PUTBACK;
    count = call_pv(argv[0], G_SCALAR);
    SPAGAIN;

    if (count != 1) {
        warn("tokenizer_Create returned %d arguments", count);
    }

    /* keep a copy of the returned coderef in the tokenizer structure */
    tok->coderef = newSVsv(POPs);
    *ppTokenizer = &tok->base;

    PUTBACK;
    FREETMPS;
    LEAVE;

    return SQLITE_OK;
}

/*
** Release a tokenizer instance: drop the Perl coderef it holds, then
** free the structure itself. Always returns SQLITE_OK.
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
    dTHX;
    perl_tokenizer *tok = (perl_tokenizer *) pTokenizer;

    sv_free(tok->coderef);
    sqlite3_free(tok);

    return SQLITE_OK;
}

/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is pInput[0..nBytes-1]; when nBytes is
** negative (fts3 passes -1) the length is taken with strlen() instead.
** A Perl copy of the string is passed to the tokenizer coderef, which
** returns a closure implementing the cursor (so the cursor is again a
** coderef).
**
** When the last database handle is in unicode mode, the Perl string is
** flagged as utf8 and the cursor remembers pInput so that
** perl_tokenizer_Next can translate character offsets into byte offsets.
**
** Returns SQLITE_NOMEM if the cursor cannot be allocated (in which case
** *ppCursor is left untouched), SQLITE_OK otherwise.
*/
static int perl_tokenizer_Open(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
){
    dTHX;
    dSP;
    dMY_CXT;
    U32 flags;
    SV *perl_string;
    int n_retval;

    perl_tokenizer *t = (perl_tokenizer *)pTokenizer;

    /* allocate and initialize the cursor struct; bail out before
       touching the Perl stack if there is no memory */
    perl_tokenizer_cursor *c;
    c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
    if (c == NULL) {
        return SQLITE_NOMEM;
    }
    memset(c, 0, sizeof(*c));
    *ppCursor = &c->base;

    /* flags for creating the Perl SV containing the input string */
    flags = SVs_TEMP; /* will call sv_2mortal */

    /* special handling if working with utf8 strings */
    if (MY_CXT.last_dbh_is_unicode) {

        /* data to keep track of byte offsets */
        c->lastByteOffset = c->pInput = pInput;
        c->lastCharOffset = 0;

        /* string passed to Perl needs to be flagged as utf8 */
        flags |= SVf_UTF8;
    }

    ENTER;
    SAVETMPS;

    /* build a Perl copy of the input string; a negative length means
       "NUL-terminated", and a NULL input is treated as empty rather
       than handed to strlen() */
    if (nBytes < 0) {
        nBytes = pInput ? (int)strlen(pInput) : 0;
    }
    perl_string = newSVpvn_flags(pInput, nBytes, flags);

    /* call the tokenizer coderef */
    PUSHMARK(SP);
    XPUSHs(perl_string);
    PUTBACK;
    n_retval = call_sv(t->coderef, G_SCALAR);
    SPAGAIN;

    /* store the cursor coderef returned by the tokenizer */
    if (n_retval != 1) {
        warn("tokenizer returned %d arguments", n_retval);
    }
    c->coderef = newSVsv(POPs);

    PUTBACK;
    FREETMPS;
    LEAVE;
    return SQLITE_OK;
}

/*
** Dispose of a cursor obtained from perl_tokenizer_Open(): release the
** Perl closure, the token buffer (if one was ever allocated) and the
** cursor structure. Always returns SQLITE_OK.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
    dTHX;
    perl_tokenizer_cursor *cur = (perl_tokenizer_cursor *) pCursor;

    sv_free(cur->coderef);

    if (cur->pToken != NULL) {
        sqlite3_free(cur->pToken);
    }
    sqlite3_free(cur);

    return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.  The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
**
** The cursor closure is called in list context with no arguments. It
** must return either an empty list (no more tokens) or exactly five
** values: (token, length, start offset, end offset, position).
**
** In utf8 mode (c->pInput set by perl_tokenizer_Open) the offsets coming
** from Perl are character offsets; they are converted to byte offsets
** into the original input, and the token length is recomputed in bytes.
**
** Returns SQLITE_OK with the OUT parameters filled in, SQLITE_DONE when
** the closure returns an empty list, SQLITE_ERROR when it returns a list
** of the wrong size, or SQLITE_NOMEM when the token buffer cannot grow.
** *ppToken points into storage owned by the cursor and is overwritten by
** the next call.
*/
static int perl_tokenizer_Next(
    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
    const char **ppToken,               /* OUT: *ppToken is the token text */
    int *pnBytes,                       /* OUT: Number of bytes in token */
    int *piStartOffset,                 /* OUT: Starting offset of token */
    int *piEndOffset,                   /* OUT: Ending offset of token */
    int *piPosition                     /* OUT: Position integer of token */
){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    int result;
    int n_retval;
    char *token;
    char *byteOffset;
    STRLEN n_a; /* this is required for older perls < 5.8.8 */
    I32 hop;

    dTHX;
    dSP;

    ENTER;
    SAVETMPS;

    /* call the cursor */
    PUSHMARK(SP);
    PUTBACK;
    n_retval = call_sv(c->coderef, G_ARRAY);
    SPAGAIN;

    /* if we get back an empty list, there is no more token */
    if (n_retval == 0) {
        result = SQLITE_DONE;
    }
    /* a list of any other size than 5 cannot be decoded: popping five
       values from it would read below the stack mark or leave extra
       values behind, so discard whatever was returned and fail */
    else if (n_retval != 5) {
        warn("tokenizer cursor returned %d arguments", n_retval);
        SP -= n_retval;
        result = SQLITE_ERROR;
    }
    /* otherwise, get token details from the return list */
    else {
        result = SQLITE_OK;

        /* values are popped in reverse order of the returned list */
        *piPosition    = POPi;
        *piEndOffset   = POPi;
        *piStartOffset = POPi;
        *pnBytes       = POPi;
        token          = POPpx;

        if (c->pInput) { /* if working with utf8 data */
            /* offsets as reported by Perl, in characters; saved because
               the OUT parameters are about to be rewritten in bytes */
            const int startChar = *piStartOffset;
            const int endChar   = *piEndOffset;

            /* recompute *pnBytes in bytes, not in chars */
            *pnBytes = (int)strlen(token);

            /* hop from the previous position to the start of the token */
            hop            = startChar - c->lastCharOffset;
            byteOffset     = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
            *piStartOffset = byteOffset - c->pInput;

            /* hop over the token itself to find its end */
            hop            = endChar - startChar;
            byteOffset     = (char*)utf8_hop((U8*)byteOffset, hop);
            *piEndOffset   = byteOffset - c->pInput;

            /* remember where we are for next round: the CHARACTER
               offset must be paired with the matching byte pointer,
               otherwise the next hop is computed from a byte count */
            c->lastCharOffset = endChar;
            c->lastByteOffset = byteOffset;
        }

        /* make sure we have enough storage for copying the token;
           nTokenAllocated is only updated once the buffer really grew */
        if (*pnBytes > c->nTokenAllocated) {
            int nNew    = *pnBytes + 20;
            char *pNew  = sqlite3_realloc(c->pToken, nNew);
            if (pNew == NULL) {
                /* fall through to the common exit so that the Perl
                   scope opened by ENTER/SAVETMPS is closed properly */
                result = SQLITE_NOMEM;
            }
            else {
                c->pToken          = pNew;
                c->nTokenAllocated = nNew;
            }
        }

        if (result == SQLITE_OK) {
            /* need to copy the token into the C cursor before perl
               frees that memory */
            memcpy(c->pToken, token, *pnBytes);
            *ppToken = c->pToken;
        }
    }

    PUTBACK;
    FREETMPS;
    LEAVE;

    return result;
}

/*
** The set of routines that implement the perl tokenizer.
** A pointer to this structure is what
** sqlite_db_register_fts3_perl_tokenizer() binds into
** "SELECT fts3_tokenizer(?, ?)" under the name "perl".
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
    0,                        /* NOTE(review): presumably the interface version
                                 field of sqlite3_tokenizer_module -- confirm
                                 against fts3_tokenizer.h */
    perl_tokenizer_Create,    /* build a tokenizer from tokenize=perl <name> */
    perl_tokenizer_Destroy,   /* free a tokenizer */
    perl_tokenizer_Open,      /* start tokenizing one input string */
    perl_tokenizer_Close,     /* free a cursor */
    perl_tokenizer_Next       /* fetch the next token from a cursor */
};

/*
** Register the perl tokenizer with FTS3 on the given database handle,
** under the name "perl", by running "SELECT fts3_tokenizer(?, ?)" with
** the address of perl_tokenizer_Module bound as a blob.
**
** When compiled against SQLite >= 3.12.0 the two-argument form of
** fts3_tokenizer() is first enabled for this connection through
** SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER.
**
** Returns FALSE (after recording an error on dbh) when the handle is
** not active; otherwise returns the SQLite result code of the first step
** that failed, or the result of sqlite3_finalize() on the statement.
** NOTE(review): the inactive-handle path returns FALSE while every other
** path returns an SQLite result code; confirm how callers tell the two
** apart.
*/
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
    D_imp_dbh(dbh);

    int rc;
    sqlite3_stmt *pStmt;
    const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
    sqlite3_tokenizer_module *p = &perl_tokenizer_Module;

    if (!DBIc_ACTIVE(imp_dbh)) {
        sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle");
        return FALSE;
    }

#if SQLITE_VERSION_NUMBER >= 3012000
    rc = sqlite3_db_config(imp_dbh->db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }
#endif

    rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }

    /* bind the tokenizer name and the module pointer (the blob holds the
       pointer value itself, hence &p / sizeof(p)); do not run the
       statement with a parameter that failed to bind */
    rc = sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
    if( rc==SQLITE_OK ){
        rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
    }
    if( rc!=SQLITE_OK ){
        sqlite3_finalize(pStmt);
        return rc;
    }

    /* the outcome of the step is picked up from sqlite3_finalize() */
    sqlite3_step(pStmt);

    return sqlite3_finalize(pStmt);
}