1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
/*
** The author disclaims copyright to this source code.
**
*************************************************************************
** Implementation of the "simple" full-text-search tokenizer.
*/
#include <assert.h>
#if !defined(__APPLE__)
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "tokenizer.h"
/* Duplicate a string; the caller must free() the returned string.
** Returns NULL if memory for the copy cannot be allocated.
** (We don't use strdup() since it's not part of the standard C library and
** may not be available everywhere.) */
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  size_t nCopy = strlen(s) + 1;   /* bytes to copy, including the nul */
  char *str = malloc(nCopy);
  if( str!=NULL ){
    /* Length is already known, so memcpy avoids a second scan of s. */
    memcpy(str, s, nCopy);
  }
  return str;
}
/*
** State for one "simple" tokenizer instance. Instances are allocated in
** simpleCreate() and released in simpleDestroy().
*/
typedef struct simple_tokenizer {
sqlite3_tokenizer base; /* kept first: this file casts between sqlite3_tokenizer* and simple_tokenizer* */
const char *zDelim; /* nul-terminated set of delimiter bytes; heap copy freed by simpleDestroy() */
} simple_tokenizer;
/*
** Iteration state for one pass over an input buffer. Cursors are
** allocated in simpleOpen() and released in simpleClose().
*/
typedef struct simple_tokenizer_cursor {
sqlite3_tokenizer_cursor base; /* kept first: this file casts between the base cursor type and this struct */
const char *pInput; /* input we are tokenizing; the pointer is stored, the text is not copied */
int nBytes; /* size of the input in bytes */
const char *pCurrent; /* current position in pInput */
int iToken; /* index of next token to be returned, counting from 0 */
char *zToken; /* heap storage for current token; NULL until first needed */
int nTokenBytes; /* actual size of current token */
int nTokenAllocated; /* space allocated to zToken buffer */
} simple_tokenizer_cursor;
/* Tentative declaration of the method table; its initializer appears
** further down, after the functions it points at have been defined. */
static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
/*
** Create a new "simple" tokenizer.
**
** If argc>1, argv[1] is copied and used as the set of delimiter bytes.
** Otherwise the delimiters default to every 7-bit ASCII value in the
** range 0x01..0x7f for which isalnum() is false.
**
** On success the new tokenizer is stored in *ppTokenizer and SQLITE_OK is
** returned; release it with simpleDestroy(). If memory cannot be
** allocated, SQLITE_NOMEM is returned, nothing is leaked and
** *ppTokenizer is left untouched.
*/
static int simpleCreate(
  int argc, const char **argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *t;

  t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
  if( t==NULL ) return SQLITE_NOMEM;

  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex.  One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc>1 ){
    t->zDelim = string_dup(argv[1]);
  } else {
    /* Build a string excluding alphanumeric ASCII characters */
    char zDelim[0x80];               /* nul-terminated, so nul not a member */
    int i, j;
    for(i=1, j=0; i<0x80; i++){
      if( !isalnum(i) ){
        zDelim[j++] = (char)i;
      }
    }
    zDelim[j++] = '\0';
    assert( j<=(int)sizeof(zDelim) );
    t->zDelim = string_dup(zDelim);
  }

  /* Never hand out a tokenizer without a delimiter set: simpleNext()
  ** passes zDelim straight to the string functions. */
  if( t->zDelim==NULL ){
    free(t);
    return SQLITE_NOMEM;
  }

  *ppTokenizer = &t->base;
  return SQLITE_OK;
}
/*
** Release a tokenizer created by simpleCreate(): first the delimiter
** string it owns, then the tokenizer structure itself. Always returns
** SQLITE_OK.
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  simple_tokenizer *pSimple = (simple_tokenizer *) pTokenizer;
  void *pOwnedDelim = (void *) pSimple->zDelim;  /* cast drops the const */

  free(pOwnedDelim);
  free(pSimple);
  return SQLITE_OK;
}
/*
** Open a cursor for tokenizing pInput.
**
** nBytes is the length of pInput in bytes; a negative value means pInput
** is nul-terminated and its length is taken with strlen(). The input is
** not copied - only the pointer is stored - so the buffer must remain
** valid for as long as the cursor is in use.
**
** On success the cursor is stored in *ppCursor and SQLITE_OK is returned;
** release it with simpleClose(). Returns SQLITE_NOMEM, leaving *ppCursor
** untouched, if the cursor cannot be allocated.
**
** NOTE(review): pTokenizer is not used here and base.pTokenizer is not
** set, yet simpleNext() reads it - presumably the caller fills it in
** after this returns; confirm against the calling code.
*/
static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,
  const char *pInput, int nBytes,
  sqlite3_tokenizer_cursor **ppCursor
){
  simple_tokenizer_cursor *c;

  c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
  if( c==NULL ) return SQLITE_NOMEM;

  c->pInput = pInput;
  c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
  c->pCurrent = c->pInput;         /* start tokenizing at the beginning */
  c->iToken = 0;
  c->zToken = NULL;                /* no space allocated, yet. */
  c->nTokenBytes = 0;
  c->nTokenAllocated = 0;

  *ppCursor = &c->base;
  return SQLITE_OK;
}
/*
** Dispose of a cursor obtained from simpleOpen(): the token buffer (if
** one was ever allocated) and then the cursor itself. Always returns
** SQLITE_OK.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *pCsr = (simple_tokenizer_cursor *) pCursor;

  free(pCsr->zToken);              /* free(NULL) is a no-op */
  free(pCsr);
  return SQLITE_OK;
}
/*
** Extract the next token from the cursor's input.
**
** A token is a maximal run of bytes that are not in the tokenizer's
** delimiter set; a nul byte inside the input also ends a token. The scan
** never looks beyond the nBytes recorded by simpleOpen(), so the input
** does not need to be nul-terminated.
**
** On SQLITE_OK the outputs are:
**   *ppToken        nul-terminated copy of the token with bytes below
**                   0x80 passed through tolower(). It lives in the
**                   cursor's buffer and is overwritten by the next call
**                   and freed by simpleClose().
**   *pnBytes        length of the token in bytes
**   *piStartOffset  byte offset of the token's first byte in the input
**   *piEndOffset    byte offset just past the token's last byte
**   *piPosition     index of this token, counting from 0
**
** Returns SQLITE_DONE when the input is exhausted, or SQLITE_NOMEM if the
** token buffer cannot be grown (the cursor position is then unchanged and
** the previous buffer is still owned by the cursor).
*/
static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken, int *pnBytes,
  int *piStartOffset, int *piEndOffset, int *piPosition
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  const char *zEnd = c->pInput + c->nBytes;   /* one past the last input byte */
  const char *z = c->pCurrent;
  const char *zStart;
  int n;
  int ii;

  /* Skip delimiters. strchr() also matches the terminator of zDelim, so
  ** an embedded nul byte is treated as a delimiter. */
  while( z<zEnd && strchr(t->zDelim, *z)!=NULL ){
    z++;
  }
  if( z>=zEnd ){
    c->pCurrent = zEnd;
    return SQLITE_DONE;
  }

  /* Collect the token, stopping at a delimiter or at the end of input. */
  zStart = z;
  while( z<zEnd && strchr(t->zDelim, *z)==NULL ){
    z++;
  }
  n = (int) (z-zStart);

  /* Grow the token buffer only when it is too small, and keep the old
  ** buffer if realloc() fails so that simpleClose() can still free it. */
  if( n+1>c->nTokenAllocated ){
    char *zNew = realloc(c->zToken, (size_t)n+1);
    if( zNew==NULL ) return SQLITE_NOMEM;
    c->zToken = zNew;
    c->nTokenAllocated = n+1;
  }

  for(ii=0; ii<n; ii++){
    /* TODO(shess) This needs expansion to handle UTF-8
    ** case-insensitivity.
    */
    char ch = zStart[ii];
    c->zToken[ii] = (unsigned char)ch<0x80 ? (char)tolower(ch) : ch;
  }
  c->zToken[n] = '\0';
  c->nTokenBytes = n;

  *ppToken = c->zToken;
  *pnBytes = n;
  *piStartOffset = (int) (zStart-c->pInput);
  *piEndOffset = *piStartOffset+n;
  *piPosition = c->iToken++;

  /* Resume at the byte that ended this token; the next call skips it
  ** along with any further delimiters. */
  c->pCurrent = z;
  return SQLITE_OK;
}
/*
** Method table for the simple tokenizer. The initializer is positional,
** so the entries must follow the member order of
** sqlite3_tokenizer_module, whose declaration is not in this file
** (presumably tokenizer.h - confirm before reordering anything here).
*/
static sqlite3_tokenizer_module simpleTokenizerModule = {
0, /* NOTE(review): meaning not visible here; looks like a version number - confirm */
simpleCreate, /* create a tokenizer */
simpleDestroy, /* free a tokenizer */
simpleOpen, /* open a cursor over an input buffer */
simpleClose, /* free a cursor */
simpleNext, /* fetch the next token from a cursor */
};
/*
** Public entry point of this file: store the address of the simple
** tokenizer's method table in *ppModule. The table is a file-scope
** static object, so the caller must not free it.
*/
void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule){
  sqlite3_tokenizer_module *pSimple = &simpleTokenizerModule;
  *ppModule = pSimple;
}
|