File: tokenizer.h

package info (click to toggle)
pg-similarity 1.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 576 kB
  • sloc: ansic: 3,257; sql: 517; makefile: 25; sh: 1
file content (48 lines) | stat: -rw-r--r-- 1,125 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/*----------------------------------------------------------------------------
 *
 * tokenizer.h
 *
 * Copyright (c) 2008-2018, Euler Taveira de Oliveira
 *
 *----------------------------------------------------------------------------
 */

#include "postgres.h"

#include <ctype.h>
#include <string.h>
#include <stdlib.h>

/*
 * Include guard: the struct and typedef definitions below must not be seen
 * twice in the same translation unit, which would happen if this header were
 * pulled in both directly and through another header.
 */
#ifndef TOKENIZER_H
#define TOKENIZER_H

/*
 * Upper bound related to a single token.
 * NOTE(review): whether this counts bytes or characters, and whether it
 * includes the terminating NUL, is decided by the implementation - confirm
 * there before relying on it.
 */
#define	PGS_MAX_TOKEN_LEN	1024

/* Gram length; presumably the "n" used by tokenizeByGram() - verify. */
#define	PGS_GRAM_LEN		3
/* A single space character. NOTE(review): presumably used as padding/blank
 * marker by the n-gram tokenizer - confirm against the implementation. */
#define	PGS_BLANK_CHAR		' '

/*
 * Compile-time switch (defined without a value).  Its effect is not visible
 * in this header; see the code that tests it with #ifdef.
 */
#define	PGS_FULL_NGRAM

/*
 * One node of a singly linked list of tokens: each node carries a string
 * pointer, a counter and a pointer to the following node.
 */
typedef struct Token
{
	char		*data;	/* token data */
	int		freq;	/* frequency */
	struct Token	*next;	/* next token */
} Token;

/*
 * List header: keeps both ends of the Token chain plus bookkeeping fields.
 */
typedef struct TokenList
{
	int	isset;	/* is a set? */
	int	size;	/* list size */
	Token	*head;	/* first token */
	Token	*tail;	/* last token */
} TokenList;

/*
 * Token list management.
 *
 * The definitions are not part of this header.  NOTE(review): the meaning of
 * the int return values of addToken()/removeToken(), which token
 * removeToken() drops, and who owns the string passed as "s" are not stated
 * here - confirm against the implementation (presumably tokenizer.c).
 */
TokenList *initTokenList(int isset);
void destroyTokenList(TokenList *t);
int addToken(TokenList *t, char *s);
int removeToken(TokenList *t);
Token *searchToken(TokenList *t, char *s);
void printToken(TokenList *t);

/*
 * Tokenizers.
 *
 * Each takes a token list "t" and an input string "s".  Judging by the names
 * only, they split "s" on non-alphanumeric characters, on spaces, into
 * n-grams, and on camel-case boundaries respectively, and presumably store
 * the resulting tokens in "t" - verify against the implementation.
 */
void tokenizeByNonAlnum(TokenList *t, char *s);
void tokenizeBySpace(TokenList *t, char *s);
void tokenizeByGram(TokenList *t, char *s);
void tokenizeByCamelCase(TokenList *t, char *s);

#endif							/* TOKENIZER_H */