File: matching.c

package info (click to toggle)
pg-similarity 1.0-9
links: PTS, VCS
area: main
in suites: forky, sid
size: 576 kB
sloc: ansic: 3,257; sql: 517; makefile: 25; sh: 1
file content (162 lines) | stat: -rw-r--r-- 3,451 bytes
parent folder | download | duplicates (2)
/*----------------------------------------------------------------------------
 *
 * matching.c
 *
 * The Matching Coefficient is a simple vector based approach
 *
 *          nt
 * s = -------------
 *      max(nx, ny)
 *
 * where nt is the number of common n-grams found in both strings, nx is the
 * number of n-grams in x, and ny is the number of n-grams in y.
 *
 * For example:
 *
 * x: euler = {e, u, l, e, r}
 * y: heuser = {h, e, u, s, e, r}
 *
 *      4
 * s = --- = 0.666...
 *      6
 *
 * PS> we call n-grams: (i) n-sequence of letters (ii) n-sequence of words
 *
 *
 * Copyright (c) 2008-2018, Euler Taveira de Oliveira
 *
 *----------------------------------------------------------------------------
 */

#include "similarity.h"
#include "tokenizer.h"


/*
 * GUC variables
 *
 * Settings read by matchingcoefficient() and matchingcoefficient_op() in
 * this file.
 */
/* tokenizer used to split both arguments (one of the PGS_UNIT_* values) */
int		pgs_matching_tokenizer = PGS_UNIT_ALNUM;
/* minimum similarity for matchingcoefficient_op() to return true */
double	pgs_matching_threshold = 0.7;
/* when true, matchingcoefficient() divides the common token count by max(nx, ny) */
bool	pgs_matching_is_normalized = true;

PG_FUNCTION_INFO_V1(matchingcoefficient);

/*
 * matchingcoefficient
 *
 * Computes the Matching Coefficient of two text arguments.
 *
 * Both arguments are converted to C strings and split into tokens with the
 * tokenizer selected by pgs_matching_tokenizer (non-alphanumeric splitting
 * is used for PGS_UNIT_ALNUM and for any unrecognized value). The function
 * then counts how many tokens of the first list also appear in the second
 * list (comtok).
 *
 * Returns a float8:
 *   - comtok / max(atok, btok) when pgs_matching_is_normalized is true;
 *   - comtok otherwise.
 *
 * When neither argument yields a token, the normalized ratio would be 0 / 0;
 * 0.0 is returned in that case instead of NaN.
 *
 * Raises ERRCODE_INVALID_PARAMETER_VALUE if either argument is longer than
 * PGS_MAX_STR_LEN bytes.
 */
Datum
matchingcoefficient(PG_FUNCTION_ARGS)
{
	char		*a, *b;
	TokenList	*s, *t;
	Token		*p, *q;
	int		atok, btok, comtok, maxtok;
	float8		res;

	a = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0))));
	b = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1))));

	if (strlen(a) > PGS_MAX_STR_LEN || strlen(b) > PGS_MAX_STR_LEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				errmsg("argument exceeds the maximum length of %d bytes",
					PGS_MAX_STR_LEN)));

	/* lists */
	s = initTokenList(0);
	t = initTokenList(0);

	switch (pgs_matching_tokenizer)
	{
		case PGS_UNIT_WORD:
			tokenizeBySpace(s, a);
			tokenizeBySpace(t, b);
			break;
		case PGS_UNIT_GRAM:
			tokenizeByGram(s, a);
			tokenizeByGram(t, b);
			break;
		case PGS_UNIT_CAMELCASE:
			tokenizeByCamelCase(s, a);
			tokenizeByCamelCase(t, b);
			break;
		case PGS_UNIT_ALNUM:
		default:
			tokenizeByNonAlnum(s, a);
			tokenizeByNonAlnum(t, b);
			break;
	}

	elog(DEBUG3, "Token List A");
	printToken(s);
	elog(DEBUG3, "Token List B");
	printToken(t);

	atok = s->size;
	btok = t->size;
	maxtok = max2(atok, btok);

	comtok = 0;

	/*
	 * For each token of s, scan t until an equal token is found.
	 *
	 * XXX consider sorting s and t when we're dealing with large lists?
	 */
	p = s->head;
	while (p != NULL)
	{
		int found = 0;

		q = t->head;
		while (q != NULL)
		{
			elog(DEBUG3, "p: %s; q: %s", p->data, q->data);
			if (strcmp(p->data, q->data) == 0)
			{
				found = 1;
				break;
			}
			q = q->next;
		}

		if (found)
		{
			comtok++;
			elog(DEBUG2, "\"%s\" found; comtok = %d", p->data, comtok);
		}

		p = p->next;
	}

	destroyTokenList(s);
	destroyTokenList(t);

	elog(DEBUG1, "is normalized: %d", pgs_matching_is_normalized);
	elog(DEBUG1, "common tokens size: %d", comtok);
	elog(DEBUG1, "maximum token size: %d", maxtok);

	if (pgs_matching_is_normalized)
	{
		/*
		 * Both token lists are empty when maxtok is zero; avoid the 0 / 0
		 * division (which yields NaN) and report no similarity instead.
		 */
		if (maxtok == 0)
			res = 0.0;
		else
			res = (float8) comtok / maxtok;
	}
	else
		res = comtok;

	PG_RETURN_FLOAT8(res);
}

PG_FUNCTION_INFO_V1(matchingcoefficient_op);

/*
 * matchingcoefficient_op
 *
 * Boolean form of matchingcoefficient(): returns true when the normalized
 * Matching Coefficient of the two arguments is greater than or equal to
 * pgs_matching_threshold.
 *
 * pgs_matching_is_normalized is forced to true for the duration of the call
 * and restored afterwards, including when matchingcoefficient() raises an
 * error (which is then re-thrown to the caller).
 */
Datum matchingcoefficient_op(PG_FUNCTION_ARGS)
{
	float8	res = 0.0;

	/*
	 * store *_is_normalized value temporarily because the
	 * threshold (we're comparing against) is normalized
	 */
	bool	tmp = pgs_matching_is_normalized;

	pgs_matching_is_normalized = true;

	/*
	 * matchingcoefficient() can ereport(ERROR), which leaves this function
	 * through a non-local exit; restore the saved setting on that path too,
	 * otherwise it would stay overridden after the error.
	 */
	PG_TRY();
	{
		res = DatumGetFloat8(DirectFunctionCall2(
						matchingcoefficient,
						PG_GETARG_DATUM(0),
						PG_GETARG_DATUM(1)));
	}
	PG_CATCH();
	{
		pgs_matching_is_normalized = tmp;
		PG_RE_THROW();
	}
	PG_END_TRY();

	/* we're done; back to the previous value */
	pgs_matching_is_normalized = tmp;

	PG_RETURN_BOOL(res >= pgs_matching_threshold);
}