1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
|
/*----------------------------------------------------------------------------
*
* hamming.c
*
* Hamming Distance is a similarity metric
*
* Hamming distance between two strings of equal length is the number of
* positions for which the correspondings symbols are different, i.e, the
* number of substitutions required to change one into the other.
*
* X is a bit string
* Y is a bit string
*
* For each position i we compare X[i] with Y[i]; if it doesn't match we
* accumulate those mismatches.
*
* For example:
*
* x: 1010101010
* y: 1010111000
* ^ ^
* mismatches
*
* s = 2
*
*
* Copyright (c) 2008-2018, Euler Taveira de Oliveira
*
*----------------------------------------------------------------------------
*/
#include "similarity.h"
#include "utils/varbit.h"
/* GUC variables */
double pgs_hamming_threshold = 0.7;
bool pgs_hamming_is_normalized = true;
PG_FUNCTION_INFO_V1(hamming);
Datum
hamming(PG_FUNCTION_ARGS)
{
VarBit *a, *b;
int alen, blen;
bits8 *pa, *pb;
int maxlen;
float8 res = 0.0;
int i;
int n;
a = PG_GETARG_VARBIT_P(0);
b = PG_GETARG_VARBIT_P(1);
alen = VARBITLEN(a);
blen = VARBITLEN(b);
elog(DEBUG1, "alen: %d; blen: %d", alen, blen);
if (alen != blen)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("bit strings must have the same length")));
/* alen and blen have the same length */
maxlen = alen;
pa = VARBITS(a);
pb = VARBITS(b);
for (i = 0; i < VARBITBYTES(a); i++)
{
n = *pa++ ^ *pb++;
while (n)
{
res += n & 1;
n >>= 1;
}
}
elog(DEBUG1, "is normalized: %d", pgs_hamming_is_normalized);
elog(DEBUG1, "maximum length: %d", maxlen);
/*
* FIXME print string of bits
elog(DEBUG1, "hammingdistance(%s, %s) = %.3f", DatumGetCString(varbit_out(VarBitPGetDatum(a))), DatumGetCString(varbit_out(VarBitPGetDatum(b))), res);
*/
/* if one string has zero length then return one */
if (maxlen == 0)
{
PG_RETURN_FLOAT8(1.0);
}
else if (pgs_hamming_is_normalized)
{
res = 1.0 - (res / maxlen);
/*
* FIXME print string of bits
elog(DEBUG1, "hamming(%s, %s) = %.3f", DatumGetCString(varbit_out(VarBitPGetDatum(a))), DatumGetCString(varbit_out(VarBitPGetDatum(b))), res);
*/
PG_RETURN_FLOAT8(res);
}
else
{
PG_RETURN_FLOAT8(res);
}
}
PG_FUNCTION_INFO_V1(hamming_op);
Datum hamming_op(PG_FUNCTION_ARGS)
{
float8 res;
/*
* store *_is_normalized value temporarily 'cause
* threshold (we're comparing against) is normalized
*/
bool tmp = pgs_hamming_is_normalized;
pgs_hamming_is_normalized = true;
res = DatumGetFloat8(DirectFunctionCall2(
hamming,
PG_GETARG_DATUM(0),
PG_GETARG_DATUM(1)));
/* we're done; back to the previous value */
pgs_hamming_is_normalized = tmp;
PG_RETURN_BOOL(res >= pgs_hamming_threshold);
}
PG_FUNCTION_INFO_V1(hamming_text);
Datum
hamming_text(PG_FUNCTION_ARGS)
{
char *a, *b;
int alen, blen;
char *pa, *pb;
int maxlen;
float8 res = 0.0;
a = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0))));
b = DatumGetPointer(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1))));
alen = strlen(a);
blen = strlen(b);
if (alen > PGS_MAX_STR_LEN || blen > PGS_MAX_STR_LEN)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("argument exceeds the maximum length of %d bytes",
PGS_MAX_STR_LEN)));
elog(DEBUG1, "alen: %d; blen: %d", alen, blen);
if (alen != blen)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("text strings must have the same length")));
elog(DEBUG1, "a: %s ; b: %s", a, b);
/* alen and blen have the same length */
maxlen = alen;
pa = a;
pb = b;
while (*pa != '\0')
{
elog(DEBUG4, "a: %c ; b: %c", *pa, *pb);
if (*pa++ != *pb++)
res += 1.0;
}
elog(DEBUG1, "is normalized: %d", pgs_hamming_is_normalized);
elog(DEBUG1, "maximum length: %d", maxlen);
elog(DEBUG1, "hammingdistance(%s, %s) = %.3f", a, b, res);
/* if one string has zero length then return one */
if (maxlen == 0)
{
PG_RETURN_FLOAT8(1.0);
}
else if (pgs_hamming_is_normalized)
{
res = 1.0 - (res / maxlen);
elog(DEBUG1, "hamming(%s, %s) = %.3f", a, b, res);
PG_RETURN_FLOAT8(res);
}
else
{
PG_RETURN_FLOAT8(res);
}
}
PG_FUNCTION_INFO_V1(hamming_text_op);
Datum hamming_text_op(PG_FUNCTION_ARGS)
{
float8 res;
/*
* store *_is_normalized value temporarily 'cause
* threshold (we're comparing against) is normalized
*/
bool tmp = pgs_hamming_is_normalized;
pgs_hamming_is_normalized = true;
res = DatumGetFloat8(DirectFunctionCall2(
hamming_text,
PG_GETARG_DATUM(0),
PG_GETARG_DATUM(1)));
/* we're done; back to the previous value */
pgs_hamming_is_normalized = tmp;
PG_RETURN_BOOL(res >= pgs_hamming_threshold);
}
|