File: suggest.php

package info (click to toggle)
sphinxsearch 2.2.11-8
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, sid, trixie
  • size: 25,720 kB
  • sloc: cpp: 102,259; xml: 85,608; sh: 9,259; php: 3,790; ansic: 3,158; yacc: 1,969; java: 1,336; ruby: 1,289; python: 1,062; pascal: 912; perl: 381; lex: 275; makefile: 150; sql: 77; cs: 35
file content (151 lines) | stat: -rw-r--r-- 3,732 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
<?php

define ( "FREQ_THRESHOLD", 40 );
define ( "SUGGEST_DEBUG", 0 );
define ( "LENGTH_THRESHOLD", 2 );
define ( "LEVENSHTEIN_THRESHOLD", 2 );
define ( "TOP_COUNT", 10 );

// error_reporting ( E_ALL ^ E_NOTICE );
mb_internal_encoding ( "utf-8" );

require ( "../../api/sphinxapi.php" );

/// build a list of trigrams for a given keywords
function BuildTrigrams ( $keyword )
{
	$t = "__" . $keyword . "__";

	$trigrams = "";
	for ( $i=0; $i<mb_strlen($t)-2; $i++ )
		$trigrams .= mb_substr ( $t, $i, 3 ) . " ";

	return $trigrams;
}


/// create SQL dump of the dictionary from Sphinx stopwords file
/// expects open files as parameters
function BuildDictionarySQL ( $out, $in )
{
	fwrite ( $out, "DROP TABLE IF EXISTS suggest;

CREATE TABLE suggest (
	id			INTEGER PRIMARY KEY AUTO_INCREMENT NOT NULL,
	keyword		VARCHAR(255) NOT NULL,
	trigrams	VARCHAR(255) NOT NULL,
	freq		INTEGER NOT NULL,
	UNIQUE(keyword)
);

" );

	$n = 0;
	$m = 0;
	while ( $line = fgets ( $in, 1024 ) )
	{
		list ( $keyword, $freq ) = preg_split ( "/[\s,]+/", trim ( $line ) );

		if ( $freq<FREQ_THRESHOLD || strstr ( $keyword, "_" )!==false || strstr ( $keyword, "'" )!==false )
			continue;

		$trigrams = BuildTrigrams ( $keyword );

		if ( !$m )
			print "INSERT INTO suggest VALUES\n";
		else
			print ",\n";

		$n++;
		fwrite ( $out, "( 0, '$keyword', '$trigrams', $freq )" );

		$m++;
		if ( ( $m % 10000 )==0 )
		{
			print ";\n";
			$m = 0;
		}
	}

	if ( $m )
		fwrite ( $out, ";" );
}


/// search for suggestions
function MakeSuggestion ( $keyword )
{
	$trigrams = BuildTrigrams ( $keyword );
	$query = "\"$trigrams\"/1";
	$len = strlen($keyword);

	$delta = LENGTH_THRESHOLD;
	$cl = new SphinxClient ();
	$cl->SetMatchMode ( SPH_MATCH_EXTENDED2 );
	$cl->SetRankingMode ( SPH_RANK_WORDCOUNT );
	$cl->SetFilterRange ( "len", $len-$delta, $len+$delta );
	$cl->SetSelect ( "*, @weight+$delta-abs(len-$len) AS myrank" );
	$cl->SetSortMode ( SPH_SORT_EXTENDED, "myrank DESC, freq DESC" );
  	$cl->SetArrayResult ( true );

  	// pull top-N best trigram matches and run them through Levenshtein
	$cl->SetLimits ( 0, TOP_COUNT );
	$res = $cl->Query ( $query, "suggest" );

	if ( !$res || !$res["matches"] )
		return false;

	if ( SUGGEST_DEBUG )
	{
		print "--- DEBUG START ---\n";

		foreach ( $res["matches"] as $match )
		{
			$w = $match["attrs"]["keyword"];
			$myrank = @$match["attrs"]["myrank"];
			if ( $myrank )
				$myrank = ", myrank=$myrank";

			// FIXME? add costs?
			// FIXME! does not work with UTF-8.. THIS! IS!! PHP!!!
			$levdist = levenshtein ( $keyword, $w );

			print "id=$match[id], weight=$match[weight], freq={$match[attrs][freq]}{$myrank}, word=$w, levdist=$levdist\n";
		}

		print "--- DEBUG END ---\n";
	}

	// further restrict trigram matches with a sane Levenshtein distance limit
	foreach ( $res["matches"] as $match )
	{
		$suggested = $match["attrs"]["keyword"];
		if ( levenshtein ( $keyword, $suggested )<=LEVENSHTEIN_THRESHOLD )
			return $suggested;
	}
	return $keyword;
}

/// main
if ( $_SERVER["argc"]<2 )
{
	die ( "usage:\n"
		. "php suggest.php --builddict\treads stopwords from stdin, prints SQL dump of the dictionary to stdout\n"
		. "php suggest.php --query WORD\tqueries Sphinx, prints suggestion\n" );
}

if ( $_SERVER["argv"][1]=="--builddict" )
{
	$in = fopen ( "php://stdin", "r" );
	$out = fopen ( "php://stdout", "w+" );
	BuildDictionarySQL ( $out, $in );
}

if ( $_SERVER["argv"][1]=="--query" )
{
	mysql_connect ( "localhost", "root", "" ) or die ( "mysql_connect() failed: ".mysql_error() );
	mysql_select_db ( "test" ) or die ( "mysql_select_db() failed: ".mysql_error() );

	$keyword = $_SERVER["argv"][2];
	printf ( "keyword: %s\nsuggestion: %s\n", $keyword, MakeSuggestion($keyword) );
}