1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
|
-- state_extract(addressStringLessZipCode)
-- Extracts the state from end of the given string.
--
-- This function uses the state_lookup table to determine which state
-- the input string is indicating. First, an exact match is pursued,
-- and in the event of failure, a word-by-word fuzzy match is attempted.
--
-- The result is the state as given in the input string, and the approved
-- state abbreviation, seperated by a colon.
CREATE OR REPLACE FUNCTION state_extract(rawInput VARCHAR) RETURNS VARCHAR
AS $_$
DECLARE
tempInt INTEGER;
tempString VARCHAR;
state VARCHAR;
stateAbbrev VARCHAR;
result VARCHAR;
rec RECORD;
test BOOLEAN;
ws VARCHAR;
var_verbose boolean := false;
BEGIN
ws := E'[ ,.\t\n\f\r]';
-- If there is a trailing space or , get rid of it
-- this is to handle case where people use , instead of space to separate state and zip
-- such as '2450 N COLORADO ST, PHILADELPHIA, PA, 19132' instead of '2450 N COLORADO ST, PHILADELPHIA, PA 19132'
--tempString := regexp_replace(rawInput, E'(.*)' || ws || '+', E'\\1');
tempString := btrim(rawInput, ', ');
-- Separate out the last word of the state, and use it to compare to
-- the state lookup table to determine the entire name, as well as the
-- abbreviation associated with it. The zip code may or may not have
-- been found.
tempString := substring(tempString from ws || E'+([^ ,.\t\n\f\r0-9]*?)$');
IF var_verbose THEN RAISE NOTICE 'state_extract rawInput: % tempString: %', rawInput, tempString; END IF;
SELECT INTO tempInt count(*) FROM (select distinct abbrev from state_lookup
WHERE upper(abbrev) = upper(tempString)) as blah;
IF tempInt = 1 THEN
state := tempString;
SELECT INTO stateAbbrev abbrev FROM (select distinct abbrev from
state_lookup WHERE upper(abbrev) = upper(tempString)) as blah;
ELSE
SELECT INTO tempInt count(*) FROM state_lookup WHERE upper(name)
like upper('%' || tempString);
IF tempInt >= 1 THEN
FOR rec IN SELECT name from state_lookup WHERE upper(name)
like upper('%' || tempString) LOOP
SELECT INTO test texticregexeq(rawInput, name) FROM state_lookup
WHERE rec.name = name;
IF test THEN
SELECT INTO stateAbbrev abbrev FROM state_lookup
WHERE rec.name = name;
state := substring(rawInput, '(?i)' || rec.name);
EXIT;
END IF;
END LOOP;
ELSE
-- No direct match for state, so perform fuzzy match.
SELECT INTO tempInt count(*) FROM state_lookup
WHERE soundex(tempString) = end_soundex(name);
IF tempInt >= 1 THEN
FOR rec IN SELECT name, abbrev FROM state_lookup
WHERE soundex(tempString) = end_soundex(name) LOOP
tempInt := count_words(rec.name);
tempString := get_last_words(rawInput, tempInt);
test := TRUE;
FOR i IN 1..tempInt LOOP
IF soundex(split_part(tempString, ' ', i)) !=
soundex(split_part(rec.name, ' ', i)) THEN
test := FALSE;
END IF;
END LOOP;
IF test THEN
state := tempString;
stateAbbrev := rec.abbrev;
EXIT;
END IF;
END LOOP;
END IF;
END IF;
END IF;
IF state IS NOT NULL AND stateAbbrev IS NOT NULL THEN
result := state || ':' || stateAbbrev;
END IF;
RETURN result;
END;
$_$ LANGUAGE plpgsql STABLE;
|