File: Words.pm

package info (click to toggle)
faqomatic 2.721-9.1
links: PTS
area: main
in suites: etch, etch-m68k
size: 1,996 kB
ctags: 548
sloc: perl: 13,356; sh: 69; makefile: 47
file content (239 lines) | stat: -rw-r--r-- 7,397 bytes
parent folder | download | duplicates (2)
##############################################################################
# The Faq-O-Matic is Copyright 1997 by Jon Howell, all rights reserved.      #
#                                                                            #
# This program is free software; you can redistribute it and/or              #
# modify it under the terms of the GNU General Public License                #
# as published by the Free Software Foundation; either version 2             #
# of the License, or (at your option) any later version.                     #
#                                                                            #
# This program is distributed in the hope that it will be useful,            #
# but WITHOUT ANY WARRANTY; without even the implied warranty of             #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
# GNU General Public License for more details.                               #
#                                                                            #
# You should have received a copy of the GNU General Public License          #
# along with this program; if not, write to the Free Software                #
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.#
#                                                                            #
# Jon Howell can be contacted at:                                            #
# 6211 Sudikoff Lab, Dartmouth College                                       #
# Hanover, NH  03755-3510                                                    #
# jonh@cs.dartmouth.edu                                                      #
#                                                                            #
# An electronic copy of the GPL is available at:                             #
# http://www.gnu.org/copyleft/gpl.html                                       #
#                                                                            #
##############################################################################

use strict;
use locale;

### Words.pm
###
### Support for extracting "words" from strings
###
### To change these routines to support other character sets,
### copy this file to a location outside of the FAQ::OMatic tree and
### add the following lines to the start of your cgi-bin/fom file:
###	use lib '/Whatever/your/directory/path/is';
###	require Words;
###	#existing use lib line
###	use FAQ::OMatic::Words
### This will override the definitions in this file.


package FAQ::OMatic::Words;

BEGIN {
#   This code use Japanese environment only.
#   see http://chasen.aist-nara.ac.jp/index.html.en
#
    if (FAQ::OMatic::I18N::language() eq 'ja_JP.EUC') {
        require Text::ChaSen;  import Text::ChaSen;
        &Text::ChaSen::getopt_argv('faq-omatic', '-j', '-F', '%m ');
    }
}

sub cannonical {
    my $string = shift;

    # convert the input string into cannonical form.
    #
    # The default is to strip parenthesis and apostrophies, and
    # convert to ASCII lower case.
    #
    # If you use another character set (e.g. ISO-8859-?), you'll want
    # to override to do correct lower case handling.
    #
    # This routine is called both when the indicies are created and
    # when the search pattern is formed, so things will be done
    # consistantly.

    # convert
    #	timer(s) to timers
    #   timer's to timers
    #   e-mail  to email
    $string =~ s/[()'-]//g;
    $string  = lc($string);		# convert to lower case

    # Accentuated lc(),
    if (FAQ::OMatic::I18N::language() eq 'hu') {
        $string =~ tr/\301\311\315\323\326\325\332\334\333/\341\351\355\363\366\365\372\374\373/;
    }

    if (FAQ::OMatic::I18N::language() eq 'pl') {
        $string =~ tr/\241\306\312\243\321\323\246\257\254/\261\346\352\263\361\363\266\277\274/;
    }
    $string;
}

sub getWords {
    my $string = shift;
    my $encode_lang = FAQ::OMatic::I18N::language();
#EUC-JP case
    return getWordsEUCJP($string) if($encode_lang eq "ja_JP.EUC");
# Hungarian case
    return getWordshu($string) if($encode_lang eq 'hu');
# Polish case
    return getWordspl($string) if($encode_lang eq 'pl');
#normal case
    return getWordsSB($string);
}

sub getWordsSB {
	my $string = shift;

	# given a user-input string, we break it into "legal" words
	# and return an array of them

	$string = cannonical( $string );

	my $wordPattern = '[\w-]';	# alphanumeric + '_' + '-'

	#my @words = ($string =~ m/($wordPattern+)/gso);
	# /gso seems to break in some circumstances. :v(
	my @wordspl = split(/($wordPattern+)/, $string);
	my @words=();
	my $i;
	for ($i=1; $i<@wordspl; $i+=2) {
		push (@words, $wordspl[$i]);
	}
	return @words;

}

sub getWordsEUCJP {
    require Text::ChaSen; import Text::ChaSen;
    require NKF; import NKF;

	my $string = shift;

	# given a user-input string, we break it into "legal" words
	# and return an array of them

	$string = nkf('-e', $string);
	$string = cannonical( $string );

	my $wordPattern = '[\w-]';	# alphanumeric + '_' + '-'

	my $s = &Text::ChaSen::sparse_tostr($string);
	chomp $s;
	my @words = split / /, $s;
	return @words;

}

sub getWordshu {
	my $string = shift;

	# given a user-input string, we break it into "legal" words
	# and return an array of them

	$string = cannonical( $string );

	# pattern for hungarian language:
	my $wordPattern = '[\w\341\351\355\363\366\365\372\374\373-]';	

	#my @words = ($string =~ m/($wordPattern+)/gso);
	# /gso seems to break in some circumstances. :v(
	my @wordspl = split(/($wordPattern+)/, $string);
	my @words=();
	my $i;
	for ($i=1; $i<@wordspl; $i+=2) {
		push (@words, $wordspl[$i]);
	}
	return @words;
}

sub getWordspl {
        my $string = shift;

        # given a user-input string, we break it into "legal" words
        # and return an array of them

        $string = cannonical( $string );

        # pattern for polish language:
        my $wordPattern = '[\w\261\346\352\263\361\363\266\277\274-]';

        #my @words = ($string =~ m/($wordPattern+)/gso);
        # /gso seems to break in some circumstances. :v(
        my @wordspl = split(/($wordPattern+)/, $string);
        my @words=();
        my $i;
        for ($i=1; $i<@wordspl; $i+=2) {
                push (@words, $wordspl[$i]);
        }
        return @words;
}

sub getPrefixes {
    my $word = shift;
    my $encode_lang = FAQ::OMatic::I18N::language();
#EUC-JP case
    return getPrefixesEUCJP($word) if($encode_lang eq "ja_JP.EUC");
#normal case
    return getPrefixesSB($word);
}

sub getPrefixesSB {
    my $word = shift;

    # given a word, return an array of prefixes which should be
    # indexed.
    #
    # default routine returns all substrings
    my @prefix=();
    my $i = length( $word );
    while( $i ) {
        push @prefix, substr( $word, 0, $i-- );
    }

    @prefix;
}

## Japanese EUC-JP multibyte extended getPrefixes by oota ##
sub getPrefixesEUCJP {
    my $word = shift;

    # given a word, return an array of prefixes which should be
    # indexed.
    #
    # default routine returns all substrings
    my @prefix=();
    my $i = 1;
    while( $i <= length( $word )) {	
       if(ord(substr($word,$i-1,1)) >= 128) {
           push @prefix, substr( $word, 0, $i+1 );
           $i += 2;
       } else {
           push @prefix, substr( $word, 0, $i );
           $i += 1;
       }
    }

    reverse @prefix;
}

'true';