File: porter_stemming.rb

package info (click to toggle)
ruby-text 1.3.0-1
links: PTS, VCS
area: main
in suites: buster, jessie, jessie-kfreebsd, stretch
size: 568 kB
ctags: 102
sloc: ruby: 1,134; makefile: 2
file content (171 lines) | stat: -rw-r--r-- 4,524 bytes
parent folder | download | duplicates (4)
#
# This is the Porter Stemming algorithm, ported to Ruby from the
# version coded up in Perl.  It is easy to follow against the rules
# given in the original paper:
#
#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#   no. 3, pp 130-137.
#
# Taken from http://www.tartarus.org/~martin/PorterStemmer (Public Domain)
#
module Text # :nodoc:
#
# Porter suffix-stripping stemmer.
#
# The single entry point is PorterStemming.stem, which applies the steps
# (1a, 1b, 1c, 2, 3, 4, 5) one after another to a copy of the given word.
# The constants below hold the suffix tables and the "measure" patterns
# those steps rely on.
#
module PorterStemming

  # Step 2: suffix => replacement, applied when the remaining stem has m>0.
  STEP_2_LIST = {
    'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
    'izer' => 'ize', 'bli' => 'ble',
    'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
    'ization' => 'ize', 'ation' => 'ate',
    'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
    'ousness' => 'ous', 'aliti' => 'al',
    'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
  }.freeze

  # Step 3: suffix => replacement, applied when the remaining stem has m>0.
  STEP_3_LIST = {
    'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
    'ical' => 'ic', 'ful' => '', 'ness' => ''
  }.freeze

  # Matches any step 2 suffix (the keys of STEP_2_LIST) at the end of a word.
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Matches any step 4 suffix at the end of a word; these are simply removed
  # when the remaining stem has m>1.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  # Matches any step 3 suffix (the keys of STEP_3_LIST) at the end of a word.
  STEP_3_REGEXP = /(icate|ative|alize|iciti|ical|ful|ness)$/

  C = "[^aeiou]".freeze             # consonant
  V = "[aeiouy]".freeze             # vowel
  CC = "#{C}(?>[^aeiouy]*)".freeze  # consonant sequence
  VV = "#{V}(?>[aeiou]*)".freeze    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM   = /^(#{CC})?#{V}/           # vowel in stem

  # Whole stem is: consonant sequence, one vowel, one consonant other than
  # w, x or y.  Used by step 1b (to restore a final "e") and by step 5 (to
  # keep a final "e").
  SHORT_CVC = /^#{CC}#{V}[^aeiouwxy]$/

  # Returns the Porter stem of +word+ as a new String.
  #
  # +word+ is duplicated and converted with +to_str+, so the object passed
  # in is never modified.  Words shorter than three characters are returned
  # unchanged.  The patterns only know lower-case letters; apart from the
  # temporary y/Y marker described below, letter case is left as given.
  def self.stem(word)

    # make a copy of the given object and convert it to a string.
    word = word.dup.to_str

    return word if word.length < 3

    # Map an initial y to Y so that the patterns never treat it as a vowel.
    # Remember whether we did: only then is the Y turned back at the end, so
    # a capital Y supplied by the caller is not silently lower-cased.
    initial_y = word.start_with?('y')
    word[0] = 'Y' if initial_y

    # Step 1a: sses -> ss, ies -> i, and drop a final s not preceded by s.
    m = /(ss|i)es$/.match(word) || /([^s])s$/.match(word)
    word = m.pre_match + m[1] if m

    # Step 1b: eed -> ee (m>0); otherwise strip ed/ing when the stem has a
    # vowel, then tidy up the stem that is left.
    if (m = /eed$/.match(word))
      word.chop! if m.pre_match =~ MGR0
    elsif (m = /(ed|ing)$/.match(word))
      stem = m.pre_match
      if stem =~ VOWEL_IN_STEM
        word = stem
        case word
          when /(at|bl|iz)$/       then word << "e"   # at -> ate, bl -> ble, iz -> ize
          when /([^aeiouylsz])\1$/ then word.chop!    # undouble a final consonant
          when SHORT_CVC           then word << "e"
        end
      end
    end

    # Step 1c: final y -> i when the stem contains a vowel.
    if (m = /y$/.match(word))
      stem = m.pre_match
      word = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if (m = SUFFIX_1_REGEXP.match(word))
      stem = m.pre_match
      word = stem + STEP_2_LIST[m[1]] if stem =~ MGR0
    end

    # Step 3
    if (m = STEP_3_REGEXP.match(word))
      stem = m.pre_match
      word = stem + STEP_3_LIST[m[1]] if stem =~ MGR0
    end

    # Step 4: the (s|t)ion rule is only tried when no SUFFIX_2_REGEXP suffix
    # matched at all, and it keeps the s or t as part of the stem.
    if (m = SUFFIX_2_REGEXP.match(word))
      stem = m.pre_match
      word = stem if stem =~ MGR1
    elsif (m = /(s|t)(ion)$/.match(word))
      stem = m.pre_match + m[1]
      word = stem if stem =~ MGR1
    end

    # Step 5: drop a final e (m>1, or m=1 and the stem is not a short CVC),
    # then reduce a final ll to l when m>1.
    if (m = /e$/.match(word))
      stem = m.pre_match
      word = stem if stem =~ MGR1 || (stem =~ MEQ1 && stem !~ SHORT_CVC)
    end

    word.chop! if word =~ /ll$/ && word =~ MGR1

    # and turn initial Y back to y, but only if it was put there above
    word[0] = 'y' if initial_y && word.start_with?('Y')

    word
  end

end
end