| Module | Stemmable |
| In: |
lib/stemmer.rb
|
The Porter stemmer, from http://www.tartarus.org/martin/PorterStemmer/ruby.txt. License for this file (stemmer.rb ONLY), from http://www.tartarus.org/martin/PorterStemmer/: "All these encodings of the algorithm can be used free of charge for any purpose."
#! /local/ruby/bin/ruby
$Id: stemmable.rb,v 1.2 2003/02/01 02:07:30 condit Exp $
See example usage at the end of this file.
| STEP_2_LIST | = | { 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble', 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate', 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al', 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log' } |
| STEP_3_LIST | = | { 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'' } |
| SUFFIX_1_REGEXP | = | /( ational | tional | enci | anci | izer | bli | alli | entli | eli | ousli | ization | ation | ator | alism | iveness | fulness | ousness | aliti | iviti | biliti | logi)$/x |
| SUFFIX_2_REGEXP | = | /( al | ance | ence | er | ic | able | ible | ant | ement | ment | ent | ou | ism | ate | iti | ous | ive | ize)$/x |
| C | = | "[^aeiou]" |
| V | = | "[aeiouy]" |
| CC | = | "#{C}(?>[^aeiouy]*)" |
| VV | = | "#{V}(?>[aeiou]*)" |
| MGR0 | = | /^(#{CC})?#{VV}#{CC}/o |
| MEQ1 | = | /^(#{CC})?#{VV}#{CC}(#{VV})?$/o |
| MGR1 | = | /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o |
| VOWEL_IN_STEM | = | /^(#{CC})?#{V}/o |
Porter stemmer in Ruby.
This is the Porter stemming algorithm, ported to Ruby from the version coded up in Perl. It’s easy to follow against the rules in the original paper in:
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, no. 3, pp 130-137,
See also www.tartarus.org/~martin/PorterStemmer
Send comments to raypereda@hotmail.com
# File lib/stemmer.rb, line 102
# Returns the Porter stem of the receiver as a new string.
#
# The receiver itself is never modified; all work happens on a copy.
# Words shorter than three characters are returned (as a copy) unchanged.
#
# Relies on the constants STEP_2_LIST, STEP_3_LIST, SUFFIX_1_REGEXP,
# SUFFIX_2_REGEXP, CC, V, MGR0, MEQ1, MGR1 and VOWEL_IN_STEM.
#
# Returns the stemmed word.
def stem_porter
  # Work on a copy converted to a string so the receiver stays untouched.
  w = dup.to_str

  return w if w.length < 3

  # Map an initial lowercase y to the sentinel Y so the vowel patterns
  # (which only list lowercase y) never treat it as a vowel. Remember that
  # we did so: a word that already started with a capital Y must not be
  # lowercased when the sentinel is reverted at the end.
  initial_y_masked = (w[0, 1] == 'y')
  w[0] = 'Y' if initial_y_masked

  # Step 1a: plurals (sses -> ss, ies -> i, s -> "" unless preceded by s).
  if (m = /(ss|i)es$/.match(w))
    w = m.pre_match + m[1]
  elsif (m = /([^s])s$/.match(w))
    w = m.pre_match + m[1]
  end

  # Step 1b: eed / ed / ing.
  if (m = /eed$/.match(w))
    # eed -> ee only when the part before the suffix matches MGR0.
    w.chop! if m.pre_match =~ MGR0
  elsif (m = /(ed|ing)$/.match(w))
    stem = m.pre_match
    if stem =~ VOWEL_IN_STEM
      w = stem
      # Tidy up the stem left behind after removing ed/ing.
      case w
      when /(at|bl|iz)$/ then w << 'e'
      when /([^aeiouylsz])\1$/ then w.chop!
      when /^#{CC}#{V}[^aeiouwxy]$/o then w << 'e'
      end
    end
  end

  # Step 1c: terminal y -> i when the stem matches VOWEL_IN_STEM.
  if (m = /y$/.match(w))
    stem = m.pre_match
    w = stem + 'i' if stem =~ VOWEL_IN_STEM
  end

  # Step 2: replace the suffixes listed in STEP_2_LIST when the stem matches MGR0.
  if (m = SUFFIX_1_REGEXP.match(w))
    stem = m.pre_match
    w = stem + STEP_2_LIST[m[1]] if stem =~ MGR0
  end

  # Step 3: replace the suffixes listed in STEP_3_LIST when the stem matches MGR0.
  if (m = /(icate|ative|alize|iciti|ical|ful|ness)$/.match(w))
    stem = m.pre_match
    w = stem + STEP_3_LIST[m[1]] if stem =~ MGR0
  end

  # Step 4: drop the remaining suffixes when the stem matches MGR1.
  # "ion" is only dropped after s or t, and that letter stays in the stem.
  if (m = SUFFIX_2_REGEXP.match(w))
    stem = m.pre_match
    w = stem if stem =~ MGR1
  elsif (m = /(s|t)(ion)$/.match(w))
    stem = m.pre_match + m[1]
    w = stem if stem =~ MGR1
  end

  # Step 5: drop a final e when the stem matches MGR1, or matches MEQ1
  # without having the consonant-vowel-consonant shape tested below.
  if (m = /e$/.match(w))
    stem = m.pre_match
    if (stem =~ MGR1) ||
       (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
      w = stem
    end
  end

  # Final ll -> l when the word matches MGR1.
  w.chop! if w =~ /ll$/ && w =~ MGR1

  # Revert the sentinel, but only if this method introduced it.
  w[0] = 'y' if initial_y_masked && w[0, 1] == 'Y'

  w
end