# Natural Language Toolkit: Stemmers # # Copyright (C) 2001-2018 NLTK Project # Author: Steven Tomcavage # URL: # For license information, see LICENSE.TXT """ A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm. Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61. """ from __future__ import unicode_literals import re from nltk.stem.api import StemmerI from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class LancasterStemmer(StemmerI): """ Lancaster Stemmer >>> from nltk.stem.lancaster import LancasterStemmer >>> st = LancasterStemmer() >>> st.stem('maximum') # Remove "-um" when word is intact 'maxim' >>> st.stem('presumably') # Don't remove "-um" when word is not intact 'presum' >>> st.stem('multiply') # No action taken if word ends with "-ply" 'multiply' >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules 'provid' >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters 'ow' >>> st.stem('ear') # ditto 'ear' >>> st.stem('saying') # Words starting with consonant must contain at least 3 'say' >>> st.stem('crying') # letters and one of those letters must be a vowel 'cry' >>> st.stem('string') # ditto 'string' >>> st.stem('meant') # ditto 'meant' >>> st.stem('cement') # ditto 'cem' >>> st_pre = LancasterStemmer(strip_prefix_flag=True) >>> st_pre.stem('kilometer') # Test Prefix 'met' >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t.")) >>> st_custom.stem("ness") # Change s to t 'nest' """ # The rule list is static since it doesn't change between instances default_rule_tuple = ( "ai*2.", # -ia > - if intact "a*1.", # -a > - if intact "bb1.", # -bb > -b "city3s.", # -ytic > -ys "ci2>", # -ic > - "cn1t>", # -nc > -nt "dd1.", # -dd > -d "dei3y>", # -ied > -y "deec2ss.", # -ceed >", -cess "dee1.", # -eed > -ee "de2>", # -ed > - "dooh4>", # -hood > - "e1>", # -e > - "feil1v.", # -lief > -liev "fi2>", # -if > - "gni3>", # -ing > - "gai3y.", # -iag > -y "ga2>", # -ag > - "gg1.", # -gg > -g "ht*2.", # -th > - if intact "hsiug5ct.", # -guish > -ct "hsi3>", # -ish > - "i*1.", # -i > - if intact "i1y>", # -i > -y "ji1d.", # -ij > -id -- see nois4j> & vis3j> "juf1s.", # -fuj > -fus "ju1d.", # -uj > -ud "jo1d.", # -oj > -od "jeh1r.", # -hej > -her "jrev1t.", # -verj > -vert "jsim2t.", # -misj > -mit "jn1d.", # -nj > -nd "j1s.", # -j > -s "lbaifi6.", # -ifiabl > - "lbai4y.", # -iabl > -y "lba3>", # -abl > - "lbi3.", # -ibl > - "lib2l>", # -bil > -bl "lc1.", # -cl > c "lufi4y.", # -iful > -y "luf3>", # -ful > - "lu2.", # -ul > - "lai3>", # -ial > - "lau3>", # -ual > - "la2>", # -al > - "ll1.", # -ll > -l "mui3.", # -ium > - "mu*2.", # -um > - if intact "msi3>", # -ism > - "mm1.", # -mm > -m "nois4j>", # -sion > -j "noix4ct.", # -xion > -ct "noi3>", # -ion > - "nai3>", # -ian > - "na2>", # -an > - "nee0.", # protect -een "ne2>", # -en > - "nn1.", # -nn > -n "pihs4>", # -ship > - "pp1.", # -pp > -p "re2>", # -er > - "rae0.", # protect -ear "ra2.", # -ar > - "ro2>", # -or > - "ru2>", # -ur > - "rr1.", # -rr > -r "rt1>", # -tr > -t "rei3y>", # -ier > -y "sei3y>", # -ies > -y "sis2.", # -sis > -s "si2>", # -is > - "ssen4>", # -ness > - "ss0.", # protect -ss "suo3>", # -ous > - "su*2.", # -us > - if intact "s*1>", # -s > - if intact "s0.", # -s > -s "tacilp4y.", # -plicat > -ply "ta2>", # -at > - "tnem4>", # -ment > - "tne3>", # -ent > - "tna3>", # -ant > - "tpir2b.", # -ript > -rib "tpro2b.", # -orpt > -orb "tcud1.", # -duct > -duc "tpmus2.", # -sumpt > -sum "tpec2iv.", # -cept > -ceiv "tulo2v.", # -olut > -olv "tsis0.", # protect -sist "tsi3>", # -ist > - "tt1.", # -tt > -t "uqi3.", # -iqu > - "ugo1.", # -ogu > -og "vis3j>", # -siv > -j "vie0.", # protect -eiv "vi2>", # -iv > - "ylb1>", # -bly > -bl "yli3y>", # -ily > -y "ylp0.", # protect -ply "yl2>", # -ly > - "ygo1.", # -ogy > -og "yhp1.", # -phy > -ph "ymo1.", # -omy > -om "ypo1.", # -opy > -op "yti3>", # -ity > - "yte3>", # -ety > - "ytl2.", # -lty > -l "yrtsi5.", # -istry > - "yra3>", # -ary > - "yro3>", # -ory > - "yfi3.", # -ify > - "ycn2t>", # -ncy > -nt "yca3>", # -acy > - "zi2>", # -iz > - "zy1s." # -yz > -ys ) def __init__(self, rule_tuple=None, strip_prefix_flag=False): """Create an instance of the Lancaster stemmer. """ # Setup an empty rule dictionary - this will be filled in later self.rule_dictionary = {} # Check if a user wants to strip prefix self._strip_prefix = strip_prefix_flag # Check if a user wants to use his/her own rule tuples. self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple def parseRules(self, rule_tuple=None): """Validate the set of rules used in this stemmer. If this function is called as an individual method, without using stem method, rule_tuple argument will be compiled into self.rule_dictionary. If this function is called within stem, self._rule_tuple will be used. """ # If there is no argument for the function, use class' own rule tuple. rule_tuple = rule_tuple if rule_tuple else self._rule_tuple valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$") # Empty any old rules from the rule set before adding new ones self.rule_dictionary = {} for rule in rule_tuple: if not valid_rule.match(rule): raise ValueError("The rule {0} is invalid".format(rule)) first_letter = rule[0:1] if first_letter in self.rule_dictionary: self.rule_dictionary[first_letter].append(rule) else: self.rule_dictionary[first_letter] = [rule] def stem(self, word): """Stem a word using the Lancaster stemmer. """ # Lower-case the word, since all the rules are lower-cased word = word.lower() word = self.__stripPrefix(word) if self._strip_prefix else word # Save a copy of the original word intact_word = word # If rule dictionary is empty, parse rule tuple. if not self.rule_dictionary: self.parseRules() return self.__doStemming(word, intact_word) def __doStemming(self, word, intact_word): """Perform the actual word stemming """ valid_rule = re.compile("^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$") proceed = True while proceed: # Find the position of the last letter of the word to be stemmed last_letter_position = self.__getLastLetter(word) # Only stem the word if it has a last letter and a rule matching that last letter if last_letter_position < 0 or word[last_letter_position] not in self.rule_dictionary: proceed = False else: rule_was_applied = False # Go through each rule that matches the word's final letter for rule in self.rule_dictionary[word[last_letter_position]]: rule_match = valid_rule.match(rule) if rule_match: (ending_string, intact_flag, remove_total, append_string, cont_flag) = rule_match.groups() # Convert the number of chars to remove when stemming # from a string to an integer remove_total = int(remove_total) # Proceed if word's ending matches rule's word ending if word.endswith(ending_string[::-1]): if intact_flag: if (word == intact_word and self.__isAcceptable(word, remove_total)): word = self.__applyRule(word, remove_total, append_string) rule_was_applied = True if cont_flag == '.': proceed = False break elif self.__isAcceptable(word, remove_total): word = self.__applyRule(word, remove_total, append_string) rule_was_applied = True if cont_flag == '.': proceed = False break # If no rules apply, the word doesn't need any more stemming if rule_was_applied == False: proceed = False return word def __getLastLetter(self, word): """Get the zero-based index of the last alphabetic character in this string """ last_letter = -1 for position in range(len(word)): if word[position].isalpha(): last_letter = position else: break return last_letter def __isAcceptable(self, word, remove_total): """Determine if the word is acceptable for stemming. """ word_is_acceptable = False # If the word starts with a vowel, it must be at least 2 # characters long to be stemmed if word[0] in "aeiouy": if (len(word) - remove_total >= 2): word_is_acceptable = True # If the word starts with a consonant, it must be at least 3 # characters long (including one vowel) to be stemmed elif (len(word) - remove_total >= 3): if word[1] in "aeiouy": word_is_acceptable = True elif word[2] in "aeiouy": word_is_acceptable = True return word_is_acceptable def __applyRule(self, word, remove_total, append_string): """Apply the stemming rule to the word """ # Remove letters from the end of the word new_word_length = len(word) - remove_total word = word[0:new_word_length] # And add new letters to the end of the truncated word if append_string: word += append_string return word def __stripPrefix(self, word): """Remove prefix from a word. This function originally taken from Whoosh. """ for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"): if word.startswith(prefix): return word[len(prefix):] return word def __repr__(self): return ''