Stemmer.cpp
上传用户:sanxfzhen
上传日期:2014-12-28
资源大小:2324k
文件大小:10k
- // Stemmer.cpp: implementation of the CStemmer class.
- //
- //////////////////////////////////////////////////////////////////////
- #include "stdafx.h"
- #include "Stemmer.h"
- #ifdef _DEBUG
- #undef THIS_FILE
- static char THIS_FILE[]=__FILE__;
- #define new DEBUG_NEW
- #endif
- CStemmer theStemmer;
- //////////////////////////////////////////////////////////////////////
- // Construction/Destruction
- //////////////////////////////////////////////////////////////////////
- CStemmer::CStemmer()
- {
- }
- CStemmer::~CStemmer()
- {
- }
/* The main part of the stemming algorithm starts here. b is a buffer
   holding the word to be stemmed. The letters are in b[k0], b[k0+1], ...,
   ending at b[k]. In fact k0 = 0 in this demo program. k is adjusted
   downwards as the stemming progresses. Zero termination is not actually
   used by the algorithm.

   Note that only lower-case sequences are stemmed. Forcing to lower case
   should be done before stem(...) is called.
*/
- /* cons(i) is TRUE <=> b[i] is a consonant. */
- int CStemmer::cons(int i)
- { switch (b[i])
- { case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE;
- case 'y': return (i==k0) ? TRUE : !cons(i-1);
- default: return TRUE;
- }
- }
- /* m() measures the number of consonant sequences between k0 and j. if c is
- a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
- presence,
- <c><v> gives 0
- <c>vc<v> gives 1
- <c>vcvc<v> gives 2
- <c>vcvcvc<v> gives 3
- ....
- */
- int CStemmer::m()
- { int n = 0;
- int i = k0;
- while(TRUE)
- { if (i > j) return n;
- if (! cons(i)) break; i++;
- }
- i++;
- while(TRUE)
- { while(TRUE)
- { if (i > j) return n;
- if (cons(i)) break;
- i++;
- }
- i++;
- n++;
- while(TRUE)
- { if (i > j) return n;
- if (! cons(i)) break;
- i++;
- }
- i++;
- }
- }
- /* vowelinstem() is TRUE <=> k0,...j contains a vowel */
- int CStemmer::vowelinstem()
- { int i; for (i = k0; i <= j; i++) if (! cons(i)) return TRUE;
- return FALSE;
- }
- /* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */
- int CStemmer::doublec(int j)
- { if (j < k0+1) return FALSE;
- if (b[j] != b[j-1]) return FALSE;
- return cons(j);
- }
- /* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
- and also if the second c is not w,x or y. this is used when trying to
- restore an e at the end of a short word. e.g.
- cav(e), lov(e), hop(e), crim(e), but
- snow, box, tray.
- */
- int CStemmer::cvc(int i)
- { if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) return FALSE;
- { int ch = b[i];
- if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE;
- }
- return TRUE;
- }
- /* ends(s) is TRUE <=> k0,...k ends with the string s. */
- int CStemmer::ends(char * s)
- { int length = s[0];
- if (s[length] != b[k]) return FALSE; /* tiny speed-up */
- if (length > k-k0+1) return FALSE;
- if (memcmp(b+k-length+1,s+1,length) != 0) return FALSE;
- j = k-length;
- return TRUE;
- }
- /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
- k. */
- void CStemmer::setto(char * s)
- { int length = s[0];
- memmove(b+j+1,s+1,length);
- k = j+length;
- }
- /* r(s) is used further down. */
- void CStemmer::r(char * s) { if (m() > 0) setto(s); }
- /* step1ab() gets rid of plurals and -ed or -ing. e.g.
- caresses -> caress
- ponies -> poni
- ties -> ti
- caress -> caress
- cats -> cat
- feed -> feed
- agreed -> agree
- disabled -> disable
- matting -> mat
- mating -> mate
- meeting -> meet
- milling -> mill
- messing -> mess
- meetings -> meet
- */
- void CStemmer::step1ab()
- { if (b[k] == 's')
- { if (ends("