StandardTokenizer.jj
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

options {
  STATIC = false;
  //IGNORE_CASE = true;
  //BUILD_PARSER = false;
  UNICODE_INPUT = true;
  USER_CHAR_STREAM = true;
  OPTIMIZE_TOKEN_MANAGER = true;
  //DEBUG_TOKEN_MANAGER = true;
}
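
/*
 * Build note (a sketch, not part of the original file): running JavaCC
 * over this grammar, e.g.
 *
 *   javacc StandardTokenizer.jj
 *
 * emits StandardTokenizer.java, StandardTokenizerTokenManager.java and
 * StandardTokenizerConstants.java. Because USER_CHAR_STREAM is set,
 * JavaCC generates only the CharStream interface; Lucene supplies the
 * hand-written FastCharStream as its implementation.
 */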
PARSER_BEGIN(StandardTokenizer)

package org.apache.lucene.analysis.standard;

import java.io.*;

/** A grammar-based tokenizer constructed with JavaCC.
 *
 * <p> This should be a good tokenizer for most European-language documents:
 *
 * <ul>
 *   <li>Splits words at punctuation characters, removing punctuation. However, a
 *     dot that's not followed by whitespace is considered part of a token.
 *   <li>Splits words at hyphens, unless there's a number in the token, in which case
 *     the whole token is interpreted as a product number and is not split.
 *   <li>Recognizes email addresses and internet hostnames as one token.
 * </ul>
 *
 * <p>Many applications have specific tokenizer needs.  If this tokenizer does
 * not suit your application, please consider copying this source code
 * directory to your project and maintaining your own grammar-based tokenizer.
 * (A usage sketch follows PARSER_END below.)
 */
public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {

  /** Constructs a tokenizer for this Reader. */
  public StandardTokenizer(Reader reader) {
    this(new FastCharStream(reader));
    this.input = reader;
  }
}

PARSER_END(StandardTokenizer)
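
/*
 * Usage sketch (illustrative, assuming the Lucene 1.x/2.x analysis API;
 * not part of the original grammar): the generated class is driven like
 * any other Tokenizer.
 *
 *   Reader reader = new StringReader("AT&T shipped O'Reilly's xml-4.2");
 *   StandardTokenizer tokenizer = new StandardTokenizer(reader);
 *   for (org.apache.lucene.analysis.Token t = tokenizer.next();
 *        t != null; t = tokenizer.next()) {
 *     System.out.println(t.termText() + " / " + t.type());
 *   }
 *
 * With the rules below this should print "AT&T" as <COMPANY>,
 * "O'Reilly's" as <APOSTROPHE>, and "xml-4.2" as <NUM>.
 */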
TOKEN : {					  // token patterns

  // basic word: a sequence of digits & letters
  <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >

  // internal apostrophes: O'Reilly, you're, O'Reilly's
  // use a post-filter to remove possessives
| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >

  // acronyms: U.S.A., I.B.M., etc.
  // use a post-filter to remove dots
| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >

  // company names like AT&T and Excite@Home.
| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >

  // email addresses
| <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >

  // hostname
| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >

  // floating point, serial, model numbers, ip addresses, etc.
  // every other segment must have at least one digit
| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
       | <HAS_DIGIT> <P> <ALPHANUM>
       | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
       | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
       | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
       | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
        )
  >
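
  // Illustrative examples (not in the original file): "xml-4.2" and
  // "192.168.0.1" match <NUM>, while "a-b-c" does not, since every
  // other segment must contain a digit.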
| <#P: ("_"|"-"|"/"|"."|",") >
| <#HAS_DIGIT:					  // at least one digit
    (<LETTER>|<DIGIT>)*
    <DIGIT>
    (<LETTER>|<DIGIT>)*
  >

| < #ALPHA: (<LETTER>)+>
| < #LETTER:					  // unicode letters
      [
       "\u0041"-"\u005a",
       "\u0061"-"\u007a",
       "\u00c0"-"\u00d6",
       "\u00d8"-"\u00f6",
       "\u00f8"-"\u00ff",
       "\u0100"-"\u1fff"
      ]
  >
| < CJ:						  // Chinese, Japanese
      [
       "\u3040"-"\u318f",
       "\u3300"-"\u337f",
       "\u3400"-"\u3d2d",
       "\u4e00"-"\u9fff",
       "\uf900"-"\ufaff"
      ]
  >
| < KOREAN:					  // Korean
      [
       "\uac00"-"\ud7af"
      ]
  >
| < #DIGIT:					  // unicode digits
      [
       "\u0030"-"\u0039",
       "\u0660"-"\u0669",
       "\u06f0"-"\u06f9",
       "\u0966"-"\u096f",
       "\u09e6"-"\u09ef",
       "\u0a66"-"\u0a6f",
       "\u0ae6"-"\u0aef",
       "\u0b66"-"\u0b6f",
       "\u0be7"-"\u0bef",
       "\u0c66"-"\u0c6f",
       "\u0ce6"-"\u0cef",
       "\u0d66"-"\u0d6f",
       "\u0e50"-"\u0e59",
       "\u0ed0"-"\u0ed9",
       "\u1040"-"\u1049"
      ]
  >
}
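
/*
 * Post-filter sketch (assuming Lucene's StandardFilter, which is not
 * referenced in this file): the comments above defer possessive and
 * dot removal to a later filter stage, e.g.
 *
 *   TokenStream stream = new StandardFilter(new StandardTokenizer(reader));
 *
 * so that "O'Reilly's" is emitted as "O'Reilly" and "I.B.M." as "IBM".
 */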
SKIP : {					  // skip unrecognized chars
 <NOISE: ~[] >
}

/** Returns the next token in the stream, or null at EOS.
 * <p>The returned token's type is set to an element of {@link
 * StandardTokenizerConstants#tokenImage}.
 */
org.apache.lucene.analysis.Token next() throws IOException :
{
  Token token = null;
}
{
  ( token = <ALPHANUM> |
    token = <APOSTROPHE> |
    token = <ACRONYM> |
    token = <COMPANY> |
    token = <EMAIL> |
    token = <HOST> |
    token = <NUM> |
    token = <CJ> |
    token = <EOF>
   )
    {
      if (token.kind == EOF) {
        return null;
      } else {
        return
          new org.apache.lucene.analysis.Token(token.image,
                                        token.beginColumn, token.endColumn,
                                        tokenImage[token.kind]);
      }
    }
}