搜索引擎

开发平台：
C#

StandardTokenizer.jj：源码内容
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
options {
  // --- active settings ---------------------------------------------------
  // The generated tokenizer is instantiated per Reader (see the constructor
  // in the parser class), so its members must not be static.
  STATIC                 = false;
  // Input is read as full Unicode characters; the token patterns below use
  // character ranges well outside ASCII.
  UNICODE_INPUT          = true;
  // The char stream implementation is supplied by this package rather than
  // generated by JavaCC.
  USER_CHAR_STREAM       = true;
  OPTIMIZE_TOKEN_MANAGER = true;

  // --- disabled settings, kept for reference -----------------------------
//IGNORE_CASE = true;
//BUILD_PARSER = false;
//DEBUG_TOKEN_MANAGER = true;
}
PARSER_BEGIN(StandardTokenizer)
package org.apache.lucene.analysis.standard;
import java.io.*;
/** A grammar-based tokenizer constructed with JavaCC.
 *
 * <p> This should be a good tokenizer for most European-language documents:
 *
 * <ul>
 *   <li>Splits words at punctuation characters, removing punctuation. However, a
 *     dot that's not followed by whitespace is considered part of a token.
 *   <li>Splits words at hyphens, unless there's a number in the token, in which case
 *     the whole token is interpreted as a product number and is not split.
 *   <li>Recognizes email addresses and internet hostnames as one token.
 * </ul>
 *
 * <p>Many applications have specific tokenizer needs.  If this tokenizer does
 * not suit your application, please consider copying this source code
 * directory to your project and maintaining your own grammar-based tokenizer.
 *
 * <p>Only the hand-written part of the class appears between PARSER_BEGIN and
 * PARSER_END; the token-matching code is generated from the TOKEN/SKIP
 * sections and the <code>next()</code> production of this grammar file.
 */
public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
  /** Constructs a tokenizer for this Reader.
   *
   * @param reader the character source to tokenize; it is wrapped in a
   *        FastCharStream and also stored in the <code>input</code> field
   */
  public StandardTokenizer(Reader reader) {
    // Delegates to a constructor taking the char stream; that constructor is
    // not declared in this file (presumably generated by JavaCC -- confirm).
    this(new FastCharStream(reader));
    // NOTE(review): 'input' is not declared here; presumably inherited from
    // Tokenizer -- verify against the superclass.
    this.input = reader;
  }
}
PARSER_END(StandardTokenizer)
TOKEN : {                                         // token patterns
  // Patterns marked with '#' are private helpers: they can be referenced from
  // other patterns in this section but are never returned as tokens themselves.
  // All character ranges use Java Unicode escapes (backslash, 'u', four hex
  // digits); each bound of a range must denote exactly one character.

  // basic word: a sequence of digits & letters (Korean syllables included)
  <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >

  // internal apostrophes: O'Reilly, you're, O'Reilly's
  // use a post-filter to remove possessives
| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >

  // acronyms: U.S.A., I.B.M., etc.
  // use a post-filter to remove dots
| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >

  // company names like AT&T and Excite@Home.
| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >

  // email addresses
| <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >

  // hostname
| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >

  // floating point, serial, model numbers, ip addresses, etc.
  // every other segment must have at least one digit
| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
       | <HAS_DIGIT> <P> <ALPHANUM>
       | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
       | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
       | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
       | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
        )
  >

  // segment separators allowed inside a NUM token
| <#P: ("_"|"-"|"/"|"."|",") >

| <#HAS_DIGIT:                                    // at least one digit
    (<LETTER>|<DIGIT>)*
    <DIGIT>
    (<LETTER>|<DIGIT>)*
  >

  // one or more letters, no digits
| < #ALPHA: (<LETTER>)+>

| < #LETTER:                                      // unicode letters
      [
       "\u0041"-"\u005a",
       "\u0061"-"\u007a",
       "\u00c0"-"\u00d6",
       "\u00d8"-"\u00f6",
       "\u00f8"-"\u00ff",
       "\u0100"-"\u1fff"
      ]
  >

  // A single character per token (no repetition operator), unlike KOREAN,
  // which is folded into ALPHANUM above.
| < CJ:                                           // Chinese, Japanese
      [
       "\u3040"-"\u318f",
       "\u3300"-"\u337f",
       "\u3400"-"\u3d2d",
       "\u4e00"-"\u9fff",
       "\uf900"-"\ufaff"
      ]
  >

| < KOREAN:                                       // Korean
      [
       "\uac00"-"\ud7af"
      ]
  >

| < #DIGIT:                                       // unicode digits
      [
       "\u0030"-"\u0039",
       "\u0660"-"\u0669",
       "\u06f0"-"\u06f9",
       "\u0966"-"\u096f",
       "\u09e6"-"\u09ef",
       "\u0a66"-"\u0a6f",
       "\u0ae6"-"\u0aef",
       "\u0b66"-"\u0b6f",
       "\u0be7"-"\u0bef",
       "\u0c66"-"\u0c6f",
       "\u0ce6"-"\u0cef",
       "\u0d66"-"\u0d6f",
       "\u0e50"-"\u0e59",
       "\u0ed0"-"\u0ed9",
       "\u1040"-"\u1049"
      ]
  >
}
SKIP : {
  // Catch-all: "~[]" is the complement of the empty character list, i.e. any
  // single character. Whatever is matched here is skipped (unrecognized chars).
  < NOISE : ~[] >
}
/** Returns the next token in the stream, or null at EOS.
 * <p>The returned token's type is set to an element of {@link
 * StandardTokenizerConstants#tokenImage}.
 *
 * <p>The matched text, its beginColumn and its endColumn are handed to the
 * Lucene token as text, start and end.
 */
org.apache.lucene.analysis.Token next() throws IOException :
{
  Token matched = null;
}
{
  ( matched = <ALPHANUM>
  | matched = <APOSTROPHE>
  | matched = <ACRONYM>
  | matched = <COMPANY>
  | matched = <EMAIL>
  | matched = <HOST>
  | matched = <NUM>
  | matched = <CJ>
  | matched = <EOF>
  )
  {
    // End of input: signal exhaustion to the caller with null.
    if (matched.kind == EOF) {
      return null;
    }

    // Look up the type label for this token kind in the tokenImage table.
    String typeName = tokenImage[matched.kind];
    return new org.apache.lucene.analysis.Token(matched.image,
                                                matched.beginColumn,
                                                matched.endColumn,
                                                typeName);
  }
}