CJKTokenizer.java
上传用户:cctqzzy
上传日期:2022-03-14
资源大小:12198k
文件大小:8k
源码类别:

搜索引擎

开发平台:

Java

  1. //package org.apache.lucene.analysis.cjk;
  2. package chapter8;
  3. /**
  4.  * Licensed to the Apache Software Foundation (ASF) under one or more
  5.  * contributor license agreements.  See the NOTICE file distributed with
  6.  * this work for additional information regarding copyright ownership.
  7.  * The ASF licenses this file to You under the Apache License, Version 2.0
  8.  * (the "License"); you may not use this file except in compliance with
  9.  * the License.  You may obtain a copy of the License at
  10.  *
  11.  *     http://www.apache.org/licenses/LICENSE-2.0
  12.  *
  13.  * Unless required by applicable law or agreed to in writing, software
  14.  * distributed under the License is distributed on an "AS IS" BASIS,
  15.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16.  * See the License for the specific language governing permissions and
  17.  * limitations under the License.
  18.  */
  19. import org.apache.lucene.analysis.Token;
  20. import org.apache.lucene.analysis.Tokenizer;
  21. import java.io.Reader;
  22. /**
  23.  * CJKTokenizer was modified from StopTokenizer which does a decent job for
  24.  * most European languages. It performs other token methods for double-byte
  25.  * Characters: the token will return at each two charactors with overlap match.<br>
  26.  * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
  27.  * also need filter filter zero length token ""<br>
  28.  * for Digit: digit, '+', '#' will token as letter<br>
  29.  * for more info on Asia language(Chinese Japanese Korean) text segmentation:
  30.  * please search  <a
  31.  * href="http://www.google.com/search?q=word+chinese+segment">google</a>
  32.  *
  33.  * @author Che, Dong
  34.  */
  35. public final class CJKTokenizer extends Tokenizer {
  36.     //~ Static fields/initializers ---------------------------------------------
  37.     /** Max word length */
  38.     private static final int MAX_WORD_LEN = 255;
  39.     /** buffer size: */
  40.     private static final int IO_BUFFER_SIZE = 256;
  41.     //~ Instance fields --------------------------------------------------------
  42.     /** word offset, used to imply which character(in ) is parsed */
  43.     private int offset = 0;
  44.     /** the index used only for ioBuffer */
  45.     private int bufferIndex = 0;
  46.     /** data length */
  47.     private int dataLen = 0;
  48.     /**
  49.      * character buffer, store the characters which are used to compose <br>
  50.      * the returned Token
  51.      */
  52.     private final char[] buffer = new char[MAX_WORD_LEN];
  53.     /**
  54.      * I/O buffer, used to store the content of the input(one of the <br>
  55.      * members of Tokenizer)
  56.      */
  57.     private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
  58.     /** word type: single=>ASCII  double=>non-ASCII word=>default */
  59.     private String tokenType = "word";
  60.     /**
  61.      * tag: previous character is a cached double-byte character  "C1C2C3C4"
  62.      * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
  63.      * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
  64.      */
  65.     private boolean preIsTokened = false;
  66.     //~ Constructors -----------------------------------------------------------
  67.     /**
  68.      * Construct a token stream processing the given input.
  69.      *
  70.      * @param in I/O reader
  71.      */
  72.     public CJKTokenizer(Reader in) {
  73.         input = in;
  74.     }
  75.     //~ Methods ----------------------------------------------------------------
  76.     /**
  77.      * Returns the next token in the stream, or null at EOS.
  78.      * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
  79.      * for detail.
  80.      *
  81.      * @return Token
  82.      *
  83.      * @throws java.io.IOException - throw IOException when read error <br>
  84.      *         hanppened in the InputStream
  85.      *
  86.      */
  87.     public final Token next() throws java.io.IOException {
  88.         /** how many character(s) has been stored in buffer */
  89.         int length = 0;
  90.         /** the position used to create Token */
  91.         int start = offset;
  92.         while (true) {
  93.             /** current charactor */
  94.             char c;
  95.             /** unicode block of current charactor for detail */
  96.             Character.UnicodeBlock ub;
  97.             offset++;
  98.             if (bufferIndex >= dataLen) {
  99.                 dataLen = input.read(ioBuffer);
  100.                 bufferIndex = 0;
  101.             }
  102.             if (dataLen == -1) {
  103.                 if (length > 0) {
  104.                     if (preIsTokened == true) {
  105.                         length = 0;
  106.                         preIsTokened = false;
  107.                     }
  108.                     break;
  109.                 } else {
  110.                     return null;
  111.                 }
  112.             } else {
  113.                 //get current character
  114.                 c = ioBuffer[bufferIndex++];
  115.                 //get the UnicodeBlock of the current character
  116.                 ub = Character.UnicodeBlock.of(c);
  117.             }
  118.             //if the current character is ASCII or Extend ASCII
  119.             if ((ub == Character.UnicodeBlock.BASIC_LATIN)
  120.                     || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
  121.                ) {
  122.                 if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
  123.                     /** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
  124.                     int i = (int) c;
  125.                     i = i - 65248;
  126.                     c = (char) i;
  127.                 }
  128.                 // if the current character is a letter or "_" "+" "#"
  129.                 if (Character.isLetterOrDigit(c)
  130.                         || ((c == '_') || (c == '+') || (c == '#'))
  131.                    ) {
  132.                     if (length == 0) {
  133.                         // "javaC1C2C3C4linux" <br>
  134.                         //      ^--: the current character begin to token the ASCII
  135.                         // letter
  136.                         start = offset - 1;
  137.                     } else if (tokenType == "double") {
  138.                         // "javaC1C2C3C4linux" <br>
  139.                         //              ^--: the previous non-ASCII
  140.                         // : the current character
  141.                         offset--;
  142.                         bufferIndex--;
  143.                         tokenType = "single";
  144.                         if (preIsTokened == true) {
  145.                             // there is only one non-ASCII has been stored
  146.                             length = 0;
  147.                             preIsTokened = false;
  148.                             break;
  149.                         } else {
  150.                             break;
  151.                         }
  152.                     }
  153.                     // store the LowerCase(c) in the buffer
  154.                     buffer[length++] = Character.toLowerCase(c);
  155.                     tokenType = "single";
  156.                     // break the procedure if buffer overflowed!
  157.                     if (length == MAX_WORD_LEN) {
  158.                         break;
  159.                     }
  160.                 } else if (length > 0) {
  161.                     if (preIsTokened == true) {
  162.                         length = 0;
  163.                         preIsTokened = false;
  164.                     } else {
  165.                         break;
  166.                     }
  167.                 }
  168.             } else {
  169.                 // non-ASCII letter, eg."C1C2C3C4"
  170.                 if (Character.isLetter(c)) {
  171.                     if (length == 0) {
  172.                         start = offset - 1;
  173.                         buffer[length++] = c;
  174.                         tokenType = "double";
  175.                     } else {
  176.                         if (tokenType == "single") {
  177.                             offset--;
  178.                             bufferIndex--;
  179.                             //return the previous ASCII characters
  180.                             break;
  181.                         } else {
  182.                             buffer[length++] = c;
  183.                             tokenType = "double";
  184.                             if (length == 2) {
  185.                                 offset--;
  186.                                 bufferIndex--;
  187.                                 preIsTokened = true;
  188.                                 break;
  189.                             }
  190.                         }
  191.                     }
  192.                 } else if (length > 0) {
  193.                     if (preIsTokened == true) {
  194.                         // empty the buffer
  195.                         length = 0;
  196.                         preIsTokened = false;
  197.                     } else {
  198.                         break;
  199.                     }
  200.                 }
  201.             }
  202.         }
  203.         return new Token(new String(buffer, 0, length), start, start + length,
  204.                          tokenType
  205.                         );
  206.     }
  207. }