SingleSplitter.cs
资源名称:2.rar [点击查看]
上传用户:hshongkong
上传日期:2021-11-20
资源大小:10241k
文件大小:7k
源码类别:

多国语言处理

开发平台:

C#

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Text;
  4. //该源码首发自win.51aspx.com(51aspx.com)
  5. namespace ChineseSplitter
  6. {
  7.     /// <summary>
  8.     /// 单字切分
  9.     /// </summary>
  10.     public class SingleSplitter : BaseSplitter
  11.     {
  12.         OneSplit myOneSplit;
  13.         /// <summary>
  14.         /// 构造函数
  15.         /// </summary>
  16.         public SingleSplitter()
  17.         {
  18.             myOneSplit = new OneSplit();
  19.         }
  20.         public override string Split(string inputStr)
  21.         {
  22.             return myOneSplit.ReviseString(inputStr);
  23.         }
  24.         
  25.         /// <summary>
  26.         /// 释放内存
  27.         /// </summary>
  28.         public override void Dispose()
  29.         {
  30.         }
  31.     }
  32.     class OneSplit
  33.     {
  34.         string SplitChar = "  "; //分隔符
  35.         string nums = "0123456789+-%.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  36.         string fnums = "0123456789+-%.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  37.         /// <summary>
  38.         /// 构造函数
  39.         /// </summary>
  40.         public OneSplit()
  41.         {
  42.         }
  43.         /// <summary>
  44.         /// 字符串是否全为中文
  45.         /// </summary>
  46.         /// <param name="inputString">字符串</param>
  47.         /// <return>返回值</return></returns>
  48.         public bool IsChinese(string inputString)
  49.         {
  50.             if (Encoding.UTF8.GetByteCount(inputString) == inputString.Length * 3)
  51.             {
  52.                 return true;
  53.             }
  54.             else
  55.             {
  56.                 return false;
  57.             }
  58.         }
  59.         /// <summary>
  60.         /// 切分字符串
  61.         /// </summary>
  62.         /// <param name="str">字符串</param>
  63.         /// <returns>返回值</returns>
  64.         public string ReviseString(string str)
  65.         {
  66.             string spc = SplitChar.ToString();
  67.             int slen = str.Length;
  68.             if (slen == 0) return "";
  69.             StringBuilder okstr = new StringBuilder();
  70.             int[] chartypes = GetWordTypes(str);
  71.             int lasttype = 0;
  72.             char alabchar = ' ';
  73.             for (int i = 0; i < chartypes.Length; i++)
  74.             {
  75.                 switch (chartypes[i])
  76.                 {
  77.                     case 1:
  78.                         alabchar = str[i];
  79.                         switch (lasttype)
  80.                         {
  81.                             case 1:
  82.                                 okstr.Append(alabchar);
  83.                                 break;
  84.                             case 2:
  85.                                 okstr.Append(alabchar);
  86.                                 break;
  87.                             case 3:
  88.                                 okstr.Append(spc + alabchar);
  89.                                 break;
  90.                             default:
  91.                                 okstr.Append(spc + alabchar);
  92.                                 break;
  93.                         }
  94.                         lasttype = 1;
  95.                         break;
  96.                     case 2:
  97.                         alabchar = GetAlabNum(str[i]);
  98.                         switch (lasttype)
  99.                         {
  100.                             case 1:
  101.                                 okstr.Append(alabchar);
  102.                                 break;
  103.                             case 2:
  104.                                 okstr.Append(alabchar);
  105.                                 break;
  106.                             case 3:
  107.                                 okstr.Append(spc + alabchar);
  108.                                 break;
  109.                             default:
  110.                                 okstr.Append(spc + alabchar);
  111.                                 break;
  112.                         }
  113.                         lasttype = 2;
  114.                         break;
  115.                     case 3:
  116.                         alabchar = str[i];
  117.                         
  118.                                 okstr.Append(spc + alabchar);
  119.                         lasttype = 3;
  120.                         break;
  121.                     case 4:
  122.                         lasttype = 4;
  123.                         break;
  124.                     default:
  125.                         alabchar = str[i];
  126.                         okstr.Append(spc + alabchar);
  127.                         lasttype = 5;
  128.                         break;
  129.                 }
  130.             }
  131.             return okstr.ToString();
  132.         }
  133.         /// <summary>
  134.         /// 把全角数字或英文单词转为半角
  135.         /// </summary>
  136.         /// <param name="fnum">字符</param>
  137.         /// <returns>返加</returns>
  138.         private char GetAlabNum(char fnum)
  139.         {
  140.             int place = nums.IndexOf(fnum);
  141.             if (place != -1)
  142.             {
  143.                 return fnums[place];
  144.             }
  145.             return fnum;
  146.         }
  147.         /// <summary>
  148.         /// 分析字符类型
  149.         /// </summary>
  150.         /// <param name="words">字符串</param>
  151.         /// <returns>返回类型数组</returns>
  152.         private int[] GetWordTypes(string words)
  153.         {
  154.             byte[] unicode = Encoding.Unicode.GetBytes(words);
  155.             int[] revalue = new int[unicode.Length / 2];
  156.             for (int i = 0; i < revalue.Length; i++)
  157.             {
  158.                 int nowvalue = BitConverter.ToUInt16(unicode, i * 2);
  159.                 if (nowvalue < 0x4E00)
  160.                 {
  161.                     if (nowvalue <= 0x0039 && nowvalue >= 0x0030)
  162.                     {
  163.                         revalue[i] = 1;
  164.                     }
  165.                     else if ((nowvalue <= 0x005A && nowvalue >= 0x0041) || (nowvalue <= 0x007A && nowvalue >= 0x0061))
  166.                     {
  167.                         revalue[i] = 1;
  168.                     }
  169.                     else if ((nowvalue <= 0x301B && nowvalue >= 0x3000) || nowvalue <= 0x00BB || (nowvalue <= 0x2FFF && nowvalue >= 0x2000))
  170.                     {
  171.                         revalue[i] = 4;
  172.                     }
  173.                     else
  174.                     {
  175.                         revalue[i] = 5;
  176.                     }
  177.                 }
  178.                 else if (nowvalue <= 0xEFFF && nowvalue >= 0x4E00)
  179.                 {
  180.                     revalue[i] = 3;
  181.                 }
  182.                 else
  183.                 {
  184.                     if (nowvalue <= 0xFF19 && nowvalue >= 0xFF10)
  185.                     {
  186.                         revalue[i] = 2;
  187.                     }
  188.                     else if ((nowvalue <= 0xFF3A && nowvalue >= 0xFF21) || (nowvalue <= 0xFF5A && nowvalue >= 0xFF41))
  189.                     {
  190.                         revalue[i] = 2;
  191.                     }
  192.                     else if (nowvalue <= 0xFF65 && nowvalue >= 0xFF00)
  193.                     {
  194.                         revalue[i] = 4;
  195.                     }
  196.                     else
  197.                     {
  198.                         revalue[i] = 5;
  199.                     }
  200.                 }
  201.             }
  202.             return revalue;
  203.         }
  204.     }
  205. }