DoubleSplitter.cs
资源名称:2.rar [点击查看]
上传用户:hshongkong
上传日期:2021-11-20
资源大小:10241k
文件大小:9k
源码类别:

多国语言处理

开发平台:

C#

  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.Text;
  5. namespace ChineseSplitter
  6. {
  7.     /// <summary>
  8.     /// 双字切分
  9.     /// </summary>
  10.     public class DoubleSplitter : BaseSplitter
  11.     {
  12.         TwoSplit myTwoSplit;
  13.         /// <summary>
  14.         /// 构造函数
  15.         /// </summary>
  16.         public DoubleSplitter()
  17.         {
  18.             myTwoSplit = new TwoSplit(); 
  19.         }
  20.         public override string Split(string inputStr)
  21.         {
  22.             return myTwoSplit.ReviseString(inputStr);
  23.         }
  24.         /// <summary>
  25.         /// 释放内存
  26.         /// </summary>
  27.         public override void Dispose()
  28.         {
  29.         }
  30.     }
  31.     class TwoSplit
  32.     {
  33.         string SplitChar = "  "; //分隔符
  34.         string nums = "0123456789+-%.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  35.         string fnums = "0123456789+-%.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  36.         /// <summary>
  37.         /// 构造函数
  38.         /// </summary>
  39.         public TwoSplit()
  40.         {
  41.         }
  42.         
  43.         /// <summary>
  44.         /// 字符串是否全为中文
  45.         /// </summary>
  46.         /// <param name="inputString">字符串</param>
  47.         /// <return>返回值</return></returns>
  48.         public bool IsChinese(string inputString)
  49.         {
  50.             if (Encoding.UTF8.GetByteCount(inputString) == inputString.Length * 3)
  51.             {
  52.                 return true;
  53.             }
  54.             else
  55.             {
  56.                 return false;
  57.             }
  58.         }
  59.         /// <summary>
  60.         /// 切分字符串
  61.         /// </summary>
  62.         /// <param name="str">字符串</param>
  63.         /// <returns>返回值</returns>
  64.         public string ReviseString(string str)
  65.         {
  66.             string spc = SplitChar.ToString();
  67.             int slen = str.Length;
  68.             if (slen == 0) return "";
  69.             StringBuilder okstr = new StringBuilder();
  70.             int[] chartypes = GetWordTypes(str);
  71.             int lasttype = 0;
  72.             string lastchinese = "";
  73.             int chineseCount = 0;
  74.             for (int i = 0; i < chartypes.Length; i++)
  75.             {
  76.                 char alabchar = ' ';
  77.                 switch (chartypes[i])
  78.                 {
  79.                     case 1:
  80.                         if (chineseCount == 1)
  81.                         {
  82.                             okstr.Append(spc + lastchinese);
  83.                         }
  84.                         alabchar = str[i];
  85.                         switch (lasttype)
  86.                         {
  87.                             case 1:
  88.                                 okstr.Append(alabchar);
  89.                                 break;
  90.                             case 2:
  91.                                 okstr.Append(alabchar);
  92.                                 break;
  93.                             case 3:
  94.                                 okstr.Append(spc + alabchar);
  95.                                 break;
  96.                             default:
  97.                                 okstr.Append(spc + alabchar);
  98.                                 break;
  99.                         }
  100.                         lastchinese = "";
  101.                         chineseCount = 0;
  102.                         lasttype = 1;
  103.                         break;
  104.                     case 2:
  105.                         if (chineseCount == 1)
  106.                         {
  107.                             okstr.Append(spc + lastchinese);
  108.                         }
  109.                         alabchar = GetAlabNum(str[i]);
  110.                         switch (lasttype)
  111.                         {
  112.                             case 1:
  113.                                 okstr.Append(alabchar);
  114.                                 break;
  115.                             case 2:
  116.                                 okstr.Append(alabchar);
  117.                                 break;
  118.                             case 3:
  119.                                 okstr.Append(spc + alabchar);
  120.                                 break;
  121.                             default:
  122.                                 okstr.Append(spc + alabchar);
  123.                                 break;
  124.                         }
  125.                         lastchinese = "";
  126.                         chineseCount = 0;
  127.                         lasttype = 2;
  128.                         break;
  129.                     case 3:
  130.                         alabchar = str[i];
  131.                         switch (lasttype)
  132.                         {
  133.                             case 3:
  134.                                 okstr.Append(spc + lastchinese + alabchar);
  135.                                 break;
  136.                             default:
  137.                                 break;
  138.                         }
  139.                         chineseCount++;
  140.                         lastchinese = alabchar.ToString();
  141.                         lasttype = 3;
  142.                         break;
  143.                     case 4:
  144.                         if (chineseCount == 1)
  145.                         {
  146.                             okstr.Append(spc + lastchinese);
  147.                         }
  148.                         lastchinese = "";
  149.                         chineseCount = 0;
  150.                         lasttype = 4;
  151.                         break;
  152.                     default:
  153.                         if (chineseCount == 1)
  154.                         {
  155.                             okstr.Append(spc + lastchinese);
  156.                         }
  157.                         alabchar = str[i];
  158.                         okstr.Append(spc + alabchar);
  159.                         chineseCount = 0;
  160.                         lastchinese = "";
  161.                         lasttype = 5;
  162.                         break;
  163.                 }
  164.             }
  165.             if (chineseCount == 1)
  166.             {
  167.                 okstr.Append(spc + lastchinese);
  168.             }
  169.             return okstr.ToString();
  170.         }
  171.         
  172.         /// <summary>
  173.         /// 把全角数字或英文单词转为半角
  174.         /// </summary>
  175.         /// <param name="fnum">字符</param>
  176.         /// <returns>返加</returns>
  177.         private char GetAlabNum(char fnum)
  178.         {
  179.             int place = nums.IndexOf(fnum);
  180.             if (place != -1)
  181.             {
  182.                 return fnums[place];
  183.             }
  184.             return fnum;
  185.         }
  186.         /// <summary>
  187.         /// 分析字符类型
  188.         /// </summary>
  189.         /// <param name="words">字符串</param>
  190.         /// <returns>返回类型数组</returns>
  191.         private int[] GetWordTypes(string words)
  192.         {
  193.             byte[] unicode = Encoding.Unicode.GetBytes(words);
  194.             int[] revalue = new int[unicode.Length / 2];
  195.             for (int i = 0; i < revalue.Length; i++)
  196.             {
  197.                 int nowvalue = BitConverter.ToUInt16(unicode, i * 2);
  198.                 if (nowvalue < 0x4E00)
  199.                 {
  200.                     if (nowvalue <= 0x0039 && nowvalue >= 0x0030)
  201.                     {
  202.                         revalue[i] = 1;
  203.                     }
  204.                     else if ((nowvalue <= 0x005A && nowvalue >= 0x0041) || (nowvalue <= 0x007A && nowvalue >= 0x0061))
  205.                     {
  206.                         revalue[i] = 1;
  207.                     }
  208.                     else if ((nowvalue <= 0x301B && nowvalue >= 0x3000) || nowvalue <= 0x00BB || (nowvalue <= 0x2FFF && nowvalue >= 0x2000))
  209.                     {
  210.                         revalue[i] = 4;
  211.                     }
  212.                     else
  213.                     {
  214.                         revalue[i] = 5;
  215.                     }
  216.                 }
  217.                 else if (nowvalue <= 0xEFFF && nowvalue >= 0x4E00)
  218.                 {
  219.                     revalue[i] = 3;
  220.                 }
  221.                 else
  222.                 {
  223.                     if (nowvalue <= 0xFF19 && nowvalue >= 0xFF10)
  224.                     {
  225.                         revalue[i] = 2;
  226.                     }
  227.                     else if ((nowvalue <= 0xFF3A && nowvalue >= 0xFF21) || (nowvalue <= 0xFF5A && nowvalue >= 0xFF41))
  228.                     {
  229.                         revalue[i] = 2;
  230.                     }
  231.                     else if (nowvalue <= 0xFF65 && nowvalue >= 0xFF00)
  232.                     {
  233.                         revalue[i] = 4;
  234.                     }
  235.                     else
  236.                     {
  237.                         revalue[i] = 5;
  238.                     }
  239.                 }
  240.             }
  241.             return revalue;
  242.         }
  243.     }
  244. }