DoubleSplitter.cs
资源名称:2.rar [点击查看]
上传用户:hshongkong
上传日期:2021-11-20
资源大小:10241k
文件大小:9k
源码类别:
多国语言处理
开发平台:
C#
- using System;
- using System.Collections;
- using System.Collections.Generic;
- using System.Text;
- namespace ChineseSplitter
- {
/// <summary>
/// Two-character ("double") splitter: forwards all work to a private
/// <see cref="TwoSplit"/> helper, which emits overlapping two-character
/// tokens for runs of Chinese characters.
/// </summary>
public class DoubleSplitter : BaseSplitter
{
    // Helper object that performs the actual segmentation.
    private TwoSplit bigramSplitter;

    /// <summary>
    /// Creates the splitter together with its segmentation helper.
    /// </summary>
    public DoubleSplitter()
    {
        this.bigramSplitter = new TwoSplit();
    }

    /// <summary>
    /// Segments <paramref name="inputStr"/> and returns the tokens as one string.
    /// </summary>
    /// <param name="inputStr">Text to segment.</param>
    /// <returns>The value produced by <c>TwoSplit.ReviseString</c> for the input.</returns>
    public override string Split(string inputStr)
    {
        string segmented = this.bigramSplitter.ReviseString(inputStr);
        return segmented;
    }

    /// <summary>
    /// Release hook required by the base class; this splitter does nothing here.
    /// </summary>
    public override void Dispose()
    {
    }
}
/// <summary>
/// Character-class based tokenizer. Runs of ASCII / full-width letters and
/// digits are kept together as one token, runs of Chinese characters are
/// emitted as overlapping two-character tokens, punctuation and whitespace
/// are dropped, and every other character becomes a token of its own.
/// Each emitted token is preceded by a single space.
/// </summary>
class TwoSplit
{
    // Separator written in front of every emitted token.
    private const string SplitChar = " ";

    // Character classes produced by GetWordTypes.
    private const int TypeNone = 0;            // nothing seen yet
    private const int TypeHalfWidthAlnum = 1;  // 0-9, A-Z, a-z
    private const int TypeFullWidthAlnum = 2;  // U+FF10-FF19, U+FF21-FF3A, U+FF41-FF5A
    private const int TypeChinese = 3;         // U+4E00-U+EFFF
    private const int TypeSkipped = 4;         // punctuation / whitespace, never emitted
    private const int TypeOther = 5;           // everything else, emitted one by one

    // Distance between a full-width ASCII variant (U+FF01-U+FF5E) and its
    // half-width counterpart (U+0021-U+007E).
    private const int FullWidthOffset = 0xFEE0;

    /// <summary>
    /// Creates a tokenizer; it holds no state.
    /// </summary>
    public TwoSplit()
    {
    }

    /// <summary>
    /// Heuristic test for "the whole string is Chinese": returns true when
    /// every character encodes to exactly three UTF-8 bytes.
    /// </summary>
    /// <remarks>
    /// This is an approximation: any character at U+0800 or above that is not
    /// part of a surrogate pair also takes three bytes, and the empty string
    /// yields true.
    /// </remarks>
    /// <param name="inputString">String to inspect.</param>
    /// <returns>True when the UTF-8 byte count equals three times the length.</returns>
    public bool IsChinese(string inputString)
    {
        return Encoding.UTF8.GetByteCount(inputString) == inputString.Length * 3;
    }

    /// <summary>
    /// Tokenizes <paramref name="str"/>.
    /// </summary>
    /// <param name="str">Text to tokenize; null is treated like the empty string.</param>
    /// <returns>
    /// The tokens, each preceded by one space; the empty string for null or
    /// empty input. Full-width letters and digits are returned in half-width form.
    /// </returns>
    public string ReviseString(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return "";
        }

        StringBuilder output = new StringBuilder();
        int[] types = GetWordTypes(str);
        int lastType = TypeNone;
        char pendingChinese = '\0'; // previous character of the current Chinese run
        int chineseRun = 0;         // length of the current Chinese run

        for (int i = 0; i < types.Length; i++)
        {
            int type = types[i];

            if (type == TypeChinese)
            {
                char current = str[i];
                if (lastType == TypeChinese)
                {
                    // Second or later character of a run: emit the overlapping pair.
                    output.Append(SplitChar).Append(pendingChinese).Append(current);
                }
                chineseRun++;
                pendingChinese = current;
                lastType = TypeChinese;
                continue;
            }

            // A non-Chinese character ends the run. A run of exactly one
            // character never formed a pair, so it is emitted on its own here.
            if (chineseRun == 1)
            {
                output.Append(SplitChar).Append(pendingChinese);
            }
            chineseRun = 0;

            switch (type)
            {
                case TypeHalfWidthAlnum:
                case TypeFullWidthAlnum:
                {
                    char ch = type == TypeFullWidthAlnum ? GetAlabNum(str[i]) : str[i];
                    bool continuesWord = lastType == TypeHalfWidthAlnum || lastType == TypeFullWidthAlnum;
                    if (!continuesWord)
                    {
                        output.Append(SplitChar);
                    }
                    output.Append(ch);
                    break;
                }
                case TypeSkipped:
                    // Punctuation and whitespace only act as token boundaries.
                    break;
                default:
                    output.Append(SplitChar).Append(str[i]);
                    break;
            }
            lastType = type;
        }

        // Input ended on a one-character Chinese run that was never paired.
        if (chineseRun == 1)
        {
            output.Append(SplitChar).Append(pendingChinese);
        }
        return output.ToString();
    }

    /// <summary>
    /// Converts a full-width ASCII variant (U+FF01-U+FF5E, which includes the
    /// full-width digits and Latin letters) to its half-width form.
    /// </summary>
    /// <param name="fnum">Character to convert.</param>
    /// <returns>The half-width character, or <paramref name="fnum"/> unchanged when it is outside that range.</returns>
    private static char GetAlabNum(char fnum)
    {
        // The previous implementation looked the character up in two tables
        // that were both half-width, so nothing was ever converted.
        if (fnum >= '\uFF01' && fnum <= '\uFF5E')
        {
            return (char)(fnum - FullWidthOffset);
        }
        return fnum;
    }

    /// <summary>
    /// Classifies every UTF-16 code unit of <paramref name="words"/>.
    /// </summary>
    /// <param name="words">Text to classify.</param>
    /// <returns>One class value (1-5) per code unit, in input order.</returns>
    private static int[] GetWordTypes(string words)
    {
        byte[] utf16 = Encoding.Unicode.GetBytes(words);
        int[] types = new int[utf16.Length / 2];
        for (int i = 0; i < types.Length; i++)
        {
            // Encoding.Unicode is always little-endian, so decode explicitly
            // rather than with BitConverter, which follows the host byte order.
            int code = utf16[2 * i] | (utf16[2 * i + 1] << 8);
            types[i] = Classify(code);
        }
        return types;
    }

    // Maps one UTF-16 code unit to its character class.
    private static int Classify(int code)
    {
        if (code < 0x4E00)
        {
            bool isDigit = code >= 0x0030 && code <= 0x0039;
            bool isLetter = (code >= 0x0041 && code <= 0x005A) || (code >= 0x0061 && code <= 0x007A);
            if (isDigit || isLetter)
            {
                return TypeHalfWidthAlnum;
            }

            bool isSkipped = (code >= 0x3000 && code <= 0x301B)
                || code <= 0x00BB
                || (code >= 0x2000 && code <= 0x2FFF);
            return isSkipped ? TypeSkipped : TypeOther;
        }

        if (code <= 0xEFFF)
        {
            return TypeChinese;
        }

        bool isFullWidthDigit = code >= 0xFF10 && code <= 0xFF19;
        bool isFullWidthLetter = (code >= 0xFF21 && code <= 0xFF3A) || (code >= 0xFF41 && code <= 0xFF5A);
        if (isFullWidthDigit || isFullWidthLetter)
        {
            return TypeFullWidthAlnum;
        }
        if (code >= 0xFF00 && code <= 0xFF65)
        {
            return TypeSkipped;
        }
        return TypeOther;
    }
}
- }