KeywordsSort.cs
资源名称:2.rar [点击查看]
上传用户:hshongkong
上传日期:2021-11-20
资源大小:10241k
文件大小:11k
源码类别:
多国语言处理
开发平台:
C#
- using System;
- using System.Collections.Generic;
- using System.Text;
- using System.Text.RegularExpressions;
- using CommonLibrary;
- namespace ChineseSplitter
- {
/// <summary>
/// Collects part-of-speech tagged keywords, accumulates per-keyword statistics
/// and reports the highest ranked keywords.
/// </summary>
/// <remarks>
/// <c>Append</c> inserts new entries at the position reported by
/// <c>Finder.FindInsertPlace</c> using the hash comparer, and <c>GetTopKeywords</c>
/// restores that order after ranking, so <c>myKeyList</c> stays ordered by
/// <c>KeywordsInfoHashCompare</c> between calls.
/// </remarks>
public class KeywordsSort
{
    // Shared tag-to-score table; built once for all instances.
    static readonly WordsType myWordsType = new WordsType();

    // Separator between tokens in the text handed to Append.
    const string SPLIT_STRING = " ";

    // Separator between the word and its part-of-speech tag inside one token.
    const char SPLIT_CHAR = '/';

    /// <summary>
    /// Statistics for every keyword seen so far.
    /// </summary>
    public List<KeywordsInfo> myKeyList;

    /// <summary>
    /// Creates an empty keyword collection.
    /// </summary>
    public KeywordsSort()
    {
        this.myKeyList = new List<KeywordsInfo>();
    }

    /// <summary>
    /// Adds the keywords contained in a tagged string to the statistics.
    /// </summary>
    /// <param name="inputStr">
    /// Tokens separated by a single space, each of the form <c>word/type</c>.
    /// A token without a slash is treated as having an empty type.
    /// A null or empty string adds nothing.
    /// </param>
    public void Append(string inputStr)
    {
        if (string.IsNullOrEmpty(inputStr))
        {
            return;
        }

        string[] keywords = inputStr.Split(new string[] { SPLIT_STRING }, StringSplitOptions.None);
        KeywordsInfo myTempInfo = new KeywordsInfo("");

        // Running character offset: the summed length of all keywords processed so far.
        int indexCount = 0;
        for (int i = 0; i < keywords.Length; i++)
        {
            string[] keys = keywords[i].Split(SPLIT_CHAR);

            // Strip embedded line breaks so they do not become part of the keyword.
            string keyword = keys[0].Replace("\r\n", "");
            string keywordType = keys.Length >= 2 ? keys[1] : "";
            float mark = myWordsType.GetMark(keywordType);

            // Skip empty keywords and types whose score is zero.
            if (keyword.Length != 0 && mark > 0)
            {
                myTempInfo.Keywords = keyword;
                int place = Finder.FindPlace<KeywordsInfo>(this.myKeyList, myTempInfo, KeywordsInfoHashCompare);
                if (place == -1)
                {
                    place = Finder.FindInsertPlace<KeywordsInfo>(this.myKeyList, myTempInfo, KeywordsInfoHashCompare);
                    this.myKeyList.Insert(place, new KeywordsInfo(keyword));
                }

                KeywordsInfo myKeyInfo = this.myKeyList[place];

                // Count is a short; saturate instead of wrapping to a negative value.
                if (myKeyInfo.Count < short.MaxValue)
                {
                    myKeyInfo.Count += 1;
                }

                myKeyInfo.WordsMark += mark;
                myKeyInfo.PositionList.Add(indexCount);

                // KeywordsInfo is a struct, so the modified copy has to be stored back.
                this.myKeyList[place] = myKeyInfo;
            }

            indexCount += keyword.Length;
        }
    }

    /// <summary>
    /// Returns the ten top ranked keywords.
    /// </summary>
    /// <returns>The keywords, each followed by a single space.</returns>
    public string GetTopKeywords()
    {
        return GetTopKeywords(10);
    }

    /// <summary>
    /// Returns the top ranked keywords.
    /// </summary>
    /// <param name="topCount">
    /// Maximum number of keywords to return; capped at the number of known keywords.
    /// </param>
    /// <returns>The keywords, each followed by a single space.</returns>
    public string GetTopKeywords(int topCount)
    {
        // NOTE(review): the first topCount entries after this sort are reported, so
        // Sorter.DimidiateSort is presumably expected to put the highest AllMark first -
        // confirm against CommonLibrary.
        Sorter.DimidiateSort<KeywordsInfo>(this.myKeyList, KeywordsInfoAllMarkCompare);
        if (topCount > this.myKeyList.Count)
        {
            topCount = this.myKeyList.Count;
        }

        StringBuilder myStr = new StringBuilder();
        for (int i = 0; i < topCount; i++)
        {
            myStr.Append(this.myKeyList[i].Keywords).Append(" ");
        }

        // Restore the hash ordering that Append relies on for its lookups.
        Sorter.DimidiateSort<KeywordsInfo>(this.myKeyList, KeywordsInfoHashCompare);
        return myStr.ToString();
    }

    /// <summary>
    /// Compares two entries by their occurrence count.
    /// </summary>
    /// <param name="value1">First entry.</param>
    /// <param name="value2">Second entry.</param>
    /// <returns>1, -1 or 0 when the first count is greater, smaller or equal.</returns>
    private int KeywordsInfoCountCompare(KeywordsInfo value1, KeywordsInfo value2)
    {
        if (value1.Count > value2.Count)
        {
            return 1;
        }

        if (value1.Count < value2.Count)
        {
            return -1;
        }

        return 0;
    }

    /// <summary>
    /// Compares two entries by the hash code of their keyword, falling back to an
    /// ordinal comparison of the keyword text when the hash codes are equal.
    /// </summary>
    /// <param name="value1">First entry.</param>
    /// <param name="value2">Second entry.</param>
    /// <returns>1, -1 or 0; 0 only when both keywords are the same string.</returns>
    private int KeywordsInfoHashCompare(KeywordsInfo value1, KeywordsInfo value2)
    {
        int hash1 = value1.Keywords.GetHashCode();
        int hash2 = value2.Keywords.GetHashCode();
        if (hash1 > hash2)
        {
            return 1;
        }

        if (hash1 < hash2)
        {
            return -1;
        }

        // Equal hash codes do not imply equal strings; without this tie-break two
        // different keywords that collide would share one entry.
        int ordinal = string.CompareOrdinal(value1.Keywords, value2.Keywords);
        if (ordinal > 0)
        {
            return 1;
        }

        if (ordinal < 0)
        {
            return -1;
        }

        return 0;
    }

    /// <summary>
    /// Compares two entries by their total score (<c>AllMark</c>).
    /// </summary>
    /// <param name="value1">First entry.</param>
    /// <param name="value2">Second entry.</param>
    /// <returns>1, -1 or 0 when the first score is greater, smaller or equal.</returns>
    private int KeywordsInfoAllMarkCompare(KeywordsInfo value1, KeywordsInfo value2)
    {
        if (value1.AllMark > value2.AllMark)
        {
            return 1;
        }

        if (value1.AllMark < value2.AllMark)
        {
            return -1;
        }

        return 0;
    }
}
/// <summary>
/// Statistics collected for a single keyword.
/// </summary>
public struct KeywordsInfo
{
    /// <summary>
    /// The keyword text.
    /// </summary>
    public string Keywords;

    /// <summary>
    /// Number of occurrences.
    /// </summary>
    public short Count;

    /// <summary>
    /// Score of the keyword.
    /// </summary>
    public float WordsMark;

    /// <summary>
    /// Positions at which the keyword occurred.
    /// </summary>
    public List<int> PositionList;

    /// <summary>
    /// Creates an entry for the given keyword with zero count, zero score
    /// and an empty position list.
    /// </summary>
    /// <param name="_Keywords">The keyword text.</param>
    public KeywordsInfo(string _Keywords)
    {
        PositionList = new List<int>();
        WordsMark = 0f;
        Count = 0;
        Keywords = _Keywords;
    }

    /// <summary>
    /// Total score: the occurrence count multiplied by the keyword score.
    /// </summary>
    public float AllMark
    {
        get { return WordsMark * Count; }
    }

    /// <summary>
    /// Always returns null.
    /// </summary>
    public byte[] SaveBytes
    {
        get { return null; }
    }
}
/// <summary>
/// Maps part-of-speech tags to keyword scores.
/// </summary>
public class WordsType
{
    // Score of the empty tag and of every tag missing from the table.
    const float DEFAULT_MARK = 0.8f;

    Dictionary<string, float> typeList;

    /// <summary>
    /// Builds the tag-to-score table.
    /// </summary>
    public WordsType()
    {
        float halfMark = (float)(DEFAULT_MARK * 0.50);

        // Every listed tag except "w" carries half of the default score.
        string[] halfMarkTags = new string[]
        {
            "Ag", "a", "ad", "an", "b", "c", "Dg", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "Ng", "n", "nr", "ns", "nt", "nz", "o", "p", "q", "r", "s", "Tg", "t", "u",
            "Vg", "v", "vd", "vn", "x", "y", "z"
        };

        this.typeList = new Dictionary<string, float>();
        this.typeList.Add("", DEFAULT_MARK);
        foreach (string tag in halfMarkTags)
        {
            this.typeList.Add(tag, halfMark);
        }

        // "w" is the only tag with a score of zero.
        this.typeList.Add("w", (float)(DEFAULT_MARK * 0));
    }

    /// <summary>
    /// Looks up the score for a part-of-speech tag.
    /// </summary>
    /// <param name="wordsType">Part-of-speech tag; may be null.</param>
    /// <returns>
    /// The score stored for the tag, or the default score when the tag is null
    /// or not in the table.
    /// </returns>
    public float GetMark(string wordsType)
    {
        float mark;
        if (wordsType == null || !this.typeList.TryGetValue(wordsType, out mark))
        {
            return DEFAULT_MARK;
        }

        return mark;
    }
}
- /*
- 词性表
- Ag 形语素
- a 形容词
- ad 副形词
- an 名形词
- b 区别词
- c 连词
- Dg 副语素
- d 副词
- e 叹词
- f 方位词
- g 语素
- h 前接成分
- i 成语
- j 简称略语
- k 后接成分
- l 习用语
- m 数词
- Ng 名语素
- n 名词
- nr 人名
- ns 地名
- nt 机构团体
- nz 其他专名
- o 拟声词
- p 介词
- q 量词
- r 代词
- s 处所词
- Tg 时语素
- t 时间词
- u 助词
- Vg 动语素
- v 动词
- vd 副动词
- vn 名动词
- w 标点符号
- x 非语素字
- y 语气词
- z 状态词
- */
- /*
- public struct WordsTypeMark
- {
- public string WordsType;
- public byte Mark;
- public WordsTypeMark(string _WordsType, byte _Mark)
- {
- this.WordsType = _WordsType;
- this.Mark = _Mark;
- }
- }*/
- }