ClassUrlString.cs
上传用户:zhangkuixh
上传日期:2013-09-30
资源大小:5473k
文件大小:13k
源码类别:

搜索引擎

开发平台:

C#

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Text;
  4. /*
  5.       '       迅龙中文分类搜索引擎  v0.6
  6.       '
  7.       '        LGPL  许可发行
  8.       '
  9.       '       宁夏大学  张冬 康彩  zd4004@163.com
  10.       ' 
  11.       '        官网 http://blog.163.com/zd4004/
  12.  */
  13. namespace XunLong.UrlStringLib
  14. {
  /// <summary>
  /// Heuristic URL similarity scoring. Both methods compare two URLs purely
  /// as strings split on '/', and return a small integer score where a larger
  /// value means "more alike".
  /// </summary>
  public static class ClassUrlString
  {
      /// <summary>
      /// Separator placed between path segments when they are re-joined into
      /// the directory and file parts. It only needs to be a character that
      /// cannot occur inside a '/'-split segment's role as a delimiter.
      /// </summary>
      private const string SegmentSeparator = "\\";

      /// <summary>
      /// Scores the similarity of two URLs.
      /// </summary>
      /// <param name="url1">First URL; may be null.</param>
      /// <param name="url2">Second URL; may be null.</param>
      /// <returns>
      /// 0 when either URL is null or blank, has fewer than three
      /// '/'-separated segments, or the scheme or host segments differ;
      /// 50 when the trimmed URLs are identical;
      /// 10 when the directory parts differ, only one URL has a query string,
      /// the names before '?' differ, or the extensions differ or only one
      /// file has an extension;
      /// 12 when both have a query string but only one of them contains an
      /// ampersand;
      /// 13 (both contain an ampersand) or 14 (neither does) when the query
      /// strings have different lengths;
      /// 14 plus the number of positions holding the same character, capped
      /// at 40, when the query strings have the same length;
      /// 12 plus the directory depth, capped at 18, when both files share an
      /// htm, html or shtml extension;
      /// 11 plus the directory depth, capped at 16, when both files share any
      /// other extension;
      /// 13 when neither file has an extension.
      /// </returns>
      public static int Url2Url20(string url1, string url2)
      {
          if (url1 == null || url2 == null)
          {
              return 0;
          }

          url1 = url1.Trim();
          url2 = url2.Trim();
          if (url1.Length == 0 || url2.Length == 0)
          {
              return 0;
          }

          if (url1 == url2)
          {
              // Identical once surrounding whitespace is removed.
              return 50;
          }

          // Example shape: http://ent.163.com/06/0908/10/2QG8EN3N00031H2L.html
          // Splitting on '/' yields "http:", "", host, dir..., file.
          string[] segments1 = url1.Split('/');
          string[] segments2 = url2.Split('/');
          if (segments1.Length < 3 || segments2.Length < 3)
          {
              return 0;
          }

          // Segment 0 is the scheme and segment 2 the host. They are compared
          // separately so a '_' inside a segment cannot fake a match.
          if (segments1[0] != segments2[0] || segments1[2] != segments2[2])
          {
              return 0;
          }

          string directory1;
          string file1;
          int depth1;
          SplitDirectoryAndFile(segments1, out directory1, out file1, out depth1);

          string directory2;
          string file2;
          int depth2;
          SplitDirectoryAndFile(segments2, out directory2, out file2, out depth2);

          // The directory parts must match before the files are compared.
          if (directory1 != directory2)
          {
              return 10;
          }

          bool hasQuery1 = file1.IndexOf('?') > -1;
          bool hasQuery2 = file2.IndexOf('?') > -1;
          if (hasQuery1 != hasQuery2)
          {
              // One URL carries parameters and the other does not.
              return 10;
          }

          // The directories are equal here, so both depths are equal too;
          // the second one is used, as before.
          return hasQuery1
              ? ScoreFilesWithQuery(file1, file2)
              : ScoreFilesWithoutQuery(file1, file2, depth2);
      }

      /// <summary>
      /// Splits '/'-separated URL segments into a directory part and a file
      /// part. The last segment is never treated as a directory. The first
      /// earlier segment containing '=' or '?' starts the file part, which
      /// then consists of that segment and all following ones re-joined.
      /// </summary>
      /// <param name="segments">The URL split on '/'; at least one element.</param>
      /// <param name="directory">Directory segments, each followed by the separator.</param>
      /// <param name="file">The file part, including any query string.</param>
      /// <param name="depth">
      /// Index of the last directory segment minus one, so the host segment
      /// counts as depth 1; 0 when there is no directory segment.
      /// </param>
      private static void SplitDirectoryAndFile(string[] segments, out string directory, out string file, out int depth)
      {
          StringBuilder directoryBuilder = new StringBuilder();
          file = "";
          depth = 0;

          for (int i = 0; i < segments.Length - 1; i++)
          {
              string segment = segments[i];
              if (segment.IndexOf('=') > -1 || segment.IndexOf('?') > -1)
              {
                  // Everything from this segment to the end is the file part.
                  file = string.Join(SegmentSeparator, segments, i, segments.Length - i);
                  break;
              }

              directoryBuilder.Append(segment).Append(SegmentSeparator);
              depth = i - 1;
          }

          if (file.Length == 0)
          {
              file = segments[segments.Length - 1];
          }

          directory = directoryBuilder.ToString();
      }

      /// <summary>
      /// Scores two file parts that both contain a '?'.
      /// </summary>
      /// <param name="file1">First file part, containing '?'.</param>
      /// <param name="file2">Second file part, containing '?'.</param>
      /// <returns>10, 12, 13, 14, or 14 plus the matching character count capped at 40.</returns>
      private static int ScoreFilesWithQuery(string file1, string file2)
      {
          // Both inputs contain '?', so each split has at least two elements.
          string[] pieces1 = file1.Split('?');
          string[] pieces2 = file2.Split('?');
          if (pieces1[0] != pieces2[0])
          {
              return 10;
          }

          string query1 = pieces1[1];
          string query2 = pieces2[1];

          bool hasSeveral1 = query1.IndexOf('&') > -1;
          bool hasSeveral2 = query2.IndexOf('&') > -1;
          if (hasSeveral1 != hasSeveral2)
          {
              // Different parameter structure.
              return 12;
          }

          if (query1.Length != query2.Length)
          {
              return 12 + (hasSeveral1 ? 1 : 2);
          }

          // Same length: reward every position holding the same character.
          int matches = 0;
          for (int i = 0; i < query1.Length; i++)
          {
              if (query1[i] == query2[i])
              {
                  matches++;
              }
          }

          return Math.Min(14 + matches, 40);
      }

      /// <summary>
      /// Scores two file parts that contain no '?', based on their extensions.
      /// </summary>
      /// <param name="file1">First file part.</param>
      /// <param name="file2">Second file part.</param>
      /// <param name="depth">Directory depth added to the base score.</param>
      /// <returns>10, 13, or a depth-based score capped at 18 (htm family) or 16.</returns>
      private static int ScoreFilesWithoutQuery(string file1, string file2, int depth)
      {
          // A leading '.' does not count as an extension separator.
          bool hasExtension1 = file1.IndexOf('.') > 0;
          bool hasExtension2 = file2.IndexOf('.') > 0;
          if (hasExtension1 != hasExtension2)
          {
              return 10;
          }

          if (!hasExtension1)
          {
              // Neither file has an extension.
              return 13;
          }

          string extension1 = file1.Substring(file1.LastIndexOf('.') + 1);
          string extension2 = file2.Substring(file2.LastIndexOf('.') + 1);
          if (extension1 != extension2)
          {
              return 10;
          }

          bool isHtmlFamily =
              string.Equals(extension1, "htm", StringComparison.OrdinalIgnoreCase) ||
              string.Equals(extension1, "html", StringComparison.OrdinalIgnoreCase) ||
              string.Equals(extension1, "shtml", StringComparison.OrdinalIgnoreCase);

          return isHtmlFamily
              ? Math.Min(12 + depth, 18)
              : Math.Min(11 + depth, 16);
      }

      /// <summary>
      /// Coarse URL similarity. The comparison is case-insensitive and only
      /// URLs starting with "http://" are recognised.
      /// </summary>
      /// <param name="url1">First URL; may be null.</param>
      /// <param name="url2">Second URL; may be null.</param>
      /// <returns>
      /// 12 when the URLs differ only by a trailing '/', or share host, every
      /// directory segment, and either the name before '?' (both with
      /// parameters) or the extension (both without parameters);
      /// 10 when they only share the host;
      /// 0 when either URL is null, does not start with "http://", or the
      /// hosts differ.
      /// </returns>
      public static int Url2Url(string url1, string url2)
      {
          if (url1 == null || url2 == null)
          {
              return 0;
          }

          // Invariant lower-casing keeps the result independent of the
          // current thread culture.
          url1 = url1.ToLowerInvariant().Trim();
          url2 = url2.ToLowerInvariant().Trim();

          if (url1 + "/" == url2 || url1 == url2 + "/")
          {
              return 12;
          }

          if (!url1.StartsWith("http://", StringComparison.Ordinal) ||
              !url2.StartsWith("http://", StringComparison.Ordinal))
          {
              return 0;
          }

          // After the "http://" check each split has at least three elements:
          // "http:", "" and the host.
          string[] segments1 = url1.Split('/');
          string[] segments2 = url2.Split('/');
          if (segments1[2] != segments2[2])
          {
              return 0;
          }

          // Same site from here on.
          if (segments1.Length != segments2.Length)
          {
              return 10;
          }

          // Every directory segment between the host and the file must match.
          for (int i = 3; i < segments1.Length - 1; i++)
          {
              if (segments1[i] != segments2[i])
              {
                  return 10;
              }
          }

          string file1 = segments1[segments1.Length - 1];
          string file2 = segments2[segments2.Length - 1];

          bool hasQuery1 = file1.IndexOf('?') > -1;
          bool hasQuery2 = file2.IndexOf('?') > -1;
          if (hasQuery1 != hasQuery2)
          {
              return 10;
          }

          if (hasQuery1)
          {
              // Both carry parameters: compare the name before '?'.
              return file1.Split('?')[0] == file2.Split('?')[0] ? 12 : 10;
          }

          // No parameters: compare the extensions, if both have one.
          int dot1 = file1.LastIndexOf('.');
          int dot2 = file2.LastIndexOf('.');
          if (dot1 == -1 || dot2 == -1)
          {
              return 10;
          }

          return file1.Substring(dot1 + 1) == file2.Substring(dot2 + 1) ? 12 : 10;
      }
  }
  372. }