- using System;
- using System.Collections.Generic;
- using System.ComponentModel;
- using System.Data;
- using System.Drawing;
- using System.Linq;
- using System.Text;
- using System.Windows.Forms;
- using System.Net;
- using System.IO;
- using System.Text.RegularExpressions;
namespace WebCrawler
{
    /// <summary>
    /// Form that downloads a fixed article page and displays the text found
    /// inside its <c>&lt;DIV id="p_title"&gt;</c> element in <c>richTextBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form and initializes its designer-generated components.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Load event handler; currently performs no work.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Step 1: downloads the HTML source of the given URL.
        /// </summary>
        /// <param name="Url">Address of the page to download.</param>
        /// <returns>
        /// The response body decoded as GB2312, or an empty string when the
        /// request fails (a message box is shown in that case).
        /// </returns>
        private string GetWebContent(string Url)
        {
            string strResult = "";
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                // Connection timeout in milliseconds.
                request.Timeout = 30000;
                request.Headers.Set("Pragma", "no-cache");

                // Dispose the response, its stream and the reader even when reading
                // throws, so the underlying connection is always released.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream streamReceive = response.GetResponseStream())
                using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
                {
                    strResult = streamReader.ReadToEnd();
                }
            }
            catch (Exception)
            {
                // Best-effort download: report the failure and fall through to
                // return the empty string instead of crashing the UI.
                MessageBox.Show("出错");
            }
            return strResult;
        }

        /// <summary>
        /// Click handler: fetches the article page and shows its title text.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // URL of the page to crawl.
            string Url = "http://society.people.com.cn/GB/1062/10693940.html";

            // HTML source of that URL ("" if the download failed).
            string strWebContent = GetWebContent(Url);

            richTextBox1.Text = ExtractTitle(strWebContent);
        }

        /// <summary>
        /// Step 2: extracts the text between <c>&lt;DIV id="p_title"&gt;</c> and the
        /// next <c>&lt;/div&gt;</c> in the given HTML.
        /// </summary>
        /// <param name="html">HTML source to search; may be null or empty.</param>
        /// <returns>
        /// The captured inner text, or an empty string when the input is
        /// null/empty or the pattern does not match.
        /// </returns>
        private static string ExtractTitle(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return "";
            }

            // Capture group 1 is the element's inner text, so no manual
            // IndexOf/Substring trimming of the surrounding tags is needed.
            Match match = Regex.Match(html, "<DIV id=\"p_title\">(.*?)</div>");
            if (!match.Success)
            {
                return "";
            }

            return match.Groups[1].Value;
        }
    }
}