基于C#实现网页爬虫

前端技术 2023/09/09 C#

本文实例为大家分享了基于C#实现网页爬虫的详细代码，供大家参考，具体内容如下：

HTTP请求工具类：

功能：

1、获取网页html

2、下载网络图片

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
  /// <summary>
  /// HTTP helper used by the crawler: fetches a page's HTML and downloads remote files
  /// into a "download" folder next to the executable.
  /// </summary>
  public class HttpRequestUtil
  {
    /// <summary>User-Agent header sent with every request.</summary>
    private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

    /// <summary>
    /// Fetches the resource at <paramref name="url"/> and returns its body decoded as UTF-8.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the page.</param>
    /// <returns>The response body as a string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="NotSupportedException">The URL scheme does not produce an HTTP request.</exception>
    /// <exception cref="WebException">The request failed.</exception>
    public static string GetPageHtml(string url)
    {
      HttpWebRequest request = CreateRequest(url);

      // The request (a GET, since no method is set) is sent when GetResponse() is called.
      // The using blocks release the connection and streams even when reading fails.
      using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
      using (Stream responseStream = response.GetResponseStream())
      using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
      {
        return reader.ReadToEnd();
      }
    }

    /// <summary>
    /// Downloads the file at <paramref name="url"/> into the "download" directory under
    /// <c>Application.StartupPath</c>, naming it after the last path segment of the URL.
    /// Returns without downloading when a file with that name already exists.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the file.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// The text after the last '/' is empty or is not a valid file name.
    /// </exception>
    /// <exception cref="WebException">The request failed.</exception>
    /// <exception cref="IOException">The local file could not be written.</exception>
    public static void HttpDownloadFile(string url)
    {
      if (url == null)
      {
        throw new ArgumentNullException("url");
      }

      string fileName = url.Substring(url.LastIndexOf('/') + 1);

      // The URL usually comes from scraped HTML, so refuse anything that is not a plain
      // file name (backslashes, drive letters, '?', ...) instead of letting it escape
      // the download directory or fail later with an obscure I/O error.
      if (fileName.Length == 0 || fileName.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
      {
        throw new ArgumentException("URL does not end with a usable file name: " + url, "url");
      }

      string path = Path.Combine(Application.StartupPath, "download");
      // CreateDirectory does nothing when the directory already exists.
      Directory.CreateDirectory(path);

      string filePathName = Path.Combine(path, fileName);
      if (File.Exists(filePathName))
      {
        return;
      }

      HttpWebRequest request = CreateRequest(url);
      request.Proxy = null;

      // The request is sent when GetResponse() is called.
      using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
      using (Stream responseStream = response.GetResponseStream())
      {
        // If opening the file fails (e.g. another thread is writing the same file) the
        // exception leaves before the try block, so that other download is not deleted.
        FileStream fileStream = new FileStream(filePathName, FileMode.Create);
        bool completed = false;
        try
        {
          using (fileStream)
          {
            responseStream.CopyTo(fileStream);
          }
          completed = true;
        }
        finally
        {
          // A truncated file would satisfy the File.Exists shortcut above on the next
          // call and never be repaired, so remove it when the copy did not finish.
          if (!completed)
          {
            TryDelete(filePathName);
          }
        }
      }
    }

    /// <summary>
    /// Creates an HTTP request for <paramref name="url"/> with the shared User-Agent set.
    /// </summary>
    private static HttpWebRequest CreateRequest(string url)
    {
      if (url == null)
      {
        throw new ArgumentNullException("url");
      }

      HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
      if (request == null)
      {
        // WebRequest.Create returns other request types for non-HTTP schemes.
        throw new NotSupportedException("Only http/https URLs are supported: " + url);
      }

      request.UserAgent = UserAgent;
      return request;
    }

    /// <summary>
    /// Best-effort removal of a partially written file; failures are ignored so they do
    /// not mask the exception that interrupted the download.
    /// </summary>
    private static void TryDelete(string filePathName)
    {
      try
      {
        File.Delete(filePathName);
      }
      catch (IOException)
      {
      }
      catch (UnauthorizedAccessException)
      {
      }
    }
  }
}

多线程爬取网页代码：

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;

namespace 爬虫
{
  /// <summary>
  /// Crawler window. button1 starts worker threads that fetch listing pages from
  /// tt.mop.com, open every user page linked from them and download the images found
  /// there; button2 stops the workers, button3 pauses them and button4 resumes them.
  /// </summary>
  /// <remarks>
  /// NOTE(review): pausing and stopping still rely on Thread.Suspend/Resume/Abort, which
  /// are obsolete; they are kept so the buttons keep acting immediately. A cooperative
  /// pause/cancel design would be the long-term replacement.
  /// </remarks>
  public partial class Form1 : Form
  {
    /// <summary>Number of worker threads started per run.</summary>
    private const int ThreadCount = 10;

    /// <summary>Number of listing pages each worker processes.</summary>
    private const int PagesPerThread = 10;

    /// <summary>Total number of listing pages in one run.</summary>
    private const int TotalPages = ThreadCount * PagesPerThread;

    /// <summary>Site that relative user-page links are resolved against.</summary>
    private const string SiteRoot = "http://tt.mop.com";

    // Regex instances are safe to share between threads, so build them once instead of
    // once per page. Group 2 of the first pattern and group 1 of the second hold the URL.
    private static readonly Regex UserLinkRegex = new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");
    private static readonly Regex ImageRegex = new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

    List<Thread> threadList = new List<Thread>();
    Thread thread = null;

    public Form1()
    {
      InitializeComponent();
    }

    /// <summary>
    /// Starts a crawl: launches <see cref="ThreadCount"/> workers, each handling
    /// <see cref="PagesPerThread"/> consecutive listing pages, and keeps the progress
    /// labels up to date.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
      System.Diagnostics.Stopwatch elapsed = System.Diagnostics.Stopwatch.StartNew();
      button3.Enabled = true;
      button2.Enabled = true;
      button1.Enabled = false;
      lblPage.Text = "已完成页数：0";

      // Forget workers from earlier runs that have already ended so the list does not
      // grow with every click.
      threadList.RemoveAll(delegate(Thread finished) { return !finished.IsAlive; });

      // Counters shared by all workers of this run; only touched through Interlocked /
      // VolatileRead because ++ on a shared int is not atomic.
      int page = 0;
      int count = 0;
      int personCount = 0;

      for (int i = 1; i <= ThreadCount; i++)
      {
        Thread worker = new Thread(new ParameterizedThreadStart(delegate(object obj)
        {
          int threadNumber = Convert.ToInt32(obj);
          for (int j = 1; j <= PagesPerThread; j++)
          {
            // Must be local to the worker: a variable shared between threads could be
            // overwritten by another worker before the URL below is built.
            int index = (threadNumber - 1) * PagesPerThread + j;
            try
            {
              string pageHtml = HttpRequestUtil.GetPageHtml(SiteRoot + "/c44/0/1_" + index.ToString() + ".html");
              foreach (Match match in UserLinkRegex.Matches(pageHtml))
              {
                string url = match.Groups[2].Value;
                // Only site-relative links are followed.
                if (!url.StartsWith("/", StringComparison.Ordinal))
                {
                  continue;
                }

                string imgPageHtml = HttpRequestUtil.GetPageHtml(SiteRoot + url);
                Interlocked.Increment(ref personCount);
                RunOnUiThread(delegate()
                {
                  lblPerson.Text = "已完成条数：" + Thread.VolatileRead(ref personCount).ToString();
                });

                foreach (Match matchImgPage in ImageRegex.Matches(imgPageHtml))
                {
                  string imgUrl = matchImgPage.Groups[1].Value;
                  if (!imgUrl.StartsWith("http://i1", StringComparison.Ordinal))
                  {
                    continue;
                  }

                  try
                  {
                    HttpRequestUtil.HttpDownloadFile(imgUrl);
                    Interlocked.Increment(ref count);
                    RunOnUiThread(delegate()
                    {
                      int downloaded = Thread.VolatileRead(ref count);
                      lblNum.Text = "已下载图片数" + downloaded.ToString();
                      double seconds = elapsed.Elapsed.TotalSeconds;
                      if (seconds > 0)
                      {
                        lblSpeed.Text = "速度：" + (downloaded / seconds).ToString("0.0") + "张/秒";
                      }
                    });
                  }
                  catch (Exception ex)
                  {
                    // One failed image must not stop the crawl; record it and move on.
                    System.Diagnostics.Debug.WriteLine("Download failed for " + imgUrl + ": " + ex.Message);
                  }
                  Thread.Sleep(1);
                }
              }
            }
            catch (Exception ex)
            {
              // A failed page is skipped but still counted as processed below.
              System.Diagnostics.Debug.WriteLine("Page " + index.ToString() + " failed: " + ex.Message);
            }

            // Increment returns the new value atomically, so exactly one worker sees
            // TotalPages and reports completion.
            int pagesDone = Interlocked.Increment(ref page);
            RunOnUiThread(delegate()
            {
              lblPage.Text = "已完成页数：" + Thread.VolatileRead(ref page).ToString();
            });

            if (pagesDone == TotalPages)
            {
              RunOnUiThread(delegate()
              {
                button1.Enabled = true;
                MessageBox.Show("完成！");
              });
            }
          }
        }));
        // Background threads cannot keep the process alive after the window closes.
        worker.IsBackground = true;
        worker.Start(i);
        thread = worker;
        threadList.Add(worker);
      }
    }

    /// <summary>Stops the crawl by aborting every worker and resets the buttons.</summary>
    private void button2_Click(object sender, EventArgs e)
    {
      AbortAllThreads();
      button1.Enabled = true;
      button2.Enabled = false;
      button3.Enabled = false;
      button4.Enabled = false;
    }

    /// <summary>Aborts all workers when the window is closing.</summary>
    private void Form1_FormClosing(object sender, FormClosingEventArgs e)
    {
      AbortAllThreads();
    }

    /// <summary>Pauses the crawl by suspending every live worker.</summary>
    private void button3_Click(object sender, EventArgs e)
    {
      foreach (Thread worker in threadList)
      {
        // Workers spend most of their time blocked in network or UI calls, so their
        // state is rarely exactly Running; suspend every live thread not yet suspended.
        if (worker.IsAlive && !IsSuspended(worker))
        {
          try
          {
            worker.Suspend();
          }
          catch (ThreadStateException)
          {
            // The thread ended between the check and the call.
          }
        }
      }
      button3.Enabled = false;
      button4.Enabled = true;
    }

    /// <summary>Resumes every worker that is suspended.</summary>
    private void button4_Click(object sender, EventArgs e)
    {
      foreach (Thread worker in threadList)
      {
        ResumeIfSuspended(worker);
      }
      button3.Enabled = true;
      button4.Enabled = false;
    }

    /// <summary>
    /// True when the thread is suspended or has a suspension pending. ThreadState is a
    /// flags enum, so the bits are tested rather than compared for equality.
    /// </summary>
    private static bool IsSuspended(Thread worker)
    {
      return (worker.ThreadState & (ThreadState.Suspended | ThreadState.SuspendRequested)) != 0;
    }

    /// <summary>Resumes the thread when it is suspended; does nothing otherwise.</summary>
    private static void ResumeIfSuspended(Thread worker)
    {
      if (!IsSuspended(worker))
      {
        return;
      }

      try
      {
        worker.Resume();
      }
      catch (ThreadStateException)
      {
        // The thread's state changed between the check and the call.
      }
    }

    /// <summary>
    /// Requests an abort of every live worker, resuming suspended ones first because
    /// Abort throws in the caller when the target thread is suspended.
    /// </summary>
    private void AbortAllThreads()
    {
      foreach (Thread worker in threadList)
      {
        if (!worker.IsAlive)
        {
          continue;
        }

        ResumeIfSuspended(worker);
        try
        {
          worker.Abort();
        }
        catch (ThreadStateException)
        {
          // Thrown when the thread is still suspended; the abort request is recorded
          // on the thread regardless.
        }
      }
    }

    /// <summary>
    /// Runs <paramref name="action"/> on the UI thread and waits for it. Does nothing
    /// when the form is already closed, so a worker that outlives the window is not
    /// brought down by a failed Invoke.
    /// </summary>
    private void RunOnUiThread(Action action)
    {
      try
      {
        if (IsDisposed || !IsHandleCreated)
        {
          return;
        }
        Invoke(action);
      }
      catch (InvalidOperationException)
      {
        // Also covers ObjectDisposedException: the form closed between the check
        // above and the call.
      }
    }
  }
}

截图：

以上就是本文的全部内容，希望对大家的学习有所帮助。

本文地址：https://www.stayed.cn/item/23716

转载请注明出处。

本站部分内容来源于网络，如侵犯到您的权益，请联系我。

微信
QQ好友
QQ空间
腾讯微博
新浪微博
人人网

我的博客

人生若只如初见，何事秋风悲画扇。

我的标签

随笔档案

2024-02(2)
2023-06(1)
2023-05(1)
2023-04(14)
2023-03(3)
2023-01(6)
2022-12(5)
2022-11(5)
2022-07(2)
2022-06(4)
2022-05(3)
2022-03(1)
2021-12(6)
2021-11(1)
2021-10(3)
2021-09(5)
2021-07(5)
2021-02(2)
2021-01(7)
2020-12(18)
2020-11(14)
2020-10(12)
2020-09(10)
2020-08(22)
2020-07(2)
2020-06(1)
2020-04(5)
2020-03(9)
2020-02(7)
2020-01(9)
2019-12(8)
2019-11(10)
2019-10(11)
2019-09(17)
2019-08(16)
2019-07(6)
2019-06(3)
2019-04(1)
2019-03(8)
2019-02(5)
2019-01(1)
2018-11(2)
2018-10(3)
2018-09(1)
2018-08(3)
2018-07(3)
2018-06(7)
2018-04(4)
2018-03(5)
2018-02(4)
2018-01(22)
2017-12(3)
2017-11(5)
2017-10(15)
2017-09(26)
2017-08(1)
2017-07(3)