C# 网站采集例子源码

基本信息
源码名称：C# 网站采集例子源码
源码大小：2.31M
文件格式：.rar
开发语言：C#
更新时间：2015-06-01
友情提示：（无需注册或充值，赞助后即可获取资源下载链接）
嘿，亲！知识可是无价之宝呢，但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下，绝对物超所值哦！如有下载和支付问题，请联系我们QQ(微信同号)：78630559
本次赞助数额为：2 元
源码介绍
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using CollectBlogData.Models;

namespace CollectBlogData.Utility
{
    public sealed class HttpUtility
    {
        /// <summary>
        /// Downloads the cnblogs.com home page (the first page of the post list)
        /// with a GET request and returns its HTML.
        /// </summary>
        /// <returns>The response body decoded as UTF-8.</returns>
        /// <exception cref="WebException">Thrown by <c>GetResponse</c> when the request fails.</exception>
        public static string HttpGetHtml()
        {
            HttpWebRequest request = (HttpWebRequest) WebRequest.Create("http://www.cnblogs.com/");
            request.Accept = "text/plain, */*; q=0.01";
            request.Method = "GET";
            request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            request.ContentLength = 0;
            request.KeepAlive = false;
            request.Host = "www.cnblogs.com";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0";

            // Dispose the response, its stream and the reader deterministically so the
            // underlying connection is released even if reading the body throws.
            using (HttpWebResponse response = (HttpWebResponse) request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
        /// <summary>
        /// Fetches one page of the cnblogs.com post list by POSTing a JSON body
        /// to the site's paging endpoint.
        /// </summary>
        /// <param name="pageIndex">
        /// Page number to request. It is inserted verbatim as the JSON <c>PageIndex</c>
        /// value, so it must be the text of a valid JSON number.
        /// </param>
        /// <returns>The response body decoded as UTF-8.</returns>
        /// <exception cref="WebException">Thrown when sending the request or reading the response fails.</exception>
        public static string HttpGetPageHtml(string pageIndex)
        {
            Encoding encoding = Encoding.UTF8;
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.cnblogs.com/mvc/AggSite/PostList.aspx");
            request.Accept = "text/plain, */*; q=0.01";
            request.Method = "POST";
            // Preferred response languages.
            request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            // Preferred response character sets.
            request.Headers.Add("Accept-Charset","GBK,utf-8;q=0.7,*;q=0.3");
            // The paging parameters are sent as a JSON document in the POST body.
            string postData =
                "{\"CategoryType\":\"SiteHome\",\"ParentCategoryId\":0,\"CategoryId\":808,\"PageIndex\":" + pageIndex + ",\"ItemListActionName\":\"PostList\"}";
            byte[] postByte = encoding.GetBytes(postData);
            // Media type of the request body.
            request.ContentType = "application/json; charset=UTF-8";
            // Do not keep the connection open after this request.
            request.KeepAlive = false;
            request.Host = "www.cnblogs.com";
            // Client identification sent to the server.
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.1.3.5000 Chrome/26.0.1410.43 Safari/537.1";
            request.ContentLength = postByte.Length;

            // The request stream must be closed before the response is requested;
            // the using block guarantees that even if Write throws.
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(postByte, 0, postByte.Length);
            }

            // Dispose the response, its stream and the reader so the connection is released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        // Patterns used by GetArticles. They are built once instead of on every call
        // and every item; Regex instances are safe to share for matching.

        // One post entry: from the opening post_item div up to (not including) its trailing clear div.
        private static readonly Regex PostItemRegex = new Regex(
            "<div class=\"post_item\">(?<content>.*?)(?=<div class=\"clear\">" + @"</div>\s*</div>)",
            RegexOptions.Singleline);

        // Recommendation ("digg") counter inside the digg div.
        private static readonly Regex DiggRegex = new Regex(
            "<div class=\"digg\">.*<span.*>(?<digNum>.*)" + @"</span>" +
            ".*<div class=\"post_item_body\">", RegexOptions.Singleline);

        // Content of the <h3> heading that wraps the title link.
        private static readonly Regex TitleBlockRegex = new Regex("<h3>(?<a>.*)</h3>", RegexOptions.Singleline);

        // href and inner text of the title link.
        private static readonly Regex TitleLinkRegex = new Regex(
            "<a\\s.*href=\"(?<href>.*?)\".*>(?<summary>.*)</a>", RegexOptions.Singleline);

        // The <img> tag nested inside a link (author avatar).
        private static readonly Regex AvatarTagRegex = new Regex(
            "<a.*>(?<img><img[^>].*>)</a>", RegexOptions.Singleline);

        // The src attribute value (still quoted) of the avatar tag.
        // NOTE(review): the source text read "\s .*" here; "\s+" is restored on the assumption
        // that the '+' was lost the same way as the '+' operators elsewhere in this file.
        private static readonly Regex AvatarSrcRegex = new Regex(
            @"<img.*src=(?<path>.*)\s+.*/>", RegexOptions.Singleline);

        // href of a link that carries a target attribute (author blog URL).
        private static readonly Regex AuthorLinkRegex = new Regex(
            "<a\\s*?href=\"(?<href>.*)\"\\s*?target=\"(?<target>.*?)\">.*</a>", RegexOptions.Singleline);

        // Everything inside the summary paragraph.
        private static readonly Regex SummaryBlockRegex = new Regex(
            "<p class=\"post_item_summary\">(?<summary>.*)</p>", RegexOptions.Singleline);

        // The text that follows a closing </a> inside the summary paragraph.
        private static readonly Regex SummaryTextRegex = new Regex(
            "(?<indroduct>(?<=</a>).*)", RegexOptions.Singleline);

        // Publisher name and publish time from the item footer.
        private static readonly Regex FooterRegex = new Regex(
            "<div class=\"post_item_foot\">\\s*<a.*?>(?<publishName>.*)</a>(?<publishTime>.*)<span class=\"article_comment\">",
            RegexOptions.Singleline);

        // Comment counter text.
        private static readonly Regex CommentRegex = new Regex(
            "<span class=\"article_comment\"><a.*>(?<comment>.*)</a></span><span class=\"article_view\">",
            RegexOptions.Singleline);

        // Read counter text.
        private static readonly Regex ViewRegex = new Regex(
            "<span\\s*class=\"article_view\"><a.*>(?<readNum>.*)</a>", RegexOptions.Singleline);

        /// <summary>
        /// Extracts the post entries from a post-list HTML fragment using regular expressions.
        /// </summary>
        /// <param name="htmlString">HTML containing zero or more <c>post_item</c> blocks.</param>
        /// <returns>
        /// One <see cref="Article"/> per matched block, in document order; an empty list when
        /// nothing matches. A field whose pattern does not match is set to an empty string.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlString"/> is null.</exception>
        public static List<Article> GetArticles(string htmlString)
        {
            List<Article> articleList = new List<Article>();

            // Matches() returns an empty collection when nothing matches, so no IsMatch pre-scan is needed.
            foreach (Match item in PostItemRegex.Matches(htmlString))
            {
                string itemHtml = item.Value;
                Article article = new Article();

                // Recommendation count.
                article.DiggNum = DiggRegex.Match(itemHtml).Groups["digNum"].Value;

                // Title and URL: isolate the <h3> content first, then read the link inside it.
                string heading = TitleBlockRegex.Match(itemHtml).Groups["a"].Value;
                Match titleLink = TitleLinkRegex.Match(heading);
                article.AritcleUrl = titleLink.Groups["href"].Value;
                article.AritcleTitle = titleLink.Groups["summary"].Value;

                // Author avatar: isolate the <img> tag, then take its src value without the quotes.
                string avatarTag = AvatarTagRegex.Match(itemHtml).Groups["img"].Value;
                article.AuthorImg = AvatarSrcRegex.Match(avatarTag).Groups["path"].Value.Trim('"');

                // Author blog URL.
                article.AuthorUrl = AuthorLinkRegex.Match(itemHtml).Groups["href"].Value;

                // Summary: take the whole summary paragraph, then keep what follows the </a>.
                string summaryBlock = SummaryBlockRegex.Match(itemHtml).Groups["summary"].Value;
                article.AritcleSummary = SummaryTextRegex.Match(summaryBlock).Groups["indroduct"].Value;

                // Publisher name and publish time share one footer match.
                Match footer = FooterRegex.Match(itemHtml);
                article.Author = footer.Groups["publishName"].Value;
                article.PublishTime = footer.Groups["publishTime"].Value.Trim();

                // Comment and read counters.
                article.CommentNum = CommentRegex.Match(itemHtml).Groups["comment"].Value;
                article.ReadNum = ViewRegex.Match(itemHtml).Groups["readNum"].Value;

                articleList.Add(article);
            }

            return articleList;
        }


        /// <summary>
        /// Strips line-feed, tab and carriage-return characters from the given text
        /// and turns every double quote into a single quote.
        /// </summary>
        /// <param name="htmlString">Text to clean.</param>
        /// <returns>The cleaned text.</returns>
        public static string ClearSpecialTag(string htmlString)
        {
            // Each row is { pattern, replacement }; rows are applied from top to bottom.
            string[,] substitutions =
            {
                { "\n", "" },
                { "\t", "" },
                { "\r", "" },
                { "\"", "'" }
            };

            string cleaned = htmlString;
            for (int row = 0; row < substitutions.GetLength(0); row++)
            {
                cleaned = Regex.Replace(
                    cleaned,
                    substitutions[row, 0],
                    substitutions[row, 1],
                    RegexOptions.IgnoreCase);
            }

            return cleaned;
        }
    }
}