基本信息
源码名称:C# 网站采集例子源码
源码大小:2.31M
文件格式:.rar
开发语言:C#
更新时间:2015-06-01
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为:2 元
微信扫码支付:2 元
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using CollectBlogData.Models;

namespace CollectBlogData.Utility
{
    /// <summary>
    /// Helpers for downloading the cnblogs.com post list and extracting
    /// article data from the returned HTML with regular expressions.
    /// </summary>
    public sealed class HttpUtility
    {
        // All patterns use Singleline so that '.' also matches line breaks inside the HTML.
        // They are built once here instead of once per article inside the parsing loop.

        // One match per post; the lookahead stops each match right before the
        // closing "clear" div so that consecutive posts do not swallow each other.
        private static readonly Regex PostItemRegex = new Regex(
            "<div class=\"post_item\">(?<content>.*?)(?=<div class=\"clear\">" + @"</div>\s*</div>)",
            RegexOptions.Singleline);

        // Recommendation ("digg") counter.
        private static readonly Regex DiggRegex = new Regex(
            "<div class=\"digg\">.*<span.*>(?<digNum>.*)" + @"</span>" + ".*<div class=\"post_item_body\">",
            RegexOptions.Singleline);

        // Content of the <h3> element, which holds the title link.
        private static readonly Regex TitleBlockRegex = new Regex(
            "<h3>(?<a>.*)</h3>",
            RegexOptions.Singleline);

        // href and inner text of the title link.
        private static readonly Regex TitleLinkRegex = new Regex(
            "<a\\s.*href=\"(?<href>.*?)\".*>(?<summary>.*)</a>",
            RegexOptions.Singleline);

        // The <img> tag wrapped in a link (author avatar).
        private static readonly Regex AuthorImageTagRegex = new Regex(
            "<a.*>(?<img><img[^>].*>)</a>",
            RegexOptions.Singleline);

        // src attribute of the avatar tag.
        // NOTE(review): the copy of this file under review read "\s .*/>" here; every '+'
        // in that copy had been lost, so the quantifier is restored as "\s+" - confirm
        // against the original project.
        private static readonly Regex ImageSourceRegex = new Regex(
            @"<img.*src=(?<path>.*)\s+.*/>",
            RegexOptions.Singleline);

        // Link carrying both href and target attributes (author blog link).
        private static readonly Regex AuthorLinkRegex = new Regex(
            "<a\\s*?href=\"(?<href>.*)\"\\s*?target=\"(?<target>.*?)\">.*</a>",
            RegexOptions.Singleline);

        // Whole content of the summary paragraph.
        private static readonly Regex SummaryBlockRegex = new Regex(
            "<p class=\"post_item_summary\">(?<summary>.*)</p>",
            RegexOptions.Singleline);

        // Text that follows the first closing </a> inside the summary paragraph.
        private static readonly Regex SummaryTextRegex = new Regex(
            "(?<indroduct>(?<=</a>).*)",
            RegexOptions.Singleline);

        // Publisher name and publish time in the post footer.
        private static readonly Regex FooterRegex = new Regex(
            "<div class=\"post_item_foot\">\\s*<a.*?>(?<publishName>.*)</a>(?<publishTime>.*)<span class=\"article_comment\">",
            RegexOptions.Singleline);

        // Comment counter.
        private static readonly Regex CommentRegex = new Regex(
            "<span class=\"article_comment\"><a.*>(?<comment>.*)</a></span><span class=\"article_view\">",
            RegexOptions.Singleline);

        // Read counter.
        private static readonly Regex ReadRegex = new Regex(
            "<span\\s*class=\"article_view\"><a.*>(?<readNum>.*)</a>",
            RegexOptions.Singleline);

        /// <summary>
        /// Downloads the site home page (the first page of posts) with a GET request.
        /// </summary>
        /// <returns>The response body decoded as UTF-8.</returns>
        public static string HttpGetHtml()
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.cnblogs.com/");
            request.Accept = "text/plain, */*; q=0.01";
            request.Method = "GET";
            request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            request.ContentLength = 0;
            request.KeepAlive = false;
            request.Host = "www.cnblogs.com";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0";

            return ReadResponseBody(request);
        }

        /// <summary>
        /// Downloads one page of the post list by POSTing the paging parameters as JSON.
        /// </summary>
        /// <param name="pageIndex">
        /// Page number; it is inserted verbatim as the JSON value of "PageIndex",
        /// so it is expected to be a numeric string.
        /// </param>
        /// <returns>The response body decoded as UTF-8.</returns>
        public static string HttpGetPageHtml(string pageIndex)
        {
            Encoding encoding = Encoding.UTF8;

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.cnblogs.com/mvc/AggSite/PostList.aspx");
            request.Accept = "text/plain, */*; q=0.01";
            request.Method = "POST";
            // Preferred response languages.
            request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            // Accepted response character sets.
            request.Headers.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");

            // The paging parameters are sent as a JSON document in the request body.
            string postData = "{\"CategoryType\":\"SiteHome\",\"ParentCategoryId\":0,\"CategoryId\":808,\"PageIndex\":"
                + pageIndex
                + ",\"ItemListActionName\":\"PostList\"}";
            byte[] postBytes = encoding.GetBytes(postData);

            request.ContentType = "application/json; charset=UTF-8";
            // Do not keep the connection open after the response.
            request.KeepAlive = false;
            request.Host = "www.cnblogs.com";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.1.3.5000 Chrome/26.0.1410.43 Safari/537.1";
            request.ContentLength = postBytes.Length;

            // Close the request stream before asking for the response so the body
            // is flushed and the underlying connection is released.
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(postBytes, 0, postBytes.Length);
            }

            return ReadResponseBody(request);
        }

        /// <summary>
        /// Extracts the articles contained in a post-list HTML fragment.
        /// </summary>
        /// <param name="htmlString">HTML returned by one of the download methods.</param>
        /// <returns>
        /// One <see cref="Article"/> per matched post, in document order. The list is
        /// empty when the input is null, empty, or contains no post. A field whose
        /// pattern does not match is set to an empty string.
        /// </returns>
        public static List<Article> GetArticles(string htmlString)
        {
            List<Article> articleList = new List<Article>();
            if (string.IsNullOrEmpty(htmlString))
            {
                return articleList;
            }

            foreach (Match item in PostItemRegex.Matches(htmlString))
            {
                string itemHtml = item.Value;
                Article article = new Article();

                // Recommendation count.
                article.DiggNum = DiggRegex.Match(itemHtml).Groups["digNum"].Value;

                // Title: take the <h3> content first, then the link inside it.
                string titleHtml = TitleBlockRegex.Match(itemHtml).Groups["a"].Value;
                Match titleLink = TitleLinkRegex.Match(titleHtml);
                article.AritcleUrl = titleLink.Groups["href"].Value;
                article.AritcleTitle = titleLink.Groups["summary"].Value;

                // Author avatar: take the <img> tag first, then its src without the quotes.
                string imageTag = AuthorImageTagRegex.Match(itemHtml).Groups["img"].Value;
                article.AuthorImg = ImageSourceRegex.Match(imageTag).Groups["path"].Value.Trim('"');

                // Author blog URL.
                article.AuthorUrl = AuthorLinkRegex.Match(itemHtml).Groups["href"].Value;

                // Summary: take the whole summary paragraph, then the text after the link.
                string summaryHtml = SummaryBlockRegex.Match(itemHtml).Groups["summary"].Value;
                article.AritcleSummary = SummaryTextRegex.Match(summaryHtml).Groups["indroduct"].Value;

                // Publisher and publish time.
                Match footer = FooterRegex.Match(itemHtml);
                article.Author = footer.Groups["publishName"].Value;
                article.PublishTime = footer.Groups["publishTime"].Value.Trim();

                // Comment count.
                article.CommentNum = CommentRegex.Match(itemHtml).Groups["comment"].Value;

                // Read count.
                article.ReadNum = ReadRegex.Match(itemHtml).Groups["readNum"].Value;

                articleList.Add(article);
            }

            return articleList;
        }

        /// <summary>
        /// Removes every line feed, tab and carriage return from the text and
        /// replaces every double quote with a single quote.
        /// </summary>
        /// <param name="htmlString">Text to clean.</param>
        /// <returns>The cleaned text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlString"/> is null.</exception>
        public static string ClearSpecialTag(string htmlString)
        {
            if (htmlString == null)
            {
                throw new ArgumentNullException("htmlString");
            }

            // Plain literal replacements; no regular expression is needed.
            return htmlString
                .Replace("\n", "")
                .Replace("\t", "")
                .Replace("\r", "")
                .Replace("\"", "'");
        }

        // Sends the request and returns the whole response body decoded as UTF-8,
        // disposing the response, its stream and the reader afterwards.
        private static string ReadResponseBody(HttpWebRequest request)
        {
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }
}