基本信息
源码名称:C#写的蜘蛛程序(小偷程序)
源码大小:0.57M
文件格式:.rar
开发语言:C#
更新时间:2015-11-06
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为:2 元
微信扫码支付:2 元
×
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
C#写的蜘蛛程序也叫小偷程序
"蜘蛛"(Spider)是Internet上一种很有用的程序,搜索引擎利用蜘蛛程序将Web页面收集到数据库,企业利用蜘蛛程序监视竞争对手的网站并跟踪变动,个人用户用蜘蛛程序下载Web页面以便脱机使用,开发者利用蜘蛛程序扫描自己的Web检查无效的链接……对于不同的用户,蜘蛛程序有不同的用途。那么,蜘蛛程序到底是怎样工作的呢?
蜘蛛是一种半自动的程序,就像现实当中的蜘蛛在它的Web(蜘蛛网)上旅行一样,蜘蛛程序也按照类似的方式在Web链接织成的网上旅行。蜘蛛程序之所以是半自动的,是因为它总是需要一个初始链接(出发点),但此后的运行情况就要由它自己决定了,蜘蛛程序会扫描起始页面包含的链接,然后访问这些链接指向的页面,再分析和追踪那些页面包含的链接。从理论上看,最终蜘蛛程序会访问到Internet上的每一个页面,因为Internet上几乎每一个页面总是被其他或多或少的页面引用。
namespace Spider
{
    // Imported inside the namespace so that this listing is self-contained.
    using System;
    using System.IO;
    using System.Net;
    using System.Text;
    using System.Threading;

    /// <summary>
    /// Performs all of the work of a single spider thread: waiting for a URL
    /// to become available, downloading it, and then processing the page.
    /// </summary>
    public class DocumentWorker
    {
        /// <summary>
        /// The URI currently being downloaded; also used as the base for
        /// resolving relative links and for the same-host check.
        /// </summary>
        private Uri m_uri;

        /// <summary>
        /// The spider that this worker "works for".
        /// </summary>
        private readonly Spider m_spider;

        /// <summary>
        /// The thread that runs <see cref="Process"/>.
        /// </summary>
        private Thread m_thread;

        /// <summary>
        /// The thread number, used to identify this worker.
        /// </summary>
        private int m_number;

        /// <summary>
        /// The file name used for default (directory) documents.
        /// </summary>
        public const string IndexFile = "index.html";

        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="spider">The spider that owns this worker.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="spider"/> is null.
        /// </exception>
        public DocumentWorker(Spider spider)
        {
            if (spider == null)
            {
                throw new ArgumentNullException("spider");
            }

            m_spider = spider;
        }

        /// <summary>
        /// Takes a URI, such as one whose path is /images/blank.gif, and
        /// converts it into the name of a file for local storage under the
        /// spider's output path. Any sub-directories needed to hold the file
        /// are created by this method.
        /// </summary>
        /// <param name="uri">The URI of the file about to be stored.</param>
        /// <returns>The local file name to write to.</returns>
        private string convertFilename(Uri uri)
        {
            // Platform separator: identical to the original hard-coded '\\'
            // on Windows, and produces real directories elsewhere.
            char separator = Path.DirectorySeparatorChar;

            // Callers return early when OutputPath is null.
            string result = m_spider.OutputPath;

            // Add an ending separator if needed.
            if (result[result.Length - 1] != separator)
            {
                result = result + separator;
            }

            // Strip the query string if there is one.
            string path = uri.PathAndQuery;
            int queryIndex = path.IndexOf("?");
            if (queryIndex != -1)
            {
                path = path.Substring(0, queryIndex);
            }

            // A path with no '.' after its last '/' and no trailing '/' is
            // treated as a directory: append the default document name.
            int lastSlash = path.LastIndexOf('/');
            int lastDot = path.LastIndexOf('.');
            if (path[path.Length - 1] != '/')
            {
                if (lastSlash > lastDot)
                {
                    path += "/" + IndexFile;
                }
            }

            // Split into the directory part (kept in 'path', ending with '/')
            // and the actual file name.
            lastSlash = path.LastIndexOf('/');
            string filename = "";
            if (lastSlash != -1)
            {
                filename = path.Substring(1 + lastSlash);
                path = path.Substring(0, 1 + lastSlash);
                if (filename.Equals(""))
                {
                    filename = IndexFile;
                }
            }

            // Create the directory structure, if needed. Scanning starts at
            // index 1 to skip the leading '/' of the URI path.
            int index1 = 1;
            int index2;
            do
            {
                index2 = path.IndexOf('/', index1);
                if (index2 != -1)
                {
                    string dirpart = path.Substring(index1, index2 - index1);
                    result += dirpart;
                    result += separator;

                    Directory.CreateDirectory(result);

                    index1 = index2 + 1;
                }
            } while (index2 != -1);

            // Attach the file name.
            result += filename;

            return result;
        }

        /// <summary>
        /// Saves a binary file to disk. Does nothing when the spider has no
        /// output path.
        /// </summary>
        /// <param name="response">The response whose body is saved.</param>
        private void SaveBinaryFile(WebResponse response)
        {
            if (m_spider.OutputPath == null)
            {
                return;
            }

            byte[] buffer = new byte[1024];
            string filename = convertFilename(response.ResponseUri);

            // 'using' guarantees both streams are closed even if a read or
            // write throws part-way through the copy.
            using (Stream outStream = File.Create(filename))
            using (Stream inStream = response.GetResponseStream())
            {
                int count;
                while ((count = inStream.Read(buffer, 0, buffer.Length)) > 0)
                {
                    outStream.Write(buffer, 0, count);
                }
            }
        }

        /// <summary>
        /// Saves a text file under the name derived from the current URI.
        /// Does nothing when the spider has no output path.
        /// </summary>
        /// <param name="buffer">The text to save.</param>
        private void SaveTextFile(string buffer)
        {
            if (m_spider.OutputPath == null)
            {
                return;
            }

            string filename = convertFilename(m_uri);
            using (StreamWriter outStream = new StreamWriter(filename))
            {
                outStream.Write(buffer);
            }
        }

        /// <summary>
        /// Downloads the current URI. Content whose type does not start with
        /// "text/" is saved as a binary file; text content is saved and
        /// returned.
        /// </summary>
        /// <returns>
        /// The text downloaded from the page, or null if the content was
        /// not text or the download failed.
        /// </returns>
        private string GetPage()
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);
                using (WebResponse response = request.GetResponse())
                {
                    // Ordinal, case-insensitive test; a missing content type
                    // is handled as binary instead of throwing.
                    string contentType = response.ContentType;
                    bool isText = contentType != null
                        && contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase);
                    if (!isText)
                    {
                        SaveBinaryFile(response);
                        return null;
                    }

                    StringBuilder text = new StringBuilder();
                    using (Stream stream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(stream))
                    {
                        // Line endings are normalized to CRLF.
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            text.Append(line).Append("\r\n");
                        }
                    }

                    string buffer = text.ToString();
                    SaveTextFile(buffer);
                    return buffer;
                }
            }
            catch (WebException e)
            {
                Console.WriteLine("下载失败,错误:" + e);
                return null;
            }
            catch (IOException e)
            {
                Console.WriteLine("下载失败,错误:" + e);
                return null;
            }
        }

        /// <summary>
        /// Processes each link encountered. The link is handed to the spider
        /// for later spidering if it is an http or https document and is on
        /// the same host as the current URI. Whether it has been visited
        /// before is left to the spider class.
        /// </summary>
        /// <param name="link">The URL to process.</param>
        private void ProcessLink(string link)
        {
            Uri url;

            // Fully expand this URL if it was a relative link.
            try
            {
                url = new Uri(m_uri, link);
            }
            catch (UriFormatException e)
            {
                Console.WriteLine("Invalid URI:" + link + " Error:" + e.Message);
                return;
            }

            bool isWeb =
                string.Equals(url.Scheme, "http", StringComparison.OrdinalIgnoreCase)
                || string.Equals(url.Scheme, "https", StringComparison.OrdinalIgnoreCase);
            if (!isWeb)
            {
                return;
            }

            // Comment out this check if you would like to spider
            // the whole Internet (yeah right, but it will try).
            if (!string.Equals(url.Host, m_uri.Host, StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            m_spider.addURI(url);
        }

        /// <summary>
        /// Scans a downloaded page and processes the HREF and SRC attribute
        /// of every tag found.
        /// </summary>
        /// <param name="page">The text of the page to scan.</param>
        private void ProcessPage(string page)
        {
            ParseHTML parse = new ParseHTML();
            parse.Source = page;

            while (!parse.Eof())
            {
                char ch = parse.Parse();

                // NOTE(review): a 0 result presumably means a tag was just
                // parsed - confirm against ParseHTML.
                if (ch == 0)
                {
                    Attribute a = parse.GetTag()["HREF"];
                    if (a != null)
                    {
                        ProcessLink(a.Value);
                    }

                    a = parse.GetTag()["SRC"];
                    if (a != null)
                    {
                        ProcessLink(a.Value);
                    }
                }
            }
        }

        /// <summary>
        /// The main loop for the spider threads. Obtains URLs from the
        /// spider and processes them until the spider signals quit.
        /// </summary>
        public void Process()
        {
            while (!m_spider.Quit)
            {
                m_uri = m_spider.ObtainWork();

                m_spider.SpiderDone.WorkerBegin();
                try
                {
                    Console.WriteLine("Download(" + this.Number + "):" + m_uri);
                    string page = GetPage();
                    if (page != null)
                    {
                        ProcessPage(page);
                    }
                }
                finally
                {
                    // Always balance WorkerBegin, even when a download or
                    // parse throws an unexpected exception.
                    m_spider.SpiderDone.WorkerEnd();
                }
            }
        }

        /// <summary>
        /// Starts the worker thread.
        /// </summary>
        public void start()
        {
            ThreadStart ts = new ThreadStart(this.Process);
            m_thread = new Thread(ts);
            m_thread.Start();
        }

        /// <summary>
        /// The thread number. Used only to identify this thread.
        /// </summary>
        public int Number
        {
            get { return m_number; }
            set { m_number = value; }
        }
    }
}