C#写的蜘蛛程序(小偷程序)

基本信息

源码名称：C#写的蜘蛛程序(小偷程序)

源码大小：0.57M

文件格式：.rar

开发语言：C#

更新时间：2015-11-06

友情提示：（无需注册或充值，赞助后即可获取资源下载链接）

嘿，亲！知识可是无价之宝呢，但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下，绝对物超所值哦！如有下载和支付问题，请联系我们QQ(微信同号)：78630559

本次赞助数额为： 2 元　

源码介绍

C#写的蜘蛛程序也叫小偷程序

"蜘蛛"（Spider）是Internet上一种很有用的程序，搜索引擎利用蜘蛛程序将Web页面收集到数据库，企业利用蜘蛛程序监视竞争对手的网站并跟踪变动，个人用户用蜘蛛程序下载Web页面以便脱机使用，开发者利用蜘蛛程序扫描自己的Web检查无效的链接……对于不同的用户，蜘蛛程序有不同的用途。那么，蜘蛛程序到底是怎样工作的呢？
蜘蛛是一种半自动的程序，就像现实当中的蜘蛛在它的Web（蜘蛛网）上旅行一样，蜘蛛程序也按照类似的方式在Web链接织成的网上旅行。蜘蛛程序之所以是半自动的，是因为它总是需要一个初始链接（出发点），但此后的运行情况就要由它自己决定了：蜘蛛程序会扫描起始页面包含的链接，然后访问这些链接指向的页面，再分析和追踪那些页面包含的链接。从理论上看，最终蜘蛛程序会访问到Internet上的每一个页面，因为Internet上几乎每一个页面都或多或少地被其他页面引用。

namespace Spider
{
	/// <summary>
	/// Perform all of the work of a single thread for the spider.
	/// This involves waiting for a URL to become available, downloading
	/// the page and then processing it.
	/// 
	/// </summary>
	// 完成必须由单个工作线程执行的操作，包括
	// 等待可用的URL，下载和处理页面
	public class DocumentWorker
	{
		/// <summary>
		/// The base URI that is to be spidered.
		/// </summary>
		// Assigned in Process() from m_spider.ObtainWork(); GetPage() downloads it
		// and ProcessLink() resolves relative links against it.
		private Uri m_uri;

		/// <summary>
		/// The spider that this thread "works for"
		/// </summary>
		// The owning spider: supplies the URLs to work on, the output path and
		// the quit flag, and receives every newly discovered URI.
		private Spider m_spider;

		/// <summary>
		/// The thread that is being used.
		/// </summary>
		private Thread m_thread;

		/// <summary>
		/// The thread number, used to identify this worker.
		/// </summary>
		// Thread number, used to identify the current worker thread;
		// exposed through the Number property.
		private int m_number;
		

		/// <summary>
		/// The name for default documents.
		/// </summary>
		// Name of the default document; convertFilename() uses it when a URL
		// path does not name a file.
		public const string IndexFile = "index.html";

		/// <summary>
		/// Creates a worker that is bound to the given spider.
		/// </summary>
		/// <param name="spider">The spider that owns this worker.</param>
		public DocumentWorker(Spider spider)
		{
			// Remember the owner; every other member reaches the spider through this field.
			this.m_spider = spider;
		}

		/// <summary>
		/// Converts a URI, such as /images/blank.gif, into the name of a
		/// local file beneath the spider's output path. Any directories
		/// needed to hold that file are created as a side effect.
		/// </summary>
		/// <param name="uri">The URI of the file about to be stored.</param>
		/// <returns>The backslash-separated local file name for the URI.</returns>
		private string convertFilename(Uri uri)
		{
			string result = m_spider.OutputPath;

			// Add a trailing backslash if needed. An empty output path is left
			// untouched so the generated name stays relative, rather than
			// indexing before the start of the string.
			if( result.Length>0 && result[result.Length-1]!='\\' )
				result += "\\";

			// Strip the query string, if there is one.
			string path = uri.PathAndQuery;
			int queryIndex = path.IndexOf('?');
			if( queryIndex!=-1 )
				path = path.Substring(0,queryIndex);

			// A path whose last segment contains no dot and that does not end
			// in '/' is treated as a directory: append the default document.
			int lastSlash = path.LastIndexOf('/');
			int lastDot = path.LastIndexOf('.');

			if( path[path.Length-1]!='/' && lastSlash>lastDot )
				path += "/" + IndexFile;

			// Split the path into its directory part (kept in path, including
			// the trailing '/') and the actual file name.
			lastSlash = path.LastIndexOf('/');

			string filename = "";
			if( lastSlash!=-1 )
			{
				filename = path.Substring(lastSlash + 1);
				path = path.Substring(0,lastSlash + 1);
				if( filename.Length==0 )
					filename = IndexFile;
			}

			// Create the directory structure if needed, one path segment at a
			// time. The scan starts at index 1 to skip the leading '/'.
			int segmentStart = 1;
			int segmentEnd = path.IndexOf('/',segmentStart);
			while( segmentEnd!=-1 )
			{
				result += path.Substring(segmentStart,segmentEnd-segmentStart);
				result += "\\";

				Directory.CreateDirectory(result);

				segmentStart = segmentEnd + 1;
				segmentEnd = path.IndexOf('/',segmentStart);
			}

			// Attach the file name.
			result += filename;

			return result;
		}

		/// <summary>
		/// Saves the body of a response to disk as a binary file. The
		/// target name is derived from the response URI. Nothing is
		/// written when the spider has no output path.
		/// </summary>
		/// <param name="response">The response whose body is saved.</param>
		private void SaveBinaryFile(WebResponse response)
		{
			if( m_spider.OutputPath==null )
				return;

			string filename = convertFilename( response.ResponseUri );
			byte[] buffer = new byte[1024];

			// The using statements close both streams even when a read or
			// write fails part-way through the copy.
			using( Stream inStream = response.GetResponseStream() )
			using( Stream outStream = File.Create( filename ) )
			{
				int count;
				while( (count = inStream.Read(buffer,0,buffer.Length))>0 )
					outStream.Write(buffer,0,count);
			}
		}

		/// <summary>
		/// Saves the text of the current page (m_uri) to disk. Nothing is
		/// written when the spider has no output path.
		/// </summary>
		/// <param name="buffer">The text to save.</param>
		private void SaveTextFile(string buffer)
		{
			if( m_spider.OutputPath==null )
				return;

			string filename = convertFilename( m_uri );

			// The using statement closes the writer even if the write throws.
			using( StreamWriter outStream = new StreamWriter( filename ) )
			{
				outStream.Write(buffer);
			}
		}

		/// <summary>
		/// Downloads the page at m_uri. Text content is saved through
		/// SaveTextFile and returned; any other content type is saved
		/// through SaveBinaryFile and null is returned.
		/// </summary>
		/// <returns>
		/// The text of the page with every line terminated by "\r\n", or
		/// null when the content is not text or the download failed.
		/// </returns>
		private string GetPage()
		{
			WebResponse response = null;
			Stream stream = null;
			StreamReader reader = null;

			try
			{
				HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);

				response = request.GetResponse();
				stream = response.GetResponseStream();

				// Anything that is not text/* is stored as-is and not parsed.
				if( !response.ContentType.ToLower().StartsWith("text/") )
				{
					SaveBinaryFile(response);
					return null;
				}

				// Accumulate in a StringBuilder; repeated string concatenation
				// would copy the whole page again for every line read.
				System.Text.StringBuilder buffer = new System.Text.StringBuilder();
				string line;

				reader = new StreamReader(stream);

				while( (line = reader.ReadLine())!=null )
				{
					buffer.Append(line);
					buffer.Append("\r\n");
				}

				string page = buffer.ToString();
				SaveTextFile(page);
				return page;
			}
			catch(WebException e)
			{
				System.Console.WriteLine("下载失败，错误：" + e);
				return null;
			}
			catch(IOException e)
			{
				System.Console.WriteLine("下载失败，错误：" + e);
				return null;
			}
			finally
			{
				// Release everything that was opened, on success and on failure.
				if( reader!=null ) reader.Close();
				if( stream!=null ) stream.Close();
				if( response!=null ) response.Close();
			}
		}

		/// <summary>
		/// Processes one link found on the current page. The link is
		/// handed to the spider for later spidering if it is an http or
		/// https document and is on the same host as the current page
		/// (m_uri). The spider class decides whether it was visited before.
		/// </summary>
		/// <param name="link">The URL to process; may be relative to m_uri.</param>
		private void ProcessLink(string link)
		{
			// NOTE(review): defensive guard in case an attribute carries no
			// value; there is nothing to resolve for a null link.
			if( link==null )
				return;

			Uri url;

			// Fully expand this URL if it was a relative link. The
			// three-argument constructor with dontEscape is obsolete; the
			// two-argument form behaves like dontEscape == false.
			try
			{
				url = new Uri(m_uri,link);
			}
			catch(UriFormatException e)
			{
				System.Console.WriteLine( "Invalid URI:" + link + " Error:" + e.Message);
				return;
			}

			if(!url.Scheme.ToLower().Equals("http") &&
				!url.Scheme.ToLower().Equals("https") )
				return;

			// comment out this check if you would like to spider
			// the whole Internet (yeah right, but it will try)
			if( !url.Host.ToLower().Equals( m_uri.Host.ToLower() ) )
				return;

			m_spider.addURI( url );
		}

		/// <summary>
		/// Scans the text of a downloaded page and passes the value of
		/// every HREF and SRC attribute found on a tag to ProcessLink.
		/// </summary>
		/// <param name="page">The text of the page to scan.</param>
		private void ProcessPage(string page)
		{
			ParseHTML parser = new ParseHTML();
			parser.Source = page;

			while( !parser.Eof() )
			{
				// Only a Parse() result of 0 leads to a tag lookup
				// (presumably it signals that a tag was just parsed; confirm
				// against ParseHTML). Everything else is skipped.
				if( parser.Parse()!=0 )
					continue;

				Attribute href = parser.GetTag()["HREF"];
				if( href!=null )
					ProcessLink(href.Value);

				Attribute src = parser.GetTag()["SRC"];
				if( src!=null )
					ProcessLink(src.Value);
			}
		}


		/// <summary>
		/// The main loop of a spider thread. Until the spider signals
		/// quit, it obtains a URL from the spider, downloads it and
		/// processes the links on the page.
		/// </summary>
		public void Process()
		{
			while(!m_spider.Quit )
			{
				m_uri = m_spider.ObtainWork();

				m_spider.SpiderDone.WorkerBegin();
				try
				{
					System.Console.WriteLine("Download(" + this.Number + "):" + m_uri);
					string page = GetPage();
					if(page!=null)
						ProcessPage(page);
				}
				finally
				{
					// Always pair WorkerBegin with WorkerEnd, even when the
					// download or the parsing throws unexpectedly.
					m_spider.SpiderDone.WorkerEnd();
				}
			}
		}

		/// <summary>
		/// Creates the worker thread with Process as its entry point and
		/// starts it.
		/// </summary>
		public void start()
		{
			m_thread = new Thread( new ThreadStart( this.Process ) );
			m_thread.Start();
		}

		/// <summary>
		/// Gets or sets the number of this worker thread. The number is
		/// used only to identify the thread.
		/// </summary>
		public int Number
		{
			get { return this.m_number; }
			set { this.m_number = value; }
		}
	}
}