基本信息
源码名称:scrapy抓取安居客数据
源码大小:0.04M
文件格式:.zip
开发语言:Python
更新时间:2017-10-17
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为:2 元
微信扫码支付:2 元
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
采用python scrapy抓取安居客数据
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import log
import re
from urllib import request
from fang.items import AnJuKeItem
import json
import time
import os
class AnJuKeMobileSpider(CrawlSpider):
    """Crawl Shanghai new-estate ("loupan") data from Anjuke's mobile site.

    Starts at the AJAX listing endpoint, follows its pagination, and
    dispatches each estate's detail page, parameter page and photo-album
    page to the callbacks below.

    NOTE(review): the source arrived with all indentation stripped; the
    block structure (what sits inside the sold-out check) was reconstructed
    from the apparent intent — confirm against the original repository.
    """

    name = "anjukemobile"
    allowed_domains = ['m.anjuke.com']
    start_urls = ['https://m.anjuke.com/sh/loupan/newajax/all/?q=&lat=0&lng=0&page=1']
    # NOTE(review): the original patterns contained "\d " (digit + literal
    # space) and an unescaped "?", so no real URL could ever match; rewritten
    # as raw strings with "\d+" and "\?".
    rules = (
        # Follow listing pagination.
        Rule(LinkExtractor(allow=r'/loupan/newajax/all/\?q=&lat=0&lng=0&page=\d+'),
             follow=True),
        # Rule(LinkExtractor(allow=r'/sh/loupan/p\d+'), follow=True),
        # Estate detail page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/$'), callback="parse_main",
             follow=True),
        # Estate parameter ("params") page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/params/'), callback="parse_params"),
        # Estate photo-album ("xiangce") page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/xiangce/\d+/$'),
             callback="parse_image"),
    )

    def parse_main(self, response):
        """Extract name/address/status from an estate detail page.

        Yields an AnJuKeItem for estates that are not sold out.  The
        original built the item but never yielded it, so nothing ever
        reached the item pipelines; it also used the removed scrapy.log
        module — self.logger is the supported replacement.
        """
        self.logger.info('down load url %s', response.url)
        try:
            item = AnJuKeItem()
            status = response.xpath(
                '//*[@id="container"]/div[@class="lpbase"]/div[@class="lptitle"]/span/'
                'em/text()').extract()
            # Only keep estates that are not sold out ("已售罄").
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                projects = response.xpath(
                    '//*[@id="container"]/div[@class="lpbase"]/div[@class="lptitle"]/span/'
                    'h1/text()').extract()
                item['project_name'] = projects[0]
                address = response.xpath(
                    '//*[@id="container"]/div[@class="lpbase"]/div[@class="lpinfo"]/'
                    'a/p[@class="g-overflow-third"]//text()').extract()
                # Strip non-breaking spaces once and reuse the cleaned value
                # (the original recomputed it for 'district').
                clean_address = address[0].strip().replace('\xa0', '')
                item['address'] = clean_address
                # This spider only covers the Shanghai section of the site,
                # so province/city are fixed.
                item['province'] = '上海市'
                item['city'] = '上海市'
                item['district'] = clean_address.split('-')[0]
                yield item
        except Exception as error:
            # One malformed page must not abort the whole crawl.
            self.logger.error('failed to parse %s: %s', response.url, error)

    def parse_params(self, response):
        """Extract delivery time / unit price / renovation from a params page.

        Yields an AnJuKeItem; like parse_main, the original never yielded it.
        """
        self.logger.info('down load param %s', response.url)
        try:
            item = AnJuKeItem()
            status = response.xpath(
                '//*[@id="main-page"]/div[@class="pcontainer"]/ul/li[2]/span'
                '/text()').extract()
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                project_name = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/h3[1]/text()').extract()
                item['project_name'] = project_name[0]
                delivery_time = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul/li[4]/span'
                    '/text()').extract()
                item['delivery_time'] = delivery_time[0]
                item['unit_price'] = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul[2]/li[1]/span'
                    '/text()').extract()[0]
                renovation = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul[3]/li[3]/label'
                    '/text()').extract()[0]
                # The third <li> holds renovation info only when its label
                # reads "装修标准"; otherwise record an explicit None.
                if "装修标准" in renovation:
                    item['renovation'] = response.xpath(
                        '//*[@id="main-page"]/div[@class="pcontainer"]/ul[3]/li[3]/span'
                        '/text()').extract()[0]
                else:
                    item['renovation'] = None
                yield item
        except Exception as error:
            self.logger.error('failed to parse params %s: %s', response.url, error)

    def parse_image(self, response):
        """Photo-album callback referenced by the rules but missing from the
        original source (Scrapy would raise when dispatching to it).  Kept as
        a logging stub until real album extraction is written."""
        self.logger.info('down load image page %s', response.url)
采用python scrapy抓取安居客数据
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import log
import re
from urllib import request
from fang.items import AnJuKeItem
import json
import time
import os
class AnJuKeMobileSpider(CrawlSpider):
    """Crawl Shanghai new-estate ("loupan") data from Anjuke's mobile site.

    Starts at the AJAX listing endpoint, follows its pagination, and
    dispatches each estate's detail page, parameter page and photo-album
    page to the callbacks below.

    NOTE(review): this is a verbatim duplicate of the class defined earlier
    in the file (the paste was doubled); deduplicating the file is
    recommended.  The source also arrived with all indentation stripped;
    the block structure was reconstructed from the apparent intent.
    """

    name = "anjukemobile"
    allowed_domains = ['m.anjuke.com']
    start_urls = ['https://m.anjuke.com/sh/loupan/newajax/all/?q=&lat=0&lng=0&page=1']
    # NOTE(review): the original patterns contained "\d " (digit + literal
    # space) and an unescaped "?", so no real URL could ever match; rewritten
    # as raw strings with "\d+" and "\?".
    rules = (
        # Follow listing pagination.
        Rule(LinkExtractor(allow=r'/loupan/newajax/all/\?q=&lat=0&lng=0&page=\d+'),
             follow=True),
        # Rule(LinkExtractor(allow=r'/sh/loupan/p\d+'), follow=True),
        # Estate detail page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/$'), callback="parse_main",
             follow=True),
        # Estate parameter ("params") page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/params/'), callback="parse_params"),
        # Estate photo-album ("xiangce") page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/xiangce/\d+/$'),
             callback="parse_image"),
    )

    def parse_main(self, response):
        """Extract name/address/status from an estate detail page.

        Yields an AnJuKeItem for estates that are not sold out.  The
        original built the item but never yielded it, so nothing ever
        reached the item pipelines; it also used the removed scrapy.log
        module — self.logger is the supported replacement.
        """
        self.logger.info('down load url %s', response.url)
        try:
            item = AnJuKeItem()
            status = response.xpath(
                '//*[@id="container"]/div[@class="lpbase"]/div[@class="lptitle"]/span/'
                'em/text()').extract()
            # Only keep estates that are not sold out ("已售罄").
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                projects = response.xpath(
                    '//*[@id="container"]/div[@class="lpbase"]/div[@class="lptitle"]/span/'
                    'h1/text()').extract()
                item['project_name'] = projects[0]
                address = response.xpath(
                    '//*[@id="container"]/div[@class="lpbase"]/div[@class="lpinfo"]/'
                    'a/p[@class="g-overflow-third"]//text()').extract()
                # Strip non-breaking spaces once and reuse the cleaned value
                # (the original recomputed it for 'district').
                clean_address = address[0].strip().replace('\xa0', '')
                item['address'] = clean_address
                # This spider only covers the Shanghai section of the site,
                # so province/city are fixed.
                item['province'] = '上海市'
                item['city'] = '上海市'
                item['district'] = clean_address.split('-')[0]
                yield item
        except Exception as error:
            # One malformed page must not abort the whole crawl.
            self.logger.error('failed to parse %s: %s', response.url, error)

    def parse_params(self, response):
        """Extract delivery time / unit price / renovation from a params page.

        Yields an AnJuKeItem; like parse_main, the original never yielded it.
        """
        self.logger.info('down load param %s', response.url)
        try:
            item = AnJuKeItem()
            status = response.xpath(
                '//*[@id="main-page"]/div[@class="pcontainer"]/ul/li[2]/span'
                '/text()').extract()
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                project_name = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/h3[1]/text()').extract()
                item['project_name'] = project_name[0]
                delivery_time = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul/li[4]/span'
                    '/text()').extract()
                item['delivery_time'] = delivery_time[0]
                item['unit_price'] = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul[2]/li[1]/span'
                    '/text()').extract()[0]
                renovation = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul[3]/li[3]/label'
                    '/text()').extract()[0]
                # The third <li> holds renovation info only when its label
                # reads "装修标准"; otherwise record an explicit None.
                if "装修标准" in renovation:
                    item['renovation'] = response.xpath(
                        '//*[@id="main-page"]/div[@class="pcontainer"]/ul[3]/li[3]/span'
                        '/text()').extract()[0]
                else:
                    item['renovation'] = None
                yield item
        except Exception as error:
            self.logger.error('failed to parse params %s: %s', response.url, error)

    def parse_image(self, response):
        """Photo-album callback referenced by the rules but missing from the
        original source (Scrapy would raise when dispatching to it).  Kept as
        a logging stub until real album extraction is written."""
        self.logger.info('down load image page %s', response.url)