基本信息
源码名称:scrapy抓取安居客数据
源码大小:0.04M
文件格式:.zip
开发语言:Python
更新时间:2017-10-17
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为:2 元
微信扫码支付:2 元
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
采用python scrapy抓取安居客数据
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import log
import re
from urllib import request
from fang.items import AnJuKeItem
import json
import time
import os
class AnJuKeMobileSpider(CrawlSpider):
    """Crawl Shanghai new-estate ("loupan") data from Anjuke's mobile site.

    Starts at the AJAX listing endpoint, follows its pagination, and
    dispatches each estate's detail page, parameter page and photo-album
    page to the callbacks below.

    NOTE(review): the source arrived with all indentation stripped; the
    block structure (what sits inside the sold-out check) was reconstructed
    from the apparent intent — confirm against the original repository.
    """

    name = "anjukemobile"
    allowed_domains = ['m.anjuke.com']
    start_urls = ['https://m.anjuke.com/sh/loupan/newajax/all/?q=&lat=0&lng=0&page=1']
    # NOTE(review): the original patterns contained "\d " (digit + literal
    # space) and an unescaped "?", so no real URL could ever match; rewritten
    # as raw strings with "\d+" and "\?".
    rules = (
        # Follow listing pagination.
        Rule(LinkExtractor(allow=r'/loupan/newajax/all/\?q=&lat=0&lng=0&page=\d+'),
             follow=True),
        # Rule(LinkExtractor(allow=r'/sh/loupan/p\d+'), follow=True),
        # Estate detail page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/$'), callback="parse_main",
             follow=True),
        # Estate parameter ("params") page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/params/'), callback="parse_params"),
        # Estate photo-album ("xiangce") page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/xiangce/\d+/$'),
             callback="parse_image"),
    )

    def parse_main(self, response):
        """Extract name/address/status from an estate detail page.

        Yields an AnJuKeItem for estates that are not sold out.  The
        original built the item but never yielded it, so nothing ever
        reached the item pipelines; it also used the removed scrapy.log
        module — self.logger is the supported replacement.
        """
        self.logger.info('down load url %s', response.url)
        try:
            item = AnJuKeItem()
            status = response.xpath(
                '//*[@id="container"]/div[@class="lpbase"]/div[@class="lptitle"]/span/'
                'em/text()').extract()
            # Only keep estates that are not sold out ("已售罄").
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                projects = response.xpath(
                    '//*[@id="container"]/div[@class="lpbase"]/div[@class="lptitle"]/span/'
                    'h1/text()').extract()
                item['project_name'] = projects[0]
                address = response.xpath(
                    '//*[@id="container"]/div[@class="lpbase"]/div[@class="lpinfo"]/'
                    'a/p[@class="g-overflow-third"]//text()').extract()
                # Strip non-breaking spaces once and reuse the cleaned value
                # (the original recomputed it for 'district').
                clean_address = address[0].strip().replace('\xa0', '')
                item['address'] = clean_address
                # This spider only covers the Shanghai section of the site,
                # so province/city are fixed.
                item['province'] = '上海市'
                item['city'] = '上海市'
                item['district'] = clean_address.split('-')[0]
                yield item
        except Exception as error:
            # One malformed page must not abort the whole crawl.
            self.logger.error('failed to parse %s: %s', response.url, error)

    def parse_params(self, response):
        """Extract delivery time / unit price / renovation from a params page.

        Yields an AnJuKeItem; like parse_main, the original never yielded it.
        """
        self.logger.info('down load param %s', response.url)
        try:
            item = AnJuKeItem()
            status = response.xpath(
                '//*[@id="main-page"]/div[@class="pcontainer"]/ul/li[2]/span'
                '/text()').extract()
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                project_name = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/h3[1]/text()').extract()
                item['project_name'] = project_name[0]
                delivery_time = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul/li[4]/span'
                    '/text()').extract()
                item['delivery_time'] = delivery_time[0]
                item['unit_price'] = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul[2]/li[1]/span'
                    '/text()').extract()[0]
                renovation = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul[3]/li[3]/label'
                    '/text()').extract()[0]
                # The third <li> holds renovation info only when its label
                # reads "装修标准"; otherwise record an explicit None.
                if "装修标准" in renovation:
                    item['renovation'] = response.xpath(
                        '//*[@id="main-page"]/div[@class="pcontainer"]/ul[3]/li[3]/span'
                        '/text()').extract()[0]
                else:
                    item['renovation'] = None
                yield item
        except Exception as error:
            self.logger.error('failed to parse params %s: %s', response.url, error)

    def parse_image(self, response):
        """Photo-album callback referenced by the rules but missing from the
        original source (Scrapy would raise when dispatching to it).  Kept as
        a logging stub until real album extraction is written."""
        self.logger.info('down load image page %s', response.url)
采用python scrapy抓取安居客数据
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import log
import re
from urllib import request
from fang.items import AnJuKeItem
import json
import time
import os
class AnJuKeMobileSpider(CrawlSpider):
    """Crawl Shanghai new-estate ("loupan") data from Anjuke's mobile site.

    Starts at the AJAX listing endpoint, follows its pagination, and
    dispatches each estate's detail page, parameter page and photo-album
    page to the callbacks below.

    NOTE(review): this is a verbatim duplicate of the class defined earlier
    in the file (the paste was doubled); deduplicating the file is
    recommended.  The source also arrived with all indentation stripped;
    the block structure was reconstructed from the apparent intent.
    """

    name = "anjukemobile"
    allowed_domains = ['m.anjuke.com']
    start_urls = ['https://m.anjuke.com/sh/loupan/newajax/all/?q=&lat=0&lng=0&page=1']
    # NOTE(review): the original patterns contained "\d " (digit + literal
    # space) and an unescaped "?", so no real URL could ever match; rewritten
    # as raw strings with "\d+" and "\?".
    rules = (
        # Follow listing pagination.
        Rule(LinkExtractor(allow=r'/loupan/newajax/all/\?q=&lat=0&lng=0&page=\d+'),
             follow=True),
        # Rule(LinkExtractor(allow=r'/sh/loupan/p\d+'), follow=True),
        # Estate detail page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/$'), callback="parse_main",
             follow=True),
        # Estate parameter ("params") page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/params/'), callback="parse_params"),
        # Estate photo-album ("xiangce") page.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/xiangce/\d+/$'),
             callback="parse_image"),
    )

    def parse_main(self, response):
        """Extract name/address/status from an estate detail page.

        Yields an AnJuKeItem for estates that are not sold out.  The
        original built the item but never yielded it, so nothing ever
        reached the item pipelines; it also used the removed scrapy.log
        module — self.logger is the supported replacement.
        """
        self.logger.info('down load url %s', response.url)
        try:
            item = AnJuKeItem()
            status = response.xpath(
                '//*[@id="container"]/div[@class="lpbase"]/div[@class="lptitle"]/span/'
                'em/text()').extract()
            # Only keep estates that are not sold out ("已售罄").
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                projects = response.xpath(
                    '//*[@id="container"]/div[@class="lpbase"]/div[@class="lptitle"]/span/'
                    'h1/text()').extract()
                item['project_name'] = projects[0]
                address = response.xpath(
                    '//*[@id="container"]/div[@class="lpbase"]/div[@class="lpinfo"]/'
                    'a/p[@class="g-overflow-third"]//text()').extract()
                # Strip non-breaking spaces once and reuse the cleaned value
                # (the original recomputed it for 'district').
                clean_address = address[0].strip().replace('\xa0', '')
                item['address'] = clean_address
                # This spider only covers the Shanghai section of the site,
                # so province/city are fixed.
                item['province'] = '上海市'
                item['city'] = '上海市'
                item['district'] = clean_address.split('-')[0]
                yield item
        except Exception as error:
            # One malformed page must not abort the whole crawl.
            self.logger.error('failed to parse %s: %s', response.url, error)

    def parse_params(self, response):
        """Extract delivery time / unit price / renovation from a params page.

        Yields an AnJuKeItem; like parse_main, the original never yielded it.
        """
        self.logger.info('down load param %s', response.url)
        try:
            item = AnJuKeItem()
            status = response.xpath(
                '//*[@id="main-page"]/div[@class="pcontainer"]/ul/li[2]/span'
                '/text()').extract()
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                project_name = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/h3[1]/text()').extract()
                item['project_name'] = project_name[0]
                delivery_time = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul/li[4]/span'
                    '/text()').extract()
                item['delivery_time'] = delivery_time[0]
                item['unit_price'] = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul[2]/li[1]/span'
                    '/text()').extract()[0]
                renovation = response.xpath(
                    '//*[@id="main-page"]/div[@class="pcontainer"]/ul[3]/li[3]/label'
                    '/text()').extract()[0]
                # The third <li> holds renovation info only when its label
                # reads "装修标准"; otherwise record an explicit None.
                if "装修标准" in renovation:
                    item['renovation'] = response.xpath(
                        '//*[@id="main-page"]/div[@class="pcontainer"]/ul[3]/li[3]/span'
                        '/text()').extract()[0]
                else:
                    item['renovation'] = None
                yield item
        except Exception as error:
            self.logger.error('failed to parse params %s: %s', response.url, error)

    def parse_image(self, response):
        """Photo-album callback referenced by the rules but missing from the
        original source (Scrapy would raise when dispatching to it).  Kept as
        a logging stub until real album extraction is written."""
        self.logger.info('down load image page %s', response.url)