基本信息
源码名称:爬取头条美女图片(仅供参考思路,已不能用)
源码大小:2.07KB
文件格式:.zip
开发语言:Python
更新时间:2019-03-18
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为: 4 元×
微信扫码支付:4 元
×
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
爬取头条美女图片,喜欢的可以拿走欣赏
爬取头条美女图片,喜欢的可以拿走欣赏
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
def get_page_index(offset, keyword):
data = {
'autoload': 'true',
'count': 20,
'cur_tab': 3,
'format': 'json',
'keyword': keyword,
'offset': offset,
}
params = urlencode(data)
base = 'http://www.toutiao.com/search_content/'
url = base '?' params
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
return None
except ConnectionError:
print('Error occurred')
return None
def download_image(url):
print('Downloading', url)
try:
response = requests.get(url)
if response.status_code == 200:
save_image(response.content)
return None
except ConnectionError:
return None
def save_image(content):
file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
print(file_path)
if not os.path.exists(file_path):
with open(file_path, 'wb') as f:
f.write(content)
f.close()
def parse_page_index(text):
try:
data = json.loads(text)
if data and 'data' in data.keys():
for item in data.get('data'):
yield item.get('article_url')
except JSONDecodeError:
pass
def get_page_detail(url):
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
return None
except ConnectionError:
print('Error occurred')
return None
def parse_page_detail(html, url):
soup = BeautifulSoup(html, 'lxml')
result = soup.select('title')
title = result[0].get_text() if result else ''
images_pattern = re.compile('gallery: JSON.parse\("(.*)"\)', re.S)
result = re.search(images_pattern, html)
if result:
data = json.loads(result.group(1).replace('\\', ''))
if data and 'sub_images' in data.keys():
sub_images = data.get('sub_images')
images = [item.get('url') for item in sub_images]
for image in images: download_image(image)
return {
'title': title,
'url': url,
'images': images
}
def save_to_mongo(result):
if db[MONGO_TABLE].insert(result):
print('Successfully Saved to Mongo', result)
return True
return False
def main(offset):
text = get_page_index(offset, KEYWORD)
urls = parse_page_index(text)
for url in urls:
html = get_page_detail(url)
result = parse_page_detail(html, url)
if result: save_to_mongo(result)
if __name__ == '__main__':
pool = Pool()
groups = ([x * 20 for x in range(GROUP_START, GROUP_END 1)])
pool.map(main, groups)
pool.close()
pool.join()