Basic Information
Source name: Weibo content scraper
Size: 3.59 KB
File format: .py
Language: Python
Updated: 2021-02-08
Source Code Introduction
Crawls Weibo posts for a given region via keyword search: it fetches the search results for a keyword, then checks each author's profile location (here, whether they are in 西安/Xi'an) and tags each post accordingly.
import requests
from bs4 import BeautifulSoup
import openpyxl
import re
import time


# Fetch page content
def getHTMLText(url):
    try:
        # Paste a valid logged-in Weibo cookie here (left blank in the source)
        kv = {"User-Agent": "Mozilla/5.0", "cookie": """"""}
        r = requests.get(url, headers=kv)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return ""


# Parse the fetched page: ilt collects the results, html is the page source
def parsePage(ilt, html):
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    tlt = []  # collect screen names
    for i in a:
        try:
            name = i.attrs['nick-name']
            tlt.append(name)
        except:
            continue
    p = soup.find_all(name='p', attrs={"class": "txt"})
    plt = []  # collect post texts
    for j in p:
        try:
            txt = j.text
            if '展开全文' not in txt:  # skip truncated posts ("expand full text")
                txt = txt.replace(' ', '')
                plt.append(txt)
            else:
                continue
        except:
            continue
    # Find all <a> tags with a nick-name attribute, then pull each user's Weibo uid from them
    nick_name = re.compile(r'.*?')
    n_a = soup.find_all('a', attrs={'nick-name': nick_name})
    ult = []
    for m in n_a:
        try:
            href = m.attrs['href']  # href holds the user's profile URL
            ult.append(re.findall(r'\d{10}', href)[0])  # ult holds the extracted uids
        except:
            continue
    alt = []  # list of locations
    for weibo in ult:
        url = 'https://weibo.com/p/100505' + weibo + '/info?mod=pedit_more'
        html2 = getHTMLText(url)
        try:
            if html2 == "":
                alt.append('非西安')
            else:
                soup = BeautifulSoup(html2, 'html.parser')
                place = soup.find(string=re.compile('所在地'))  # profile "location" field
                if place is None:
                    address = '非西安'
                else:
                    if '西安' in place:
                        address = '西安'
                    else:
                        address = '非西安'
                alt.append(address)
        except:
            alt.append('非西安')
            continue
        time.sleep(0.2)
    # Pair corresponding entries of the three lists
    for name, txt, address in zip(tlt, plt, alt):
        print(name, address)
        ilt.append([name, txt, address])


def main():
    keywords = '回坊'
    depth = 36  # number of search pages to crawl
    start_time = '2016-01-01:2016-05-02'
    start_url = ('https://s.weibo.com/weibo?q=' + keywords
                 + '&scope=ori&suball=1&timescope=custom:' + start_time + '&Refer=g')
    infoList = []
    for i in range(depth):
        i += 1
        try:
            url = start_url + '&page=' + str(i)
            html = getHTMLText(url)  # fetch this page of search results
            parsePage(infoList, html)  # parse this page
            print("第" + str(i) + "页")  # progress: page i done
            time.sleep(5)
        except:
            print("内容有问题")  # this page could not be processed
            continue
    # Print the collected results
    print(infoList)
    wb = openpyxl.Workbook()
    ws = wb.create_sheet("sheet1")
    for d in range(len(infoList)):
        e = infoList[d]
        dol = d + 1
        ws.cell(row=dol, column=1).value = e[0]
        ws.cell(row=dol, column=2).value = e[1]
        ws.cell(row=dol, column=3).value = e[2]
    wb.save('回坊.xlsx')
    print('保存成功')  # saved successfully
    wb.close()


if __name__ == '__main__':
    main()
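Note that getHTMLText sends an empty cookie header; without a valid logged-in Weibo cookie pasted there, requests will likely return empty pages. The keyword, date window, page depth, and output file are also hard-coded in main. Below is a minimal sketch, assuming the functions above are already defined, of how one might wrap them to run the same pipeline with different parameters; the crawl helper is an illustration added here, not part of the original script.

# Hypothetical wrapper (not in the original): parameterize keyword, date window,
# page depth, and output path, reusing getHTMLText/parsePage defined above.
def crawl(keyword, timescope, pages, out_xlsx):
    base = ('https://s.weibo.com/weibo?q=' + keyword
            + '&scope=ori&suball=1&timescope=custom:' + timescope + '&Refer=g')
    rows = []
    for page in range(1, pages + 1):
        parsePage(rows, getHTMLText(base + '&page=' + str(page)))
        time.sleep(5)  # throttle between search pages
    wb = openpyxl.Workbook()
    ws = wb.active
    for r, (name, text, address) in enumerate(rows, start=1):
        ws.cell(row=r, column=1).value = name
        ws.cell(row=r, column=2).value = text
        ws.cell(row=r, column=3).value = address
    wb.save(out_xlsx)

# Example call, reproducing the hard-coded defaults:
# crawl('回坊', '2016-01-01:2016-05-02', 36, '回坊.xlsx')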