最近在学习爬虫。爬取了第一页后,信心满满。但是翻不了页,就卡住了。搜了好多资料,b站都翻烂了,还是没找到合适的代码。最后在知乎找到了【2023微博评论爬虫】用python爬上千条微博评论,突破15页限制! - 知乎 (zhihu.com)
这篇文章干货满满 ,虽然我只爬出来了15页,但也是进步了,收获满满。
遇到的问题:
1、代码打完之后运行没有结果。(主函数缩进错误,有一段代码不小心写到for循环里了)
2、返回不出来json页面(重新复制了一下url)
3、时间返回不出来(大小写有错)
总结:
1、hotflow 是移动端的评论数据,buildComments 是pc端的评论数据
2、大小写不要随便就按照pycharm给出的提示改代码
3、运行不出结果不报错时,一步一步print,很重要!
import pprint
import re
from time import sleep
import requests
import json
import pandas as pd
import os
import datetime
from time import sleep
import random
from fake_useragent import UserAgent
# Convert a Weibo API timestamp into a standard datetime string.
def trans_time(v_str):
    """Convert a Weibo 'created_at' value to 'YYYY-MM-DD HH:MM:SS'.

    :param v_str: timestamp string such as 'Fri Aug 11 10:30:00 +0800 2023'
    :return: the same instant's wall-clock time, formatted 'YYYY-MM-DD HH:MM:SS'
    :raises ValueError: if v_str does not match the expected format

    Uses the %z directive instead of a hard-coded '+0800', so timestamps
    carrying any UTC offset parse correctly; output is unchanged for the
    +0800 values Weibo normally returns.
    """
    GMT_FORMAT = '%a %b %d %H:%M:%S %z %Y'
    timeArray = datetime.datetime.strptime(v_str, GMT_FORMAT)
    ret_time = timeArray.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time
# Map Weibo's one-letter gender tag to a Chinese label.
def tran_gender(gender_tag):
    """Return '男' for 'm', '女' for 'f', and None for any other tag."""
    return {'m': '男', 'f': '女'}.get(gender_tag)
def get_comments(v_weibo_ids, v_comment_file, v_max_page):
    """Scrape comments for each Weibo post and append them to a CSV file.

    :param v_weibo_ids: list of Weibo post id strings
    :param v_comment_file: path of the CSV file results are appended to
    :param v_max_page: maximum number of comment pages to fetch per post
    :return: None

    NOTE(review): the 'cookie' header below is account-specific and expires;
    it must be refreshed from a logged-in browser session for the requests
    to succeed.
    """
    for weibo_id in v_weibo_ids:
        # max_id is the pagination cursor returned by the API;
        # '0' means "first page" (and, once returned, "no more pages").
        max_id = '0'
        for page in range(1, v_max_page + 1):
            # Random short pause so requests don't hammer the server.
            wait_second = random.uniform(0, 1)
            print('开始等待{}秒'.format(wait_second))
            sleep(wait_second)
            print("开始爬取{}页".format(page))
            if page == 1:
                # The first page carries no max_id parameter.
                url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0'.format(weibo_id, weibo_id)
            else:
                if max_id == '0':
                    # The previous response returned max_id == 0:
                    # there are no further pages for this post.
                    print('max_id is 0,break now')
                    break
                url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0&max_id={}'.format(weibo_id, weibo_id, max_id)
            # Fresh random user-agent on every request.
            ua = UserAgent()
            headers = {
                "user-agent": ua.random,
                'cookie':'Cookie:_T_WM=53471771365; XSRF-TOKEN=918f2d; WEIBOCN_FROM=1110006030; MLOGIN=1; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWvAHGueA85rUwXGGxqn9hF5JpX5K-hUgL.FoMNSo5Ee0Mf1hn2dJLoI7XLxK.L1-eLB-B0e05c; SSOLoginState=1692011683; ALF=1694603683; SCF=Agcu37bX4pp62Fa20LbsJwL-0-g-AN1C0JSpLa1K9-zGYISA3f2KMzHPsnRcp2-RVm8heQi3oc_6Wauh0_9A0wo.; SUB=_2A25J3nzzDeRhGeFJ7VIT8ynJwzSIHXVrIQS7rDV6PUJbktANLRL_kW1Nf5DMr5x6D1W02kjPd5tQ5dcOAK_tZUZ6; mweibo_short_token=70f94b26bd; M_WEIBOCN_PARAMS=lfid%3D1076035997696411%26luicode%3D20000174%26uicode%3D20000061%26fid%3D4934600524044373%26oid%3D4934600524044373',
                'Mweibo-Pwa':'1',
                "Accept": "application/json, text/plain, */*",
                "Accept-Encoding": "gzip, deflate, br",
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                'referer': 'https://weibo.com/1496814565/Neh6Vk5o7',
            }
            r = requests.get(url=url, headers=headers)
            try:
                # Parse the JSON body once and reuse it (the original
                # called r.json() three times per page).
                body = r.json()['data']
                max_id = body['max_id']
                datas = body['data']
            except Exception as e:
                # Missing keys / non-JSON body: skip this page, keep going.
                print('excepted: ' + str(e))
                continue
            page_list = []
            id_list = []
            text_list = []
            time_list = []
            like_count_list = []
            source_list = []
            user_name_list = []
            user_id_list = []
            user_gender_list = []
            follow_count_list = []
            follower_count_list = []
            for data in datas:
                page_list.append(page)
                # BUGFIX: the original appended `page` here, so the '评论id'
                # column merely duplicated '评论页码'; store the comment's id.
                id_list.append(data['id'])
                # Keep only the Chinese characters of the comment text
                # (drops emoji markup, @mentions, URLs, etc.).
                content = ''.join(re.findall('[\u4e00-\u9fa5]+', data['text']))
                text_list.append(content)
                time_list.append(trans_time(v_str=data['created_at']))
                like_count_list.append(data['like_count'])
                source_list.append(data['source'])
                user_name_list.append(data['user']['screen_name'])
                user_id_list.append(data['user']['id'])
                user_gender_list.append(tran_gender(data['user']['gender']))
                follow_count_list.append(data['user']['follow_count'])
                follower_count_list.append(data['user']['followers_count'])
            df = pd.DataFrame(
                {
                    '微博id': [weibo_id] * len(time_list),
                    '评论页码': page_list,
                    '评论id': id_list,
                    '评论时间': time_list,
                    '评论点赞数': like_count_list,
                    '评论者IP归属地': source_list,
                    '评论者姓名': user_name_list,
                    '评论者id': user_id_list,
                    '评论者性别': user_gender_list,
                    '评论者关注数': follow_count_list,
                    '评论者粉丝数': follower_count_list,
                    '评论内容': text_list,
                }
            )
            print(df)
            # Write the CSV header only on the first append, when the
            # output file does not exist yet.
            if os.path.exists(v_comment_file):
                header = False
            else:
                header = True
            print(v_comment_file)
            df.to_csv(v_comment_file, mode='a+', index=False, header=header, encoding='utf-8')
            print('结果保存成功:{}'.format(v_comment_file))
if __name__ == '__main__':
    # Weibo post ids to scrape; append more ids to the list as needed.
    target_ids = ['4934570812901749']
    pages_to_fetch = 20
    output_csv = '四川医疗腐败.csv'
    # Start from a clean file so the CSV header is written exactly once.
    if os.path.exists(output_csv):
        print('csv文件已存在,先删除', output_csv)
        os.remove(output_csv)
    print(target_ids)
    get_comments(v_weibo_ids=target_ids, v_comment_file=output_csv, v_max_page=pages_to_fetch)