本文共 2857 字,大约阅读时间需要 9 分钟。
其中的Cookie需要根据自己电脑端的数据进行处理。
Cookie值和文件保存路径记得根据自己的环境修改。
代码如下:
"""Crawl the hot-comment pages of one Weibo post (m.weibo.cn hotflow API)
and append author / time / likes / floor / text rows to an .xlsx workbook.

Fill in your own Cookie in ``headers`` and adjust ``XLSX_PATH`` before running.
"""
import requests
import time
import os
import csv
import codecs
import sys
import json
import importlib
from openpyxl import load_workbook
import openpyxl
from bs4 import BeautifulSoup

# BUGFIX: the original called ``importlib.reload(sys)`` — a leftover of the
# Python 2 ``sys.setdefaultencoding`` hack. On Python 3 it is useless and can
# reset interpreter state, so it has been removed.

# Starting URL of the hot-comment API; pagination cursors go in ``params``.
url = 'https://m.weibo.cn/comments/hotflow?id=4393956248857472&mid=4393956248857472&max_id='
headers = {
    'Cookie': '自己电脑端的cookie',
    'Referer': 'https://m.weibo.cn/detail/4393956248857472',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

# Output workbook path (was duplicated in two functions; hoisted here).
XLSX_PATH = 'E:/EmotionTest/weibo.xlsx'


def get_page(max_id, id_type):
    """Fetch one page of hot comments.

    Parameters are the ``max_id`` / ``max_id_type`` pagination cursors
    returned by the previous page (0 / 0 for the first page).

    Returns the decoded JSON dict on success, or ``None`` on a network
    error or a non-200 response — callers must handle ``None``.
    """
    params = {
        'max_id': max_id,
        'max_id_type': id_type,
    }
    try:
        # BUGFIX: added a timeout so a stalled connection cannot hang forever.
        r = requests.get(url, params=params, headers=headers, timeout=10)
        if r.status_code == 200:
            return r.json()
    except requests.ConnectionError as e:
        print('error', e.args)
    return None  # explicit: failure paths used to fall through implicitly


def parse_page(jsondata):
    """Extract the next-page cursors from one page of API JSON.

    Returns ``{'max_id': ..., 'max_id_type': ...}`` or ``None`` when the
    page is missing/malformed, so the caller can stop paginating.
    """
    if not jsondata:
        return None
    items = jsondata.get('data')
    if not items:
        # BUGFIX: ``items['max_id']`` used to raise TypeError when the API
        # returned no 'data' object (e.g. rate-limited or expired cookie).
        return None
    return {
        'max_id': items.get('max_id'),
        'max_id_type': items.get('max_id_type'),
    }


def write_excel_title():
    """Create the workbook with its header row (overwrites any existing file)."""
    # Sheet tab name shown at the bottom-left of Excel.
    sheet_name_xlsx = '陈伟霆-垃圾分类'
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = sheet_name_xlsx
    # Header columns: author, time, like count, floor number, comment text.
    for col, title in enumerate(('作者', '时间', '点赞数', '楼数', '评论'), start=1):
        sheet.cell(row=1, column=col, value=str(title))
    workbook.save(XLSX_PATH)
    print("xlsx格式表格写入数据成功!")


def write_excel_xlsx(jsondata, count):
    """Append one page of comments to the workbook.

    ``count`` is the number of data rows already written (header row
    excluded); new rows are placed directly after them.
    """
    if not jsondata:
        # BUGFIX: a failed fetch (get_page -> None) used to crash here with
        # ``AttributeError: 'NoneType' object has no attribute 'get'``.
        return
    datas = (jsondata.get('data') or {}).get('data') or []
    # Re-open the existing workbook so earlier pages are preserved.
    workbook = load_workbook(XLSX_PATH)
    sheet = workbook.active
    row = count + 2  # +1 to skip the header row, +1 because rows are 1-based
    for data in datas:
        # BUGFIX: guard 'user' being absent before reading 'screen_name'.
        username = (data.get("user") or {}).get("screen_name")
        sheet.cell(column=1, row=row, value=str(username))
        sheet.cell(column=2, row=row, value=str(data.get("created_at")))
        sheet.cell(column=3, row=row, value=str(data.get("like_count")))
        sheet.cell(column=4, row=row, value=str(data.get("floor_number")))
        # Comment text arrives as an HTML fragment; strip the markup.
        comment = BeautifulSoup(data.get("text"), 'lxml').get_text()
        sheet.cell(column=5, row=row, value=str(comment))
        row += 1
    workbook.save(XLSX_PATH)
    print("xlsx格式表格写入数据成功!")


# ---- main script: crawl up to ``maxpage`` pages ----
maxpage = 5
m_id = 0
id_type = 0
write_excel_title()
for page in range(maxpage):
    print(page)
    jsondata = get_page(m_id, id_type)
    # NOTE(review): the row offset assumes exactly 20 comments per page —
    # short pages would leave blank rows; verify against the API if exact
    # packing matters.
    write_excel_xlsx(jsondata, page * 20)
    results = parse_page(jsondata)
    if results is None or results['max_id'] is None:
        # BUGFIX: the original crashed with ``TypeError: 'NoneType' object is
        # not subscriptable`` when a fetch failed or pagination ended.
        break
    time.sleep(1)  # throttle requests to avoid being rate-limited
    m_id = results['max_id']
    id_type = results['max_id_type']
转载地址:http://jbdlf.baihongyu.com/