A Python Weibo crawler script: enter a keyword, adjust the time range of the posts to crawl, and it is ready to run.
You need to fetch the cookies of your own account and paste them into the marked cookie position in the script (a short sketch of what that looks like follows).
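For reference, a minimal sketch of the cookie setup, assuming you copy the Cookie request header from the developer tools' Network tab of a browser session logged in to https://s.weibo.com; the cookie names and values below are placeholders, not real ones:

# Hypothetical placeholder: replace with the full Cookie header copied from your logged-in browser session.
headers['cookie'] = 'SINAGLOBAL=...; SUB=...; SUBP=...'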
import datetime
import json
import random
import re
import time
import traceback

import pymysql
import requests
from lxml import etree
import urllib3
import openpyxl

urllib3.disable_warnings()

# Pool of User-Agent strings to rotate through
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
]

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    # 'Cache-Control': 'no-cache',
    # 'Connection': 'keep-alive',
    # 'referer': 'https://www.google.com/',
    # 'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(user_agents)
}


# Fetch the given URL, retrying until a usable response comes back
def get_html(url):
    num = 0
    while True:
        num += 1
        try:
            print("Requesting URL:", url)
            time.sleep(2)
            # Paste the cookies of your own account here
            headers['cookie'] = 'your account cookies go here'
            # Pass proxies={'http': ..., 'https': ...} to requests.get() if you need a proxy
            response = requests.get(url, headers=headers, timeout=10, verify=False)
            if response.status_code == 200:
                return response
            elif response.status_code == 404:
                return ''
            else:
                print('Unexpected status code {} for URL {}, retrying'.format(response.status_code, url))
        except Exception:
            print("Request failed, waiting before retrying")
            time.sleep(10)


# Decode raw bytes into text, guessing the encoding (helper, not used by the main flow)
def decodeContent(html):
    import cchardet as chardet
    gbk_list = ["gb2312", "GB2312", "GBK", "GB18030"]
    if isinstance(html, bytes):
        char = chardet.detect(html)
        confidence = char['confidence']
        if "encoding" in char and confidence > 0.7:
            items = [char["encoding"]]
        else:
            items = re.compile(r'charset=([^\'\"]*?)[\'\"/\s]*?>').findall(str(html))
            if not items:
                items = re.compile(r'charset=[\'\"](.*?)[\'\"]').findall(str(html))
            if not items:
                items = re.compile(r'charset=(.*?)[\'\"]').findall(str(html))
        if items:
            charset = 'gbk' if items[0] in gbk_list else items[0]
            try:
                res = html.decode(charset)
            except Exception:
                if charset == 'gbk':
                    try:
                        res = html.decode('gbk', 'ignore')
                    except Exception:
                        res = ""
                else:
                    try:
                        res = html.decode('utf-8', 'ignore')
                    except Exception:
                        res = ""
        else:
            try:
                res = html.decode('utf-8')
            except Exception:
                try:
                    res = html.decode('gbk')
                except Exception:
                    try:
                        res = html.decode('utf-8', 'ignore')
                    except Exception:
                        res = ""
        return res
    return html


# Workbook that collects the extracted post texts
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Sheet1'
ws.append(["content"])


# Extract the post texts from one search-result page and save them to the workbook
def comment_info(res, keyword):
    try:
        contents_lis = res.xpath(
            '//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="content"]')
        digg = res.xpath('//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="card-act"]')
        user_lis = res.xpath('//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="avator"]')
        print(len(contents_lis))
        for index, i in enumerate(contents_lis):
            try:
                content = ''.join(i.xpath('p[@node-type="feed_list_content"]//text()')).replace("\n", '').strip()
                print("@@@@@@@@@@@@@@", content)
                result_list = [content]
                ws.append(result_list)
                wb.save('weibo_info.xlsx')
            except Exception:
                traceback.print_exc()
    except Exception:
        pass


# Given a day offset from start_time, return that day and the previous day
def time_end_start(i, start_time):
    aaa = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    threeDayAgo = aaa + datetime.timedelta(days=i)
    threeDayAgosss = threeDayAgo - datetime.timedelta(days=1)
    return threeDayAgo, threeDayAgosss


# Main crawl loop: iterate day by day over the configured date range
def run(lkll):
    # keywords to search for
    lis = [lkll]
    # start and end dates of the crawl
    start_time = "2021-01-01"
    end_time = "2022-01-01"
    d1 = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    d2 = datetime.datetime.strptime(end_time, '%Y-%m-%d')
    delta = d2 - d1
    ccc = delta.days
    print(ccc)
    for i in range(0, int(ccc) + 1):
        tim, threeDayAgosss = time_end_start(i, start_time)
        tim = str(tim).replace("00:00:00", "").replace(" ", "")
        threeDayAgosss = str(threeDayAgosss).replace("00:00:00", "").replace(" ", "")
        print(tim)
        if tim:
            for j in lis:
                print(tim, threeDayAgosss, j)
                get_page(tim, threeDayAgosss, j)
        else:
            time.sleep(60)


# Build the search URL for one day and page through all of its result pages
def get_page(tim, threeDayAgosss, j):
    page = 1
    while True:
        try:
            print("________________ page {} ________________".format(page))
            url = 'https://s.weibo.com/weibo?q={}&typeall=1&suball=1&timescope=custom:{}:{}&Refer=g&page={}'.format(
                j, threeDayAgosss + '-0', tim + '-0', page)
            print("############", url)
            res = get_html(url)
            res = etree.HTML(res.text)
            comment_info(res, j)
            pagss = ''.join(res.xpath("//div[@class='m-page']/div/span/ul/li[last()]//text()"))
            print("!!!!!!!", pagss)
            pages = pagss.replace("第", '').replace("页", '')
            print(pages)
            if pages:
                if page < int(pages):
                    page += 1
                else:
                    break
            else:
                break
        except Exception:
            print("The Weibo cookie has expired, please replace it")
            traceback.print_exc()
            # stop paging this day; retrying with the same expired cookie would loop forever
            break


# Entry point
if __name__ == '__main__':
    lkll = input("Enter a keyword: ")
    run(lkll)
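As a usage note, the script prompts for a keyword, crawls day by day between the hard-coded start_time and end_time inside run(), and appends each post's text as one row of weibo_info.xlsx. For reference, here is a sketch of the search URL that get_page() builds for a single day and page; the keyword, dates, and page number are made-up examples, while the parameter layout comes from the format string in the code above:

# Illustrative only: first results page of one day for an example keyword.
keyword = 'example'
start = '2021-01-01-0'  # previous day, hour suffix fixed to 0
end = '2021-01-02-0'    # current day, hour suffix fixed to 0
url = ('https://s.weibo.com/weibo?q={}&typeall=1&suball=1'
       '&timescope=custom:{}:{}&Refer=g&page={}'.format(keyword, start, end, 1))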