python教程

python微博爬虫脚本分享

我的站长站 2023-06-14 人阅读

Python 微博爬虫脚本:运行后输入关键词即可开始爬取;如需调整爬取的博文时间范围,请修改 run 函数中的 start_time 和 end_time。

运行前需要先获取自己微博账号的 Cookie,并把它填入 get_html 函数中预留的 cookie 字符串位置。

import datetime
import json
import random
import re
import time
import traceback
import pymysql
import requests
from lxml import etree
import urllib3
import openpyxl
urllib3.disable_warnings()
import random
# Pool of browser User-Agent strings; one entry is drawn when the module loads.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
]
# Request headers shared by every request made through this module.
# Cache-Control, Connection, referer and Upgrade-Insecure-Requests are left
# out of the header set.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
# The User-Agent is picked once, at import time, not per request.
headers['User-Agent'] = random.choice(user_agents)
# Fetch a URL, retrying until the server gives a definitive answer
def get_html(url, max_retries=None):
    """Request ``url`` with the shared ``headers`` and return the response.

    The request is repeated until the server answers with status 200 or 404.
    Any other status code, and any network error raised by ``requests``, is
    reported on stdout and followed by another attempt.

    Side effect: the account cookie is stored in the module-level
    ``headers`` dict under the ``'cookie'`` key.

    Args:
        url: Address to fetch.
        max_retries: Maximum number of attempts. ``None`` (the default)
            retries without limit.

    Returns:
        The ``requests`` response object for a 200 answer; an empty string
        for a 404 answer or once ``max_retries`` attempts have been used up.
    """
    # Put the cookie string of your own logged-in Weibo account here.
    headers['cookie'] = '这里需要你的账号的cookies'
    attempt = 0
    while max_retries is None or attempt < max_retries:
        attempt += 1
        print("当前请求url:", url)
        # Pause before every request to keep the request rate low.
        time.sleep(2)
        try:
            # verify=False skips TLS certificate verification; no proxy
            # mapping is passed.
            response = requests.get(url, headers=headers, timeout=10, verify=False)
        except requests.RequestException as e:
            # Network-level failure: report it, back off, then try again.
            print("请求异常: {}  等待代理更新".format(e))
            time.sleep(10)
            continue
        if response.status_code == 200:
            return response
        if response.status_code == 404:
            return ''
        print('请求响应码错误: {}  请求url{}  重新请求'.format(response.status_code, url))
    return ''
# Decode raw page bytes into text
def decodeContent(html):
    """Decode ``html`` to ``str``, guessing the character set when needed.

    The encoding is taken from ``cchardet`` when that package is installed
    and reports a confidence above 0.7; otherwise it is looked up in a
    ``charset=`` declaration inside the markup. GB2312/GBK/GB18030 labels are
    decoded as ``gbk``. When no encoding can be determined, UTF-8 and then
    GBK are tried.

    Args:
        html: Page content as ``bytes``; any other type is returned as is.

    Returns:
        The decoded text, or ``""`` when every decoding attempt fails.
    """
    if not isinstance(html, bytes):
        return html

    # cchardet is optional: without it only the charset declaration is used.
    try:
        import cchardet as chardet
    except ImportError:
        chardet = None

    gbk_list = ("gb2312", "GB2312", "GBK", "GB18030")

    items = []
    if chardet is not None:
        char = chardet.detect(html)
        encoding = char.get("encoding")
        # Guard against a missing/None confidence so the comparison
        # below cannot raise TypeError.
        confidence = char.get("confidence") or 0
        if encoding and confidence > 0.7:
            items = [encoding]

    if not items:
        # latin-1 maps every byte to one character, so the markup can be
        # searched as text without losing or escaping anything.
        text = html.decode('latin-1')
        patterns = (
            r'charset=([^\'\"]*?)[\'\"/\s]*?>',
            r'charset=[\'\"](.*?)[\'\"]',
            r'charset=(.*?)[\'\"]',
        )
        for pattern in patterns:
            items = re.findall(pattern, text)
            if items:
                break

    if items:
        charset = 'gbk' if items[0] in gbk_list else items[0]
        # Strict decode first, then a lossy decode as a fallback.
        attempts = [
            (charset, 'strict'),
            ('gbk' if charset == 'gbk' else 'utf-8', 'ignore'),
        ]
    else:
        attempts = [('utf-8', 'strict'), ('gbk', 'strict'), ('utf-8', 'ignore')]

    for encoding, errors in attempts:
        try:
            return html.decode(encoding, errors)
        except (LookupError, TypeError, ValueError):
            # Unknown codec name, non-string codec name, or undecodable
            # bytes: move on to the next candidate.
            continue
    return ""
# Module-level workbook that receives the extracted post text; the first row
# of its active sheet holds the column header.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Sheet1'
ws.append(["content"])
def comment_info(res, keyword):
    """Extract the post text from a search result page into the workbook.

    Every post body found under the ``pl_feedlist_index`` container is
    printed and appended as one row to the module-level sheet ``ws``. The
    workbook ``wb`` is written to ``weibo_info.xlsx`` once per call, after
    all rows of the page have been added, and only if at least one row was
    added. Failures are reported with a traceback and never propagate to
    the caller.

    Args:
        res: Parsed page exposing ``xpath()``, e.g. the tree returned by
            ``lxml.etree.HTML``.
        keyword: Search keyword the page belongs to. Not used by the
            extraction; kept so existing callers keep working.
    """
    try:
        contents_lis = res.xpath(
            '//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="content"]')
    except Exception:
        traceback.print_exc()
        return
    print(len(contents_lis))

    appended = False
    for node in contents_lis:
        try:
            content = ''.join(node.xpath('p[@node-type="feed_list_content"]//text()')).replace("\n", '').strip()
            print("@@@@@@@@@@@@@@", content)
            ws.append([content])
            appended = True
        except Exception:
            # One broken card must not stop the rest of the page.
            traceback.print_exc()

    if appended:
        try:
            # Saving rewrites the whole file, so do it once per page rather
            # than once per row.
            wb.save('weibo_info.xlsx')
        except Exception:
            traceback.print_exc()
# Day-window helper
def time_end_start(i, start_time):
    """Return the day ``i`` days after ``start_time`` and the day before it.

    Args:
        i: Offset in days from ``start_time``.
        start_time: Base date as a ``'%Y-%m-%d'`` string.

    Returns:
        A ``(day, previous_day)`` tuple of ``datetime.datetime`` objects,
        where ``day`` is ``start_time`` plus ``i`` days.
    """
    base_day = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    one_day = datetime.timedelta(days=1)
    shifted_day = base_day + datetime.timedelta(days=i)
    return shifted_day, shifted_day - one_day
# Program driver
def run(lkll, start_time="2021-01-01", end_time="2022-01-01"):
    """Crawl the search results for one keyword, one day window at a time.

    For every day offset from 0 up to and including the number of days
    between ``start_time`` and ``end_time``, the pair returned by
    ``time_end_start`` is formatted as ``YYYY-MM-DD`` and handed to
    ``get_page`` together with the keyword.

    Args:
        lkll: Search keyword.
        start_time: First day of the crawl as a ``'%Y-%m-%d'`` string.
        end_time: Last day of the crawl as a ``'%Y-%m-%d'`` string.
    """
    keywords = [lkll]
    first_day = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    last_day = datetime.datetime.strptime(end_time, '%Y-%m-%d')
    total_days = (last_day - first_day).days
    print(total_days)
    for offset in range(total_days + 1):
        # NOTE(review): at offset 0 the window starts one day before
        # start_time -- confirm whether that extra day is intended.
        window_end, window_start = time_end_start(offset, start_time)
        tim = window_end.strftime('%Y-%m-%d')
        threeDayAgosss = window_start.strftime('%Y-%m-%d')
        print(tim)
        for keyword in keywords:
            print(tim, threeDayAgosss, keyword)
            get_page(tim, threeDayAgosss, keyword)
# Crawl every result page for one keyword and one day window
def get_page(tim, threeDayAgosss, j, max_failures=3):
    """Fetch and store all search result pages for one keyword and window.

    Pages are requested one after another through ``get_html``; each page is
    parsed with lxml and handed to ``comment_info``. The last entry of the
    page selector gives the number of pages, and the loop stops when that
    page has been processed or when no selector is present.

    Args:
        tim: End day of the window as ``YYYY-MM-DD``.
        threeDayAgosss: Start day of the window as ``YYYY-MM-DD``.
        j: Search keyword.
        max_failures: Number of consecutive failed attempts on the same
            page after which the window is abandoned.
    """
    from urllib.parse import quote

    page = 1
    failures = 0
    while True:
        try:
            print("________________当前第{}页_______________".format(page))
            # The date filter parameter is ``timescope``; the keyword is
            # percent-encoded so characters such as '#' or '&' cannot break
            # the query string.
            url = 'https://s.weibo.com/weibo?q={}&typeall=1&suball=1&timescope=custom:{}:{}&Refer=g&page={}'.format(
                quote(j, safe=''),
                threeDayAgosss + '-0',
                tim + '-0',
                page)
            print("############", url)
            response = get_html(url)
            if response is None or response == '':
                # get_html returns '' when there is nothing to fetch (404).
                break
            res = etree.HTML(response.text)
            # Read the page count before storing anything, so a parsing
            # failure cannot make a retry append the same rows twice.
            pagss = ''.join(res.xpath("//div[@class='m-page']/div/span/ul/li[last()]//text()"))
            print("!!!!!!!", pagss)
            pages = pagss.replace("第", '').replace("页", '')
            print(pages)
            digits = re.search(r'\d+', pages)
            last_page = int(digits.group()) if digits else 0
            comment_info(res, j)
            failures = 0
            if page < last_page:
                page += 1
            else:
                break
        except Exception:
            failures += 1
            print("微博cookie失效,请更换cookie")
            traceback.print_exc()
            if failures >= max_failures:
                # Give up on this window instead of retrying forever.
                break
# Script entry point
if __name__ == '__main__':
    # Ask for the search keyword on the console and start the crawl with it.
    run(input("请输入关键词:"))