python教程

python微博爬虫脚本分享

我的站长站 2023-06-14 人阅读

Python 微博爬虫脚本:运行后输入关键词即可使用;爬取的博文时间范围通过 run 函数中的 start_time 和 end_time 调整。

使用前需要先获取自己微博账号的 Cookie,并填入 get_html 函数中 headers['cookie'] 的位置。

import datetime
import json
import random
import re
import time
import traceback
import pymysql
import requests
from lxml import etree
import urllib3
import openpyxl
urllib3.disable_warnings()
import random
# Pool of desktop browser User-Agent strings the script can present.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
]

# Base headers shared by every request made by this script.
# Cache-Control, Connection, referer and Upgrade-Insecure-Requests were left
# disabled by the original author and are not sent.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
# The User-Agent is drawn once, when the module is loaded, and then reused
# for the whole run (it is not re-rolled per request).
headers['User-Agent'] = random.choice(user_agents)
# Fetch one URL with the logged-in cookie attached
def get_html(url, max_retries=5):
    """Request ``url`` and return the response, retrying a bounded number of times.

    Args:
        url: Address to fetch with a GET request.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The ``requests`` response object when the server answers 200.
        An empty string when the server answers 404, or when every attempt
        failed (same "no page" value in both cases, so callers need only
        one check).
    """
    # The cookie is written into the module-level ``headers`` dict, so it is
    # shared with any other code that reads ``headers``.
    headers['cookie'] = '这里需要你的账号的cookies'

    for attempt in range(1, max_retries + 1):
        print("当前请求url:", url)
        # Fixed pause before every request to keep the request rate low.
        time.sleep(2)
        try:
            # NOTE(review): verify=False turns off TLS certificate checking.
            response = requests.get(url, headers=headers, timeout=10, verify=False, proxies='')
        except requests.exceptions.RequestException as e:
            # Show the actual failure instead of silently swallowing it.
            print("等待代{过}{滤}理更新", repr(e))
            if attempt < max_retries:
                time.sleep(10)
            continue

        if response.status_code == 200:
            return response
        if response.status_code == 404:
            return ''
        print('请求响应吗错误: {}  请求url{}  重新请求'.format(response.status_code, url))

    # Previously this loop never terminated on a persistent error status.
    print('超过最大重试次数({}), 放弃请求: {}'.format(max_retries, url))
    return ''
# Decode raw page bytes into text
def _sniff_charset(html):
    """Return the charset name to try first for ``html`` bytes, or ``None``."""
    try:
        import cchardet as chardet
    except ImportError:
        # The detector is optional; without it only the declared charset is used.
        chardet = None

    if chardet is not None:
        guess = chardet.detect(html)
        encoding = guess.get("encoding")
        confidence = guess.get("confidence")
        # Both fields may be None when nothing could be detected (e.g. empty
        # input); comparing None with 0.7 would raise TypeError.
        if encoding and confidence is not None and confidence > 0.7:
            return encoding

    # Fall back to a charset declared in the markup. The patterns run on the
    # repr of the bytes, from the strictest form to the loosest.
    text = str(html)
    patterns = (
        r'charset=([^\'\"]*?)[\'\"/\s]*?>',
        r'charset=[\'\"](.*?)[\'\"]',
        r'charset=(.*?)[\'\"]',
    )
    for pattern in patterns:
        found = re.findall(pattern, text)
        if found:
            return found[0]
    return None


def decodeContent(html):
    """Decode ``html`` to ``str`` using a detected or declared charset.

    Args:
        html: Page content. Anything that is not ``bytes`` is returned as is.

    Returns:
        The decoded text. When a charset is known and strict decoding fails,
        the bytes are decoded leniently (undecodable bytes dropped) as GBK if
        the charset was a GBK variant, otherwise as UTF-8. When no charset is
        known, UTF-8 and GBK are tried strictly, then UTF-8 leniently.
    """
    if not isinstance(html, bytes):
        return html

    charset = _sniff_charset(html)
    if charset is not None:
        # All GB-family names are decoded with the 'gbk' codec.
        if charset in ("gb2312", "GB2312", "GBK", "GB18030"):
            charset = 'gbk'
        try:
            return html.decode(charset)
        except (LookupError, ValueError):
            # LookupError: unknown codec name; ValueError covers decode errors.
            fallback = 'gbk' if charset == 'gbk' else 'utf-8'
            return html.decode(fallback, 'ignore')

    for candidate in ('utf-8', 'gbk'):
        try:
            return html.decode(candidate)
        except UnicodeDecodeError:
            continue
    return html.decode('utf-8', 'ignore')
# Workbook that collects the scraped post text: a single sheet named
# 'Sheet1' whose first row is the column header.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Sheet1'
ws.append(["content"])
def comment_info(res, keyword):
    """Append the text of every post on a parsed search page to the sheet.

    Args:
        res: Parsed page exposing ``xpath`` (the caller passes an lxml tree).
        keyword: Search keyword of the page; accepted for the caller's
            convenience but not used here.

    Side effects:
        Adds one row per post to the module-level ``ws`` sheet and, if any
        row was added, saves the module-level ``wb`` to 'weibo_info.xlsx'.
        Failures are printed as tracebacks and never propagate.
    """
    try:
        posts = res.xpath(
            '//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="content"]')
    except Exception:
        # Previously swallowed silently; keep going but show what went wrong.
        traceback.print_exc()
        return

    print(len(posts))
    appended = 0
    for post in posts:
        try:
            content = ''.join(post.xpath('p[@node-type="feed_list_content"]//text()')).replace("\n", '').strip()
            print("@@@@@@@@@@@@@@", content)
            ws.append([content])
            appended += 1
        except Exception:
            # One bad post must not stop the rest of the page.
            traceback.print_exc()

    # Save once per page rather than after every row: each save rewrites the
    # whole file, so per-row saving grows quadratically with the row count.
    if appended:
        try:
            wb.save('weibo_info.xlsx')
        except Exception:
            traceback.print_exc()
# Day-window helper
def time_end_start(i, start_time):
    """Return the day ``i`` days after ``start_time`` and the day before it.

    Args:
        i: Offset in days from ``start_time``.
        start_time: Base date as a 'YYYY-MM-DD' string.

    Returns:
        A ``(day, previous_day)`` tuple of ``datetime.datetime`` objects,
        where ``day`` is the base date plus ``i`` days and ``previous_day``
        is one day earlier than ``day``.
    """
    base = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    one_day = datetime.timedelta(days=1)
    current_day = base + datetime.timedelta(days=i)
    return current_day, current_day - one_day
# Drive the crawl one day at a time
def run(lkll, start_time="2021-01-01", end_time="2022-01-01"):
    """Crawl search results for a keyword, one one-day window per request series.

    Args:
        lkll: Search keyword.
        start_time: First day of the range, 'YYYY-MM-DD'.
        end_time: Last day of the range, 'YYYY-MM-DD'.

    For every day from ``start_time`` to ``end_time`` inclusive, calls
    ``get_page`` with that day and the day before it.
    """
    keywords = [lkll]
    first_day = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    last_day = datetime.datetime.strptime(end_time, '%Y-%m-%d')
    total_days = (last_day - first_day).days
    print(total_days)

    for offset in range(total_days + 1):
        day, previous_day = time_end_start(offset, start_time)
        # NOTE(review): at offset 0 the window begins one day before
        # start_time; confirm whether that extra day is intended.
        day_text = day.strftime('%Y-%m-%d')
        previous_text = previous_day.strftime('%Y-%m-%d')
        print(day_text)
        for keyword in keywords:
            print(day_text, previous_text, keyword)
            get_page(day_text, previous_text, keyword)
# Crawl every result page for one keyword and one day window
def get_page(tim, threeDayAgosss, j):
    """Walk the paginated search results for a keyword within a day window.

    Args:
        tim: End day of the window, 'YYYY-MM-DD'.
        threeDayAgosss: Start day of the window, 'YYYY-MM-DD'.
        j: Search keyword.

    Each page is fetched with ``get_html``, parsed, and handed to
    ``comment_info``. Paging stops at the last page shown in the pager, when
    no pager is present, when the fetch yields no page, or after several
    consecutive failures.
    """
    from urllib.parse import quote

    max_failures = 3  # consecutive errors tolerated before skipping the window
    failures = 0
    page = 1
    # Quote the keyword so characters such as '#' or '&' stay inside the
    # ``q`` parameter instead of cutting the query string short.
    keyword = quote(j, safe='')

    while True:
        print("________________当前第{}页_______________".format(page))
        # The original literal contained "×cope", an HTML-entity corruption
        # of "&timescope", so the date window was never sent as a parameter.
        url = 'https://s.weibo.com/weibo?q={}&typeall=1&suball=1&timescope=custom:{}:{}&Refer=g&page={}'.format(
            keyword, threeDayAgosss + '-0', tim + '-0', page)
        print("############", url)

        try:
            response = get_html(url)
            if response is None or response == '':
                # get_html signals "no page" with an empty string.
                break
            tree = etree.HTML(response.text)
            comment_info(tree, j)
            pagss = ''.join(tree.xpath("//div[@class='m-page']/div/span/ul/li[last()]//text()"))
        except Exception:
            print("微博cookie失效,请更换cookie")
            traceback.print_exc()
            failures += 1
            if failures >= max_failures:
                # Previously the same page was retried forever.
                print("连续失败{}次, 跳过当前时间段".format(failures))
                break
            continue

        failures = 0
        print("!!!!!!!", pagss)
        pages = pagss.replace("第", '').replace("页", '').strip()
        print(pages)
        # No pager, or a pager label that is not a number: nothing more to fetch.
        if not pages.isdecimal() or page >= int(pages):
            break
        page += 1
# Script entry point: ask for a keyword, then start the crawl.
if __name__ == '__main__':
    run(input("请输入关键词:"))


相关推荐
  • python爬虫
  • Python脚本
  • Python好看视频地址解析下载代码

    #encoding:utf-8# 好看视频下载 import socketfrom urllib.request import urlopenimport urllibimport reimport timefrom pyquery import PyQuery as pqimport requestsfrom tqdm import tqdm # 打印进度条的库import gzip print('程序开始运...

    python教程 138 3年前
  • python美女写真图库爬虫

    import requestsfrom lxml import etreeimport csvfrom time import sleepimport osfrom concurrent.futures import ThreadPoolExecutor headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit...

    python教程 56 3年前
  • 笔趣阁小说网Python爬虫分享

    #[url=https://www.biquge.info/wanjiexiaoshuo/]https://www.biquge.info/wanjiexiaoshuo/[/url] 笔趣阁小说全本爬虫import timeimport requestsimport osimport randomfrom lxml import etreeimport webbrowserheader = { "User-Agent": "Mo...

    python教程 168 3年前
  • 监测腾讯云轻量服务器流量超标关机python脚本

    脚本介绍一款监测腾讯云轻量应用服务器流量包使用情况,并根据配置进行警告和关机的Python脚本。GitHub:https://github.com/XiaoXinYo/Tencent_Cloud_LightHouse_Server_Guardian脚本功能仅用于轻量级服务器1.自动检测流量包剩余,可设置使用比2.自动关...

    python教程 100 2年前
  • Python无需认证QQ扫码登录脚本

    无需认证QQ扫码登录脚本python脚本,盗用JD的QQ登录,也可以改成其他网址。无需自己注册腾讯开发者,无需自己有一套网址去申请应用Get_QQ返回QQ号,也可以获取到QQ头像、好友等其他信息,请勿用于非法行为import requestsimport timefrom PIL import Imagedef...

    python教程 362 3年前
  • 最新python织梦dedecms远程执行脚本

    织梦CMS是使用最多的CMS之一,但是漏洞也非常多。分享一款python写的织梦远程文件包含漏洞。修复此漏洞方法,请见文章底部。织梦CMS漏洞代码#! /usr/bin/env python#coding=utf-8#Joseph(小续)import requestsimport sysimport redef main():try:url="...

    服务器配置 298 5年前
最新更新