add:comments

Sheyiyuan 2025-03-23 22:32:02 +08:00
parent f92b770ce6
commit c54f5a2a18
4 changed files with 63 additions and 52 deletions

.gitignore (vendored)

@@ -1,5 +1,4 @@
 ./venv/
 .idea/
 data/
 */.DS_Store
-targets.txt

main.py

@@ -5,8 +5,36 @@ from bs4 import BeautifulSoup
 from time import sleep
 from random import uniform
 from fake_useragent import UserAgent
+import datetime
 import time
+
+ONEBOT_HOST = "http://sheyiyuan.cn:63000"  # OneBot service address
+USER_ID = "1040843522"
+
+
+def send_notification(error_msg):
+    try:
+        # OneBot message API (group message)
+        url = f"{ONEBOT_HOST}/send_group_msg"
+        data = {
+            "group_id": USER_ID,
+            "message": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + f"[SMA-bilibili监控告警]\n{error_msg}"
+        }
+        requests.post(url, json=data, timeout=5)
+    except Exception as e:
+        print(f"QQ通知发送失败: {e}")
+
+
+def send_info_log(log_msg):
+    try:
+        # OneBot message API (group message)
+        url = f"{ONEBOT_HOST}/send_group_msg"
+        data = {
+            "group_id": USER_ID,
+            "message": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + f"[SMA-bilibili监控日志]\n{log_msg}"
+        }
+        requests.post(url, json=data, timeout=5)
+    except Exception as e:
+        print(f"QQ通知发送失败: {e}")
+
+
 def load_targets():
     """Load the list of BV IDs from targets.txt."""
     try:
@@ -175,14 +203,16 @@ class BiliWebCrawler:
             }
         except Exception as e:
             print(f"解析UP主数据失败: {str(e)}")
+            send_notification(f"解析UP主数据失败: {str(e)}")
             return None

         try:
             # fetch the UP's upload list
             archive_url = f'https://api.bilibili.com/x/space/arc/search?mid={mid}&ps=30'
+            sleep(5)
             archive_resp = self._safe_request(archive_url)
             if archive_resp and archive_resp.status_code == 200:
                 archive_data = archive_resp.json()
-                print(archive_data)
+                # print(archive_data)
                 videos = archive_data.get('data', {}).get('list', {}).get('vlist', [])

                 # timestamp for 30 days ago
@@ -192,12 +222,14 @@ class BiliWebCrawler:
                 up_info['近一个月投稿数'] = recent_count
         except Exception as e:
             print(f"获取投稿数据失败: {str(e)}")
+            send_notification(f"获取投稿数据失败: {str(e)}")

         return up_info

     def get_danmaku(self):
         """Fetch the video's danmaku data."""
         if not self.bvid:
+            send_notification("未找到 BVID无法获取弹幕数据。")
             print("未找到 BVID无法获取弹幕数据。")
             return []
@@ -209,6 +241,7 @@ class BiliWebCrawler:
             data = resp.json()
             cid = data.get('data', {}).get('cid')
             if not cid:
+                send_notification("未找到 cid无法获取弹幕数据。")
                 print("未找到 cid无法获取弹幕数据。")
                 return []
@@ -231,8 +264,9 @@ class BiliWebCrawler:
         return danmaku

     def get_comments(self, max_pages=1000):
-        """Fetch hot comments, including second-level replies."""
+        """Fetch hot comments, keeping only first-level comments."""
         if not self.aid:
+            send_notification("未找到视频 ID无法获取评论数据。")
             print("未找到视频 ID无法获取评论数据。")
             return []

         comments = []
@@ -256,49 +290,19 @@ class BiliWebCrawler:
                                '回复时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(comment['ctime']))
                            }
                            comments.append(comment_info)
-                            replies = self.fetch_comment_replies(comment['rpid'], comment['member']['uname'])
-                            comments.extend(replies)
+                            # second-level reply fetching removed
                 else:
                     # stop when the current page has no comments
                     break
             except requests.RequestException as e:
+                send_notification(f"评论请求出错: {e}")
                 print(f"请求出错: {e}")
                 break
             # throttle between requests
-            sleep(1)
+            sleep(5)
         return comments

-    def fetch_comment_replies(self, comment_id, parent_user_name, max_pages=1000):
-        replies = []
-        for page in range(1, max_pages + 1):
-            url = f'https://api.bilibili.com/x/v2/reply/reply?oid={self.aid}&type=1&root={comment_id}&ps=10&pn={page}'
-            try:
-                response = self._safe_request(url)
-                if response and response.status_code == 200:
-                    response.encoding = 'utf-8'
-                    data = response.json()
-                    if data and data.get('data') and data['data'].get('replies'):
-                        for reply in data['data']['replies']:
-                            reply_info = {
-                                '用户昵称': reply['member']['uname'],
-                                '评论内容': reply['content']['message'],
-                                '被回复用户': parent_user_name,
-                                '评论层级': '二级评论',
-                                '性别': reply['member']['sex'],
-                                '用户当前等级': reply['member']['level_info']['current_level'],
-                                '点赞数量': reply['like'],
-                                '回复时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(reply['ctime']))
-                            }
-                            replies.append(reply_info)
-                    else:
-                        # stop when the current page has no second-level replies
-                        break
-            except requests.RequestException as e:
-                print(f"请求二级评论出错: {e}")
-                break
-            # throttle between requests
-            sleep(1)
-        return replies
+    # fetch_comment_replies method removed (no longer needed)

     def _parse_count(self, text):
         """Normalize count text."""
@@ -338,24 +342,30 @@ class BiliWebCrawler:
             video_info_filename = os.path.join(video_dir, f'{self.bvid}_{play_count}_info.csv')
             self.save_to_csv([video_info], video_info_filename)
+            sleep(5)

             # new: danmaku crawling
             print("正在抓取弹幕数据...")
-            #danmaku = self.get_danmaku()
-            #danmaku_filename = os.path.join(video_dir, f'{self.bvid}_{len(danmaku)}_danmaku.csv')
-            #self.save_to_csv(danmaku, danmaku_filename)
+            danmaku = self.get_danmaku()
+            danmaku_filename = os.path.join(video_dir, f'{self.bvid}_{len(danmaku)}_danmaku.csv')
+            self.save_to_csv(danmaku, danmaku_filename)
+            sleep(5)

             # new: comment crawling
             print("正在抓取评论数据...")
-            #comments = self.get_comments()
-            #comments_filename = os.path.join(video_dir, f'{self.bvid}_{len(comments)}_comments.csv')
-            #self.save_to_csv(comments, comments_filename)
+            comments = self.get_comments()
+            comments_filename = os.path.join(video_dir, f'{self.bvid}_{len(comments)}_comments.csv')
+            self.save_to_csv(comments, comments_filename)
+            sleep(5)

             # new: record UP info
             print("正在获取UP主信息...")
-            #up_info = self.get_up_info(video_info.get('up主UID'))
-            #up_info['BV号'] = self.bvid
-            #up_csv_path = os.path.join(base_dir, 'up_info.csv')
-            #self.save_to_csv([up_info], up_csv_path, mode='a')
+            up_info = self.get_up_info(video_info.get('up主UID'))
+            up_info['BV号'] = self.bvid
+            up_csv_path = os.path.join(base_dir, 'up_info.csv')
+            self.save_to_csv([up_info], up_csv_path, mode='a')

             print(f"抓取完成！结果已保存到 {video_dir}/")
         else:
@@ -363,13 +373,15 @@ class BiliWebCrawler:

 if __name__ == "__main__":
+    cookie = "buvid3=669D9192-9030-AE04-8149-45A24D82CBB985872infoc; b_nut=1728112285; _uuid=BDD29A64-331010-1578-7AB2-6985DCD1EC10586028infoc; enable_web_push=DISABLE; buvid4=02E86127-F707-C9D6-1E0E-62127CDB94EA86683-024100507-5HwKLZoKiRRAzdUiyP1DtUb99jVdMKHnip8nMCxMvnyDueJx41kzeR6uEnG0C2HY; DedeUserID=399209972; DedeUserID__ckMd5=9ad9de58e979dbdf; header_theme_version=CLOSE; rpdid=0zbfAHZlOT|utNc6ahG|2XT|3w1SWYYy; buvid_fp_plain=undefined; hit-dyn-v2=1; is-2022-channel=1; LIVE_BUVID=AUTO7717291772934669; PVID=2; enable_feed_channel=ENABLE; CURRENT_QUALITY=80; fingerprint=262ed395815a48ea928a2b3cf305da95; buvid_fp=262ed395815a48ea928a2b3cf305da95; bsource=search_bing; CURRENT_FNVAL=4048; bp_t_offset_399209972=1046334052841291776; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NDI3ODE1NzgsImlhdCI6MTc0MjUyMjMxOCwicGx0IjotMX0.yseN1sbG5qDIlo5L0mUKqRr2tCL0OqPBnSnnHUYCWtE; bili_ticket_expires=1742781518; home_feed_column=5; browser_resolution=1702-986; b_lsid=EAE310D10E_195C32A76ED; SESSDATA=d0449397%2C1758291057%2C666e5%2A32CjDmj_WpEXUbv2oTIls3PPvM1wODDzR-gnPMRP5gwm09UIZ7YGdhTsJ7ssNg5Tb19qESVmlUMktwdlhkTnNlM0dTU05kZlBOTERNM3JqeTVSNGFaRzdwZmtNWUpjTHhPWmdxTzJiWmdDSVZxNkpEb0VGNHJFTEdaTlJtcVhBUlMzbEZJTzdCeXNnIIEC; bili_jct=e7f4029b2be38fe915678f49aa5b36f7; sid=nxbw3hdh"
     # batch-process the BV IDs from targets.txt
     targets = load_targets()
     if not targets:
         print("未找到有效的BV号程序退出")
         exit()

+    send_info_log(f"开始批量处理 {len(targets)} 个视频")
     for bvid in targets:
         print(f"\n{'=' * 30} 开始处理 {bvid} {'=' * 30}")
-        crawler = BiliWebCrawler(f"https://www.bilibili.com/video/{bvid}")
+        crawler = BiliWebCrawler(f"https://www.bilibili.com/video/{bvid}", cookie)
         crawler.run()
+        sleep(5)
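Both new helpers, send_notification and send_info_log, push a timestamped message through the OneBot HTTP API's /send_group_msg endpoint. A minimal standalone sketch of that call follows; the host URL and group ID are placeholders, and it assumes an OneBot v11-compatible HTTP server is listening there:

import datetime
import requests

ONEBOT_HOST = "http://127.0.0.1:63000"  # placeholder: OneBot HTTP server
GROUP_ID = "123456789"                  # placeholder: target QQ group

def push_group_message(text: str) -> None:
    """Send a timestamped line to a QQ group via OneBot's /send_group_msg."""
    payload = {
        "group_id": GROUP_ID,
        "message": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + f" [SMA-bilibili] {text}",
    }
    try:
        # OneBot v11 HTTP API: POST /send_group_msg with a JSON body
        requests.post(f"{ONEBOT_HOST}/send_group_msg", json=payload, timeout=5)
    except requests.RequestException as e:
        # a failed notification should never crash the crawler itself
        print(f"notification failed: {e}")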

targets.txt

@@ -561,4 +561,4 @@ BV1KLXVYREoh
 BV1jFQrY1EtZ
 BV1PxXLYdEvh
 BV1XCQNYzEs5
 BV1kcQBY7Erz