SAM-bilibili/main.py

445 lines
18 KiB
Python
Raw Permalink Normal View History

2025-03-18 19:13:56 +08:00
import requests
import re
import csv
from bs4 import BeautifulSoup
from time import sleep
from random import uniform
from fake_useragent import UserAgent
2025-03-23 22:32:02 +08:00
import datetime
2025-03-18 19:13:56 +08:00
import time
2025-03-23 22:32:02 +08:00
ONEBOT_HOST = "http://sheyiyuan.cn:63000" # OneBot服务地址
USER_ID = "1040843522"
def send_notification(error_msg):
try:
# OneBot私信消息协议
url = f"{ONEBOT_HOST}/send_group_msg"
data = {
"group_id": USER_ID,
"message": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + f"[SMA-bilibili监控告警]\n{error_msg}"
}
requests.post(url, json=data, timeout=5)
except Exception as e:
print(f"QQ通知发送失败: {e}")
def send_info_log(log_msg):
try:
# OneBot私信消息协议
url = f"{ONEBOT_HOST}/send_group_msg"
data = {
"group_id": USER_ID,
"message": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + f"[SMA-bilibili监控日志]\n{log_msg}"
}
requests.post(url, json=data, timeout=5)
except Exception as e:
print(f"QQ通知发送失败: {e}")
2025-03-18 19:13:56 +08:00
def load_targets():
"""从targets.txt加载BV号列表"""
try:
with open('targets.txt', 'r', encoding='utf-8') as f:
return [line.strip() for line in f if line.strip().startswith('BV')]
except FileNotFoundError:
print("未找到targets.txt文件请创建文件并添加BV号")
return []
class BiliWebCrawler:
def __init__(self, url, cookie=None):
self.url = url
self.bvid = self._extract_bvid()
self.user_agent = UserAgent()
self.headers = {
2025-03-24 09:28:43 +08:00
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Referer': f'https://www.bilibili.com/video/{self.bvid}',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
2025-03-18 19:13:56 +08:00
}
if cookie:
self.headers['Cookie'] = cookie
self.session = requests.Session()
self.aid = self._get_video_id()
def _extract_bvid(self):
"""从视频链接中提取 BVID"""
match = re.search(r'(BV[A-Za-z0-9]+)', self.url)
if match:
return match.group(1)
return None
def _get_video_id(self):
if not self.bvid:
return None
url = f'https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}'
resp = self._safe_request(url)
if resp:
data = resp.json()
return data.get('data', {}).get('aid')
return None
def _safe_request(self, url, params=None, retry=3): # 添加 params 参数
"""带重试机制的请求"""
for i in range(retry):
try:
resp = self.session.get(url, headers=self.headers, timeout=10, params=params) # 使用 params 参数
resp.raise_for_status()
return resp
except Exception as e:
print(f"请求失败: {str(e)}, 第{i + 1}次重试...")
2025-03-23 22:36:33 +08:00
send_notification(f"请求失败: {str(e)}, 第{i + 1}次重试...")
2025-03-18 19:13:56 +08:00
sleep(uniform(1, 3))
return None
def get_video_info(self):
"""获取视频基本信息"""
if not self.bvid:
return None
# 获取视频基本信息
url = f'https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}'
resp = self._safe_request(url)
if not resp:
return None
data = resp.json()
video_data = data.get('data')
if not video_data:
return None
# 获取视频最高分辨率基于dimension对象
max_width = 0
max_height = 0
2025-03-19 09:30:30 +08:00
dimension = video_data.get('dimension', {})
width = dimension.get('width', 0)
height = dimension.get('height', 0)
rotate = dimension.get('rotate', 0)
2025-03-18 19:13:56 +08:00
2025-03-19 09:30:30 +08:00
# 处理视频旋转当rotate=1时宽高互换
if rotate == 1:
width, height = height, width
2025-03-18 19:13:56 +08:00
2025-03-19 09:30:30 +08:00
# 通过像素总量比较分辨率
if (width * height) > (max_width * max_height):
max_width = width
max_height = height
2025-03-18 19:13:56 +08:00
# 将分辨率格式化为 "宽x高" 的字符串
resolution_str = f"{max_width}x{max_height}" if max_width and max_height else "未知"
2025-03-24 10:14:31 +08:00
sleep(uniform(1, 2))
2025-03-18 19:13:56 +08:00
# 获取视频标签
tag_url = f'https://api.bilibili.com/x/web-interface/view/detail/tag?bvid={self.bvid}'
tag_resp = self._safe_request(tag_url)
tag_data = []
2025-03-24 09:35:14 +08:00
try:
if tag_resp:
tag_json = tag_resp.json()
tag_data = [tag['tag_name'] for tag in tag_json.get('data', [])]
except Exception as e:
print(f"获取视频标签失败: {str(e)}")
send_notification(f"获取视频{self.bvid}标签失败: {str(e)}")
2025-03-24 10:14:31 +08:00
if tag_resp:
send_info_log(f"原始响应内容: {tag_resp.text[:500]}")
2025-03-18 19:13:56 +08:00
2025-03-24 08:25:37 +08:00
subtitle_data = []
try:
subtitle_raw = video_data.get('subtitle', {}).get('list', [])
except Exception as e:
subtitle_raw = []
for subtitle in subtitle_raw:
subtitle_data.append({
'语言': subtitle.get('lan_doc', ''),
'字幕数量': subtitle.get('subtitles', []),
'字幕URL': subtitle.get('subtitle_url', ''),
})
honor_data = []
2025-03-24 09:35:14 +08:00
try:
honors = video_data.get('honor', [])
if isinstance(honors, list): # 确保是列表类型
for honor in honors:
if isinstance(honor, dict): # 检查是否为字典类型
honor_data.append({
'type': honor.get('type', 0),
'desc': honor.get('desc', '')
})
# 如果honors是字符串则直接记录
elif isinstance(honors, str):
honor_data.append({'desc': honors})
except Exception as e:
honor_data = []
2025-03-24 08:25:37 +08:00
2025-03-24 09:10:24 +08:00
# 获取staff列表
try:
staff= video_data.get('staff', [])
if len(staff)>0:
is_unity_up = True
else:
is_unity_up = False
except Exception as e:
is_unity_up = False
2025-03-18 19:13:56 +08:00
info = {
2025-03-19 09:30:30 +08:00
'BV号': self.bvid,
2025-03-18 19:13:56 +08:00
'title': video_data.get('title', ''),
2025-03-19 09:30:30 +08:00
'up主名称': video_data.get('owner', {}).get('name', ''), # 新增字段
'up主UID': video_data.get('owner', {}).get('mid', ''), # 新增UID字段
2025-03-18 19:13:56 +08:00
'播放量': video_data.get('stat', {}).get('view', 0),
'弹幕量': video_data.get('stat', {}).get('danmaku', 0),
'点赞量': video_data.get('stat', {}).get('like', 0),
'投币量': video_data.get('stat', {}).get('coin', 0),
'收藏量': video_data.get('stat', {}).get('favorite', 0),
2025-03-19 09:30:30 +08:00
'分享量': video_data.get('stat', {}).get('share', 0),
'评论量': video_data.get('stat', {}).get('reply', 0),
'发布时间的timestamp': video_data.get('pubdate', 0),
2025-03-24 08:25:37 +08:00
'视频荣誉': honor_data,
2025-03-18 19:13:56 +08:00
'发布时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(video_data.get('pubdate', 0))),
'分区': video_data.get('tname', ''),
'标签': tag_data,
2025-03-24 09:10:24 +08:00
'是否为联合投稿': is_unity_up,
2025-03-19 09:30:30 +08:00
'视频方向': self._get_video_orientation(video_data.get('dimension', {})),
2025-03-18 19:13:56 +08:00
'视频最高分辨率': resolution_str,
2025-03-19 09:30:30 +08:00
'视频类型': ["","自制", "转载"][video_data.get('copyright', 0)],
'视频分p数': len(video_data.get('pages', [])),
2025-03-24 08:25:37 +08:00
'视频字幕': subtitle_data,
2025-03-19 09:30:30 +08:00
'视频总时长': self.get_video_length(video_data.get('pages', [])),
2025-03-23 19:33:37 +08:00
'视频封面URL': video_data.get('pic', ''),
2025-03-19 09:30:30 +08:00
'简介': video_data.get('desc', '').replace('\n', '\\n'),
2025-03-24 08:25:37 +08:00
2025-03-18 19:13:56 +08:00
}
return info
2025-03-19 09:30:30 +08:00
def get_video_length(self,pages):
"""获取视频总时长"""
length = 0
for page in pages:
length += page.get('duration', 0)
return length
def _get_video_orientation(self, dimension):
"""判断视频方向(横屏/竖屏)"""
width = dimension.get('width', 0)
height = dimension.get('height', 0)
rotate = dimension.get('rotate', 0)
# 处理视频旋转90度或270度旋转时需要交换宽高
if rotate in [1, 3]:
width, height = height, width
return "横屏" if width >= height else "竖屏"
# 在类方法中添加以下新方法(建议放在 get_video_info 方法之后)
def get_up_info(self, mid):
"""获取UP主详细信息"""
if not mid:
return None
url = f"https://api.bilibili.com/x/web-interface/card?mid={mid}&photo=false"
resp = self._safe_request(url)
if not resp:
return None
try:
data = resp.json().get('data', {})
card = data.get('card')
up_info = {
'uid': mid,
'昵称': card['name'],
'性别': card['sex'],
'头像': card['face'],
'等级': card['level_info']['current_level'],
'粉丝数': card['fans'],
'稿件数': data['archive_count'],
'获赞数': data['like_num'],
}
except Exception as e:
print(f"解析UP主数据失败: {str(e)}")
2025-03-23 22:32:02 +08:00
send_notification(f"解析UP主数据失败: {str(e)}")
2025-03-19 09:30:30 +08:00
return None
2025-03-24 07:35:33 +08:00
# try:
# # 获取投稿列表
# archive_url = f'https://api.bilibili.com/x/space/arc/search?mid={mid}&ps=30'
# sleep(1)
# archive_resp = self._safe_request(archive_url)
# if archive_resp and archive_resp.status_code == 200:
# archive_data = archive_resp.json()
# # print(archive_data)
# videos = archive_data.get('data', {}).get('list', {}).get('vlist', [])
#
# # 计算30天前的时间戳
# month_ago = time.time() - 30 * 86400
# # 统计符合时间条件的视频
# recent_count = sum(1 for v in videos if v.get('created') > month_ago)
# up_info['近一个月投稿数'] = recent_count
# except Exception as e:
# print(f"获取投稿数据失败: {str(e)}")
# send_notification(f"获取投稿数据失败: {str(e)}")
2025-03-19 09:30:30 +08:00
return up_info
2025-03-18 19:13:56 +08:00
def get_danmaku(self):
"""获取弹幕数据"""
if not self.bvid:
2025-03-23 22:32:02 +08:00
send_notification("未找到 BVID无法获取弹幕数据。")
2025-03-18 19:13:56 +08:00
print("未找到 BVID无法获取弹幕数据。")
return []
url = f"https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}"
resp = self._safe_request(url)
if not resp:
return []
data = resp.json()
cid = data.get('data', {}).get('cid')
if not cid:
2025-03-23 22:32:02 +08:00
send_notification("未找到 cid无法获取弹幕数据。")
2025-03-18 19:13:56 +08:00
print("未找到 cid无法获取弹幕数据。")
return []
danmaku_url = f'https://comment.bilibili.com/{cid}.xml'
resp = self._safe_request(danmaku_url)
if not resp:
return []
danmaku = []
soup = BeautifulSoup(resp.content, 'lxml-xml')
for d in soup.find_all('d'):
attrs = d['p'].split(',')
danmaku.append({
'时间': float(attrs[0]),
'模式': attrs[1],
'颜色': f'#{int(attrs[3]):06X}',
'弹幕内容': d.text
})
return danmaku
2025-03-24 09:10:24 +08:00
def get_comments(self, max_pages=5):
2025-03-23 22:32:02 +08:00
"""获取热门评论,仅保留一级评论"""
2025-03-18 19:13:56 +08:00
if not self.aid:
2025-03-23 22:32:02 +08:00
send_notification("未找到视频 ID无法获取评论数据。")
2025-03-18 19:13:56 +08:00
print("未找到视频 ID无法获取评论数据。")
return []
comments = []
for page in range(1, max_pages + 1):
url = f'https://api.bilibili.com/x/v2/reply?pn={page}&type=1&oid={self.aid}&sort=2'
try:
response = self._safe_request(url)
if response and response.status_code == 200:
response.encoding = 'utf-8'
data = response.json()
if data and data.get('data') and data['data'].get('replies'):
for comment in data['data']['replies']:
comment_info = {
'用户昵称': comment['member']['uname'],
'评论内容': comment['content']['message'],
'被回复用户': '',
'评论层级': '一级评论',
'性别': comment['member']['sex'],
'用户当前等级': comment['member']['level_info']['current_level'],
'点赞数量': comment['like'],
'回复时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(comment['ctime']))
}
comments.append(comment_info)
2025-03-23 22:32:02 +08:00
# 移除获取二级评论的代码
2025-03-18 19:13:56 +08:00
else:
# 当当前页没有评论时,跳出循环
break
except requests.RequestException as e:
2025-03-23 22:32:02 +08:00
send_notification(f"评论请求出错: {e}")
2025-03-18 19:13:56 +08:00
print(f"请求出错: {e}")
break
# 适当调整请求间隔
2025-03-24 07:35:33 +08:00
sleep(1)
2025-03-18 19:13:56 +08:00
return comments
2025-03-23 22:32:02 +08:00
# 移除 fetch_comment_replies 方法(原方法不再需要)
2025-03-18 19:13:56 +08:00
def _parse_count(self, text):
"""统一处理数量文本"""
if '' in text:
return int(float(text.replace('', '')) * 10000)
return int(text)
def save_to_csv(self, data, filename, mode='w'):
"""保存数据到CSV"""
if not data:
return
keys = data[0].keys()
with open(filename, mode, newline='', encoding='utf-8-sig') as f:
writer = csv.DictWriter(f, fieldnames=keys)
if f.tell() == 0: # 新文件写入表头
writer.writeheader()
writer.writerows(data)
def run(self):
"""执行完整流程"""
print("正在获取视频基本信息...")
video_info = self.get_video_info()
if video_info:
import os
partition = video_info.get('分区', '其他')
base_dir = os.path.join('data', partition)
video_dir = os.path.join(base_dir, self.bvid)
os.makedirs(base_dir, exist_ok=True)
os.makedirs(video_dir, exist_ok=True)
# 保存视频信息
info_csv_path = os.path.join(base_dir, 'info.csv')
self.save_to_csv([video_info], info_csv_path, mode='a')
play_count = video_info.get('播放量', 0)
video_info_filename = os.path.join(video_dir, f'{self.bvid}_{play_count}_info.csv')
self.save_to_csv([video_info], video_info_filename)
2025-03-24 07:35:33 +08:00
sleep(1)
2025-03-23 22:32:02 +08:00
2025-03-18 19:13:56 +08:00
# 新增弹幕抓取
print("正在抓取弹幕数据...")
2025-03-23 22:32:02 +08:00
danmaku = self.get_danmaku()
danmaku_filename = os.path.join(video_dir, f'{self.bvid}_{len(danmaku)}_danmaku.csv')
self.save_to_csv(danmaku, danmaku_filename)
2025-03-24 07:35:33 +08:00
sleep(1)
2025-03-18 19:13:56 +08:00
# 新增评论抓取
print("正在抓取评论数据...")
2025-03-23 22:32:02 +08:00
comments = self.get_comments()
comments_filename = os.path.join(video_dir, f'{self.bvid}_{len(comments)}_comments.csv')
self.save_to_csv(comments, comments_filename)
2025-03-24 07:35:33 +08:00
sleep(1)
2025-03-18 19:13:56 +08:00
2025-03-19 09:30:30 +08:00
# 新增UP主信息记录
print("正在获取UP主信息...")
2025-03-23 22:32:02 +08:00
up_info = self.get_up_info(video_info.get('up主UID'))
up_info['BV号'] = self.bvid
up_csv_path = os.path.join(base_dir, 'up_info.csv')
self.save_to_csv([up_info], up_csv_path, mode='a')
2025-03-19 09:30:30 +08:00
2025-03-18 19:13:56 +08:00
print(f"抓取完成!结果已保存到 {video_dir}/")
else:
print("未获取到视频信息,无法进行抓取。")
if __name__ == "__main__":
2025-03-24 09:25:40 +08:00
cookie = "buvid3=669D9192-9030-AE04-8149-45A24D82CBB985872infoc; b_nut=1728112285; _uuid=BDD29A64-331010-1578-7AB2-6985DCD1EC10586028infoc; enable_web_push=DISABLE; buvid4=02E86127-F707-C9D6-1E0E-62127CDB94EA86683-024100507-5HwKLZoKiRRAzdUiyP1DtUb99jVdMKHnip8nMCxMvnyDueJx41kzeR6uEnG0C2HY; DedeUserID=399209972; DedeUserID__ckMd5=9ad9de58e979dbdf; header_theme_version=CLOSE; rpdid=0zbfAHZlOT|utNc6ahG|2XT|3w1SWYYy; buvid_fp_plain=undefined; hit-dyn-v2=1; is-2022-channel=1; LIVE_BUVID=AUTO7717291772934669; PVID=2; enable_feed_channel=ENABLE; CURRENT_QUALITY=80; fingerprint=262ed395815a48ea928a2b3cf305da95; buvid_fp=262ed395815a48ea928a2b3cf305da95; bsource=search_bing; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NDI3ODE1NzgsImlhdCI6MTc0MjUyMjMxOCwicGx0IjotMX0.yseN1sbG5qDIlo5L0mUKqRr2tCL0OqPBnSnnHUYCWtE; bili_ticket_expires=1742781518; home_feed_column=5; browser_resolution=1702-986; SESSDATA=d0449397%2C1758291057%2C666e5%2A32CjDmj_WpEXUbv2oTIls3PPvM1wODDzR-gnPMRP5gwm09UIZ7YGdhTsJ7ssNg5Tb19qESVmlUMktwdlhkTnNlM0dTU05kZlBOTERNM3JqeTVSNGFaRzdwZmtNWUpjTHhPWmdxTzJiWmdDSVZxNkpEb0VGNHJFTEdaTlJtcVhBUlMzbEZJTzdCeXNnIIEC; bili_jct=e7f4029b2be38fe915678f49aa5b36f7; sid=6samqc5x; CURRENT_FNVAL=4048; bp_t_offset_399209972=1047699216786259968; b_lsid=919AF1F2_195C5ADCCC6"
2025-03-18 19:13:56 +08:00
# 批量处理targets.txt中的BV号
targets = load_targets()
if not targets:
print("未找到有效的BV号程序退出")
2025-03-23 22:36:33 +08:00
send_notification("未找到有效的BV号程序退出")
2025-03-18 19:13:56 +08:00
exit()
2025-03-23 22:32:02 +08:00
send_info_log(f"开始批量处理 {len(targets)} 个视频")
2025-03-24 07:35:33 +08:00
i=0
2025-03-18 19:13:56 +08:00
for bvid in targets:
print(f"\n{'=' * 30} 开始处理 {bvid} {'=' * 30}")
2025-03-23 22:32:02 +08:00
crawler = BiliWebCrawler(f"https://www.bilibili.com/video/{bvid}",cookie)
crawler.run()
2025-03-24 07:35:33 +08:00
i=i+1
if i%20==0:
send_info_log(f"已完成 {i} 个视频,进度:{i/len(targets)*100:.2f}%")
sleep(1)
send_info_log(f"批量处理完成!")