From 2dec059f178dc82959679a8a23dd4ae56891f081 Mon Sep 17 00:00:00 2001 From: Sheyiyuan <2125107118@qq.com> Date: Tue, 18 Mar 2025 19:13:56 +0800 Subject: [PATCH] init --- .gitignore | 3 + LICENSE | 35 +++++++ main.py | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++ readme.md | 39 +++++++ require.txt | 3 + targets.txt | 4 + 6 files changed, 379 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 main.py create mode 100644 readme.md create mode 100644 require.txt create mode 100644 targets.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cd919e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +./venv/ +.idea/ +data/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0444cc7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,35 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution +as defined by Sections 1 through 9 of this document. + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2025] [Sheyiyuan] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
"""Bilibili video crawler.

Reads BV ids from ``targets.txt`` and, for each video, saves basic info,
danmaku and comments as CSV files under ``data/<分区>/<bvid>/``.
"""
import csv
import os
import re
import time
import xml.etree.ElementTree as ET
from random import uniform
from time import sleep

import requests
from fake_useragent import UserAgent


def load_targets():
    """Return the BV ids listed in targets.txt (one per line, must start with 'BV')."""
    try:
        with open('targets.txt', 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip().startswith('BV')]
    except FileNotFoundError:
        print("未找到targets.txt文件,请创建文件并添加BV号")
        return []


class BiliWebCrawler:
    """Crawler for a single Bilibili video: info, danmaku and comments."""

    def __init__(self, url, cookie=None):
        """
        Args:
            url: video page URL containing a BV id.
            cookie: optional Cookie header value for login-restricted videos.
        """
        self.url = url
        self.bvid = self._extract_bvid()
        self.user_agent = UserAgent()
        self.headers = {
            'User-Agent': self.user_agent.random,
            'Referer': 'https://www.bilibili.com/',
        }
        if cookie:
            self.headers['Cookie'] = cookie
        self.session = requests.Session()
        self.aid = self._get_video_id()

    def _extract_bvid(self):
        """Extract the BV id from the URL, or None when absent."""
        match = re.search(r'(BV[A-Za-z0-9]+)', self.url)
        return match.group(1) if match else None

    def _get_video_id(self):
        """Resolve the numeric aid for the current bvid (None on failure)."""
        if not self.bvid:
            return None
        url = f'https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}'
        resp = self._safe_request(url)
        if resp:
            # `data` can be null on an API error payload; guard before .get()
            return (resp.json().get('data') or {}).get('aid')
        return None

    def _safe_request(self, url, params=None, retry=3):
        """GET `url` with up to `retry` attempts; return the Response or None."""
        for attempt in range(retry):
            try:
                resp = self.session.get(url, headers=self.headers,
                                        timeout=10, params=params)
                resp.raise_for_status()
                return resp
            # narrower than bare Exception: only network/HTTP errors retried
            except requests.RequestException as e:
                print(f"请求失败: {str(e)}, 第{attempt + 1}次重试...")
                sleep(uniform(1, 3))  # jittered back-off to stay polite
        return None

    def _max_resolution(self, video_data):
        """Best 'WxH' string among available dimension records, '未知' when unknown."""
        dims = [fmt.get('dimension', {}) for fmt in video_data.get('formats') or []]
        if not dims:
            # NOTE(review): the view API usually exposes a single top-level
            # `dimension` object rather than `formats` — fall back to it.
            dims = [video_data.get('dimension') or {}]
        best_w = best_h = 0
        for dim in dims:
            width = dim.get('width', 0)
            height = dim.get('height', 0)
            # rotate == 1 means the stream is stored rotated: swap axes
            if dim.get('rotate', 0) == 1:
                width, height = height, width
            # compare by pixel count
            if width * height > best_w * best_h:
                best_w, best_h = width, height
        return f"{best_w}x{best_h}" if best_w and best_h else "未知"

    def _fetch_tags(self):
        """Return the video's tag names (empty list on any failure)."""
        url = f'https://api.bilibili.com/x/web-interface/view/detail/tag?bvid={self.bvid}'
        resp = self._safe_request(url)
        if not resp:
            return []
        return [tag['tag_name'] for tag in resp.json().get('data') or []]

    def get_video_info(self):
        """Return a dict of basic video stats and metadata, or None on failure."""
        if not self.bvid:
            return None

        url = f'https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}'
        resp = self._safe_request(url)
        if not resp:
            return None

        video_data = resp.json().get('data')
        if not video_data:
            return None

        stat = video_data.get('stat', {})
        info = {
            'title': video_data.get('title', ''),
            'up主': video_data.get('owner', {}).get('name', ''),
            '播放量': stat.get('view', 0),
            '弹幕量': stat.get('danmaku', 0),
            '点赞量': stat.get('like', 0),
            '投币量': stat.get('coin', 0),
            '收藏量': stat.get('favorite', 0),
            '发布时间': time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(video_data.get('pubdate', 0))),
            '分区': video_data.get('tname', ''),
            '标签': self._fetch_tags(),
            '视频最高分辨率': self._max_resolution(video_data),
            '视频类型': video_data.get('copyright', 0),
            '视频分p数': len(video_data.get('pages', [])),
        }
        return info

    def get_danmaku(self):
        """Return the video's danmaku as a list of dicts (empty on failure)."""
        if not self.bvid:
            print("未找到 BVID,无法获取弹幕数据。")
            return []

        url = f"https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}"
        resp = self._safe_request(url)
        if not resp:
            return []

        cid = (resp.json().get('data') or {}).get('cid')
        if not cid:
            print("未找到 cid,无法获取弹幕数据。")
            return []

        resp = self._safe_request(f'https://comment.bilibili.com/{cid}.xml')
        if not resp:
            return []

        # stdlib XML parser: drops the undeclared lxml dependency that
        # BeautifulSoup('lxml-xml') required
        try:
            root = ET.fromstring(resp.content)
        except ET.ParseError as e:
            print(f"请求失败: {str(e)}, 第1次重试...")
            return []

        danmaku = []
        for d in root.iter('d'):
            attrs = (d.get('p') or '').split(',')
            if len(attrs) < 4:  # skip malformed entries instead of crashing
                continue
            danmaku.append({
                '时间': float(attrs[0]),
                '模式': attrs[1],
                '颜色': f'#{int(attrs[3]):06X}',  # decimal colour -> #RRGGBB
                '弹幕内容': d.text or '',
            })
        return danmaku

    @staticmethod
    def _format_reply(reply, parent_user='', level='一级评论'):
        """Map one raw reply object to the flat CSV row format."""
        member = reply.get('member') or {}
        return {
            '用户昵称': member.get('uname', ''),
            '评论内容': (reply.get('content') or {}).get('message', ''),
            '被回复用户': parent_user,
            '评论层级': level,
            '性别': member.get('sex', ''),
            '用户当前等级': (member.get('level_info') or {}).get('current_level', 0),
            '点赞数量': reply.get('like', 0),
            '回复时间': time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(reply.get('ctime', 0))),
        }

    def get_comments(self, max_pages=1000):
        """Return hot comments (sort=2) including second-level replies."""
        if not self.aid:
            print("未找到视频 ID,无法获取评论数据。")
            return []

        comments = []
        for page in range(1, max_pages + 1):
            url = f'https://api.bilibili.com/x/v2/reply?pn={page}&type=1&oid={self.aid}&sort=2'
            response = self._safe_request(url)
            # stop paging on failure instead of hammering the remaining pages
            if not (response and response.status_code == 200):
                break
            response.encoding = 'utf-8'
            replies = (response.json().get('data') or {}).get('replies')
            if not replies:  # empty page -> no more comments
                break
            for comment in replies:
                comments.append(self._format_reply(comment))
                comments.extend(self.fetch_comment_replies(
                    comment.get('rpid'),
                    (comment.get('member') or {}).get('uname', '')))
            sleep(1)  # request throttle between pages
        return comments

    def fetch_comment_replies(self, comment_id, parent_user_name, max_pages=1000):
        """Return all second-level replies under the root comment `comment_id`."""
        replies = []
        for page in range(1, max_pages + 1):
            url = (f'https://api.bilibili.com/x/v2/reply/reply?oid={self.aid}'
                   f'&type=1&root={comment_id}&ps=10&pn={page}')
            response = self._safe_request(url)
            if not (response and response.status_code == 200):
                break
            response.encoding = 'utf-8'
            page_replies = (response.json().get('data') or {}).get('replies')
            if not page_replies:  # empty page -> no more second-level replies
                break
            for reply in page_replies:
                replies.append(self._format_reply(reply, parent_user_name, '二级评论'))
            sleep(1)  # request throttle between pages
        return replies

    def _parse_count(self, text):
        """Normalise count strings like '1.2万' or '3亿' to an int."""
        if '亿' in text:
            return int(float(text.replace('亿', '')) * 100000000)
        if '万' in text:
            return int(float(text.replace('万', '')) * 10000)
        return int(text)

    def save_to_csv(self, data, filename, mode='w'):
        """Write a list of dicts to CSV; the header row is added only when the
        target file is new or empty (so append mode accumulates rows)."""
        if not data:
            return
        keys = data[0].keys()
        # file-size check is reliable; f.tell() right after an append-mode
        # open is platform-dependent
        need_header = (mode != 'a' or not os.path.exists(filename)
                       or os.path.getsize(filename) == 0)
        with open(filename, mode, newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            if need_header:
                writer.writeheader()
            writer.writerows(data)

    def run(self):
        """Full pipeline: info -> danmaku -> comments, saved under data/."""
        print("正在获取视频基本信息...")
        video_info = self.get_video_info()
        if not video_info:
            print("未获取到视频信息,无法进行抓取。")
            return

        partition = video_info.get('分区', '其他')
        base_dir = os.path.join('data', partition)
        video_dir = os.path.join(base_dir, self.bvid)
        os.makedirs(video_dir, exist_ok=True)  # also creates base_dir

        # per-partition aggregate sheet plus a per-video snapshot
        self.save_to_csv([video_info], os.path.join(base_dir, 'info.csv'), mode='a')
        play_count = video_info.get('播放量', 0)
        self.save_to_csv([video_info],
                         os.path.join(video_dir, f'{self.bvid}_{play_count}_info.csv'))

        print("正在抓取弹幕数据...")
        danmaku = self.get_danmaku()
        self.save_to_csv(danmaku,
                         os.path.join(video_dir, f'{self.bvid}_{len(danmaku)}_danmaku.csv'))

        print("正在抓取评论数据...")
        comments = self.get_comments()
        self.save_to_csv(comments,
                         os.path.join(video_dir, f'{self.bvid}_{len(comments)}_comments.csv'))

        print(f"抓取完成!结果已保存到 {video_dir}/")


if __name__ == "__main__":
    # batch-process every BV id listed in targets.txt
    targets = load_targets()
    if not targets:
        print("未找到有效的BV号,程序退出")
        raise SystemExit(1)  # non-zero exit on the error path (exit() returned 0)

    for bvid in targets:
        print(f"\n{'=' * 30} 开始处理 {bvid} {'=' * 30}")
        BiliWebCrawler(f"https://www.bilibili.com/video/{bvid}").run()
程序运行完成后,在data目录下查看结构化存储结果 + +### 结果查看 +采集完成后的数据存储路径结构示例: +- 数据根目录 + └─ 视频分区类别 + ├─ 全分区视频信息汇总表 + └─ 视频BV号专属文件夹 + ├─ 含播放量的视频元数据文件 + ├─ 弹幕数据文件 + └─ 评论数据文件 + +### 个性化设置 +如需采集需要登录才能访问的视频内容,可在程序初始化时传入有效的网站身份凭证参数。采集间隔时间等参数可直接在源代码中调整相关配置项。 + +### 致谢 +感谢[bilibili-API-collect](https://github.com/SocialSisterYi/bilibili-API-collect)项目提供的API接口文档,使得本项目的开发更加简单。 \ No newline at end of file diff --git a/require.txt b/require.txt new file mode 100644 index 0000000..603e000 --- /dev/null +++ b/require.txt @@ -0,0 +1,3 @@ +requests>=2.26.0 +beautifulsoup4>=4.10.0 +fake-useragent>=1.1.3 diff --git a/targets.txt b/targets.txt new file mode 100644 index 0000000..451a8cd --- /dev/null +++ b/targets.txt @@ -0,0 +1,4 @@ +BV1a5Q3Y9EX5 +BV1qQQiYvEhF +BV1PP411W7SG +BV1xr4y157BY \ No newline at end of file