From 2dec059f178dc82959679a8a23dd4ae56891f081 Mon Sep 17 00:00:00 2001 From: Sheyiyuan <2125107118@qq.com> Date: Tue, 18 Mar 2025 19:13:56 +0800 Subject: [PATCH] init --- .gitignore | 3 + LICENSE | 35 +++++++ main.py | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++ readme.md | 39 +++++++ require.txt | 3 + targets.txt | 4 + 6 files changed, 379 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 main.py create mode 100644 readme.md create mode 100644 require.txt create mode 100644 targets.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cd919e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +./venv/ +.idea/ +data/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0444cc7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,35 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution +as defined by Sections 1 through 9 of this document. + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2025] [Sheyiyuan] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
"""Bilibili video crawler.

Reads BV ids from ``targets.txt`` and, for each video, saves basic info,
danmaku and comments as CSV files under ``data/<分区>/<bvid>/``.
"""
import csv
import os
import re
import time
import xml.etree.ElementTree as ET
from random import uniform
from time import sleep

import requests
from fake_useragent import UserAgent


def load_targets():
    """Return the BV ids listed in targets.txt (one per line, must start with 'BV')."""
    try:
        with open('targets.txt', 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip().startswith('BV')]
    except FileNotFoundError:
        print("未找到targets.txt文件,请创建文件并添加BV号")
        return []


class BiliWebCrawler:
    """Crawler for a single Bilibili video: info, danmaku and comments."""

    def __init__(self, url, cookie=None):
        """
        Args:
            url: video page URL containing a BV id.
            cookie: optional Cookie header value for login-restricted videos.
        """
        self.url = url
        self.bvid = self._extract_bvid()
        self.user_agent = UserAgent()
        self.headers = {
            'User-Agent': self.user_agent.random,
            'Referer': 'https://www.bilibili.com/',
        }
        if cookie:
            self.headers['Cookie'] = cookie
        self.session = requests.Session()
        self.aid = self._get_video_id()

    def _extract_bvid(self):
        """Extract the BV id from the URL, or None when absent."""
        match = re.search(r'(BV[A-Za-z0-9]+)', self.url)
        return match.group(1) if match else None

    def _get_video_id(self):
        """Resolve the numeric aid for the current bvid (None on failure)."""
        if not self.bvid:
            return None
        url = f'https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}'
        resp = self._safe_request(url)
        if resp:
            # `data` can be null on an API error payload; guard before .get()
            return (resp.json().get('data') or {}).get('aid')
        return None

    def _safe_request(self, url, params=None, retry=3):
        """GET `url` with up to `retry` attempts; return the Response or None."""
        for attempt in range(retry):
            try:
                resp = self.session.get(url, headers=self.headers,
                                        timeout=10, params=params)
                resp.raise_for_status()
                return resp
            # narrower than bare Exception: only network/HTTP errors retried
            except requests.RequestException as e:
                print(f"请求失败: {str(e)}, 第{attempt + 1}次重试...")
                sleep(uniform(1, 3))  # jittered back-off to stay polite
        return None

    def _max_resolution(self, video_data):
        """Best 'WxH' string among available dimension records, '未知' when unknown."""
        dims = [fmt.get('dimension', {}) for fmt in video_data.get('formats') or []]
        if not dims:
            # NOTE(review): the view API usually exposes a single top-level
            # `dimension` object rather than `formats` — fall back to it.
            dims = [video_data.get('dimension') or {}]
        best_w = best_h = 0
        for dim in dims:
            width = dim.get('width', 0)
            height = dim.get('height', 0)
            # rotate == 1 means the stream is stored rotated: swap axes
            if dim.get('rotate', 0) == 1:
                width, height = height, width
            # compare by pixel count
            if width * height > best_w * best_h:
                best_w, best_h = width, height
        return f"{best_w}x{best_h}" if best_w and best_h else "未知"

    def _fetch_tags(self):
        """Return the video's tag names (empty list on any failure)."""
        url = f'https://api.bilibili.com/x/web-interface/view/detail/tag?bvid={self.bvid}'
        resp = self._safe_request(url)
        if not resp:
            return []
        return [tag['tag_name'] for tag in resp.json().get('data') or []]

    def get_video_info(self):
        """Return a dict of basic video stats and metadata, or None on failure."""
        if not self.bvid:
            return None

        url = f'https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}'
        resp = self._safe_request(url)
        if not resp:
            return None

        video_data = resp.json().get('data')
        if not video_data:
            return None

        stat = video_data.get('stat', {})
        info = {
            'title': video_data.get('title', ''),
            'up主': video_data.get('owner', {}).get('name', ''),
            '播放量': stat.get('view', 0),
            '弹幕量': stat.get('danmaku', 0),
            '点赞量': stat.get('like', 0),
            '投币量': stat.get('coin', 0),
            '收藏量': stat.get('favorite', 0),
            '发布时间': time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(video_data.get('pubdate', 0))),
            '分区': video_data.get('tname', ''),
            '标签': self._fetch_tags(),
            '视频最高分辨率': self._max_resolution(video_data),
            '视频类型': video_data.get('copyright', 0),
            '视频分p数': len(video_data.get('pages', [])),
        }
        return info

    def get_danmaku(self):
        """Return the video's danmaku as a list of dicts (empty on failure)."""
        if not self.bvid:
            print("未找到 BVID,无法获取弹幕数据。")
            return []

        url = f"https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}"
        resp = self._safe_request(url)
        if not resp:
            return []

        cid = (resp.json().get('data') or {}).get('cid')
        if not cid:
            print("未找到 cid,无法获取弹幕数据。")
            return []

        resp = self._safe_request(f'https://comment.bilibili.com/{cid}.xml')
        if not resp:
            return []

        # stdlib XML parser: drops the undeclared lxml dependency that
        # BeautifulSoup('lxml-xml') required
        try:
            root = ET.fromstring(resp.content)
        except ET.ParseError as e:
            print(f"请求失败: {str(e)}, 第1次重试...")
            return []

        danmaku = []
        for d in root.iter('d'):
            attrs = (d.get('p') or '').split(',')
            if len(attrs) < 4:  # skip malformed entries instead of crashing
                continue
            danmaku.append({
                '时间': float(attrs[0]),
                '模式': attrs[1],
                '颜色': f'#{int(attrs[3]):06X}',  # decimal colour -> #RRGGBB
                '弹幕内容': d.text or '',
            })
        return danmaku

    @staticmethod
    def _format_reply(reply, parent_user='', level='一级评论'):
        """Map one raw reply object to the flat CSV row format."""
        member = reply.get('member') or {}
        return {
            '用户昵称': member.get('uname', ''),
            '评论内容': (reply.get('content') or {}).get('message', ''),
            '被回复用户': parent_user,
            '评论层级': level,
            '性别': member.get('sex', ''),
            '用户当前等级': (member.get('level_info') or {}).get('current_level', 0),
            '点赞数量': reply.get('like', 0),
            '回复时间': time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(reply.get('ctime', 0))),
        }

    def get_comments(self, max_pages=1000):
        """Return hot comments (sort=2) including second-level replies."""
        if not self.aid:
            print("未找到视频 ID,无法获取评论数据。")
            return []

        comments = []
        for page in range(1, max_pages + 1):
            url = f'https://api.bilibili.com/x/v2/reply?pn={page}&type=1&oid={self.aid}&sort=2'
            response = self._safe_request(url)
            # stop paging on failure instead of hammering the remaining pages
            if not (response and response.status_code == 200):
                break
            response.encoding = 'utf-8'
            replies = (response.json().get('data') or {}).get('replies')
            if not replies:  # empty page -> no more comments
                break
            for comment in replies:
                comments.append(self._format_reply(comment))
                comments.extend(self.fetch_comment_replies(
                    comment.get('rpid'),
                    (comment.get('member') or {}).get('uname', '')))
            sleep(1)  # request throttle between pages
        return comments

    def fetch_comment_replies(self, comment_id, parent_user_name, max_pages=1000):
        """Return all second-level replies under the root comment `comment_id`."""
        replies = []
        for page in range(1, max_pages + 1):
            url = (f'https://api.bilibili.com/x/v2/reply/reply?oid={self.aid}'
                   f'&type=1&root={comment_id}&ps=10&pn={page}')
            response = self._safe_request(url)
            if not (response and response.status_code == 200):
                break
            response.encoding = 'utf-8'
            page_replies = (response.json().get('data') or {}).get('replies')
            if not page_replies:  # empty page -> no more second-level replies
                break
            for reply in page_replies:
                replies.append(self._format_reply(reply, parent_user_name, '二级评论'))
            sleep(1)  # request throttle between pages
        return replies

    def _parse_count(self, text):
        """Normalise count strings like '1.2万' or '3亿' to an int."""
        if '亿' in text:
            return int(float(text.replace('亿', '')) * 100000000)
        if '万' in text:
            return int(float(text.replace('万', '')) * 10000)
        return int(text)

    def save_to_csv(self, data, filename, mode='w'):
        """Write a list of dicts to CSV; the header row is added only when the
        target file is new or empty (so append mode accumulates rows)."""
        if not data:
            return
        keys = data[0].keys()
        # file-size check is reliable; f.tell() right after an append-mode
        # open is platform-dependent
        need_header = (mode != 'a' or not os.path.exists(filename)
                       or os.path.getsize(filename) == 0)
        with open(filename, mode, newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            if need_header:
                writer.writeheader()
            writer.writerows(data)

    def run(self):
        """Full pipeline: info -> danmaku -> comments, saved under data/."""
        print("正在获取视频基本信息...")
        video_info = self.get_video_info()
        if not video_info:
            print("未获取到视频信息,无法进行抓取。")
            return

        partition = video_info.get('分区', '其他')
        base_dir = os.path.join('data', partition)
        video_dir = os.path.join(base_dir, self.bvid)
        os.makedirs(video_dir, exist_ok=True)  # also creates base_dir

        # per-partition aggregate sheet plus a per-video snapshot
        self.save_to_csv([video_info], os.path.join(base_dir, 'info.csv'), mode='a')
        play_count = video_info.get('播放量', 0)
        self.save_to_csv([video_info],
                         os.path.join(video_dir, f'{self.bvid}_{play_count}_info.csv'))

        print("正在抓取弹幕数据...")
        danmaku = self.get_danmaku()
        self.save_to_csv(danmaku,
                         os.path.join(video_dir, f'{self.bvid}_{len(danmaku)}_danmaku.csv'))

        print("正在抓取评论数据...")
        comments = self.get_comments()
        self.save_to_csv(comments,
                         os.path.join(video_dir, f'{self.bvid}_{len(comments)}_comments.csv'))

        print(f"抓取完成!结果已保存到 {video_dir}/")


if __name__ == "__main__":
    # batch-process every BV id listed in targets.txt
    targets = load_targets()
    if not targets:
        print("未找到有效的BV号,程序退出")
        raise SystemExit(1)  # non-zero exit on the error path (exit() returned 0)

    for bvid in targets:
        print(f"\n{'=' * 30} 开始处理 {bvid} {'=' * 30}")
        BiliWebCrawler(f"https://www.bilibili.com/video/{bvid}").run()
程序运行完成后,在data目录下查看结构化存储结果 + +### 结果查看 +采集完成后的数据存储路径结构示例: +- 数据根目录 + └─ 视频分区类别 + ├─ 全分区视频信息汇总表 + └─ 视频BV号专属文件夹 + ├─ 含播放量的视频元数据文件 + ├─ 弹幕数据文件 + └─ 评论数据文件 + +### 个性化设置 +如需采集需要登录才能访问的视频内容,可在程序初始化时传入有效的网站身份凭证参数。采集间隔时间等参数可直接在源代码中调整相关配置项。 + +### 致谢 +感谢[bilibili-API-collect](https://github.com/SocialSisterYi/bilibili-API-collect)项目提供的API接口文档,使得本项目的开发更加简单。 \ No newline at end of file diff --git a/require.txt b/require.txt new file mode 100644 index 0000000..603e000 --- /dev/null +++ b/require.txt @@ -0,0 +1,3 @@ +requests>=2.26.0 +beautifulsoup4>=4.10.0 +fake-useragent>=1.1.3 diff --git a/targets.txt b/targets.txt new file mode 100644 index 0000000..451a8cd --- /dev/null +++ b/targets.txt @@ -0,0 +1,4 @@ +BV1a5Q3Y9EX5 +BV1qQQiYvEhF +BV1PP411W7SG +BV1xr4y157BY \ No newline at end of file