init
commit 2dec059f17
.gitignore (vendored, new file)
@@ -0,0 +1,3 @@
./venv/
.idea/
data/
LICENSE (new file)
@@ -0,0 +1,35 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction, and distribution
      as defined by Sections 1 through 9 of this document.

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [2025] [Sheyiyuan]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
main.py (new file)
@@ -0,0 +1,295 @@
import requests
import re
import csv
import os
from bs4 import BeautifulSoup
from time import sleep
from random import uniform
from fake_useragent import UserAgent
import time


def load_targets():
    """Load the list of BV ids from targets.txt."""
    try:
        with open('targets.txt', 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip().startswith('BV')]
    except FileNotFoundError:
        print("未找到targets.txt文件,请创建文件并添加BV号")
        return []


class BiliWebCrawler:
    """Crawler for a single Bilibili video: basic info, danmaku and comments, saved as CSV."""

    def __init__(self, url, cookie=None):
        self.url = url
        self.bvid = self._extract_bvid()
        self.user_agent = UserAgent()
        self.headers = {
            'User-Agent': self.user_agent.random,
            'Referer': 'https://www.bilibili.com/',
        }
        if cookie:
            self.headers['Cookie'] = cookie
        self.session = requests.Session()
        self.aid = self._get_video_id()

    def _extract_bvid(self):
        """Extract the BV id from the video URL."""
        match = re.search(r'(BV[A-Za-z0-9]+)', self.url)
        if match:
            return match.group(1)
        return None

    def _get_video_id(self):
        """Resolve the numeric aid for the current BV id."""
        if not self.bvid:
            return None
        url = f'https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}'
        resp = self._safe_request(url)
        if resp:
            data = resp.json()
            return data.get('data', {}).get('aid')
        return None

    def _safe_request(self, url, params=None, retry=3):
        """GET request with a simple retry mechanism; optional query params are passed through."""
        for i in range(retry):
            try:
                resp = self.session.get(url, headers=self.headers, timeout=10, params=params)
                resp.raise_for_status()
                return resp
            except Exception as e:
                print(f"请求失败: {str(e)}, 第{i + 1}次重试...")
                sleep(uniform(1, 3))
        return None

    def get_video_info(self):
        """Fetch basic information about the video."""
        if not self.bvid:
            return None

        # Basic video info
        url = f'https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}'
        resp = self._safe_request(url)
        if not resp:
            return None

        data = resp.json()
        video_data = data.get('data')
        if not video_data:
            return None

        # Highest resolution, based on the dimension objects
        max_width = 0
        max_height = 0
        for format_info in video_data.get('formats', []):
            dimension = format_info.get('dimension', {})
            width = dimension.get('width', 0)
            height = dimension.get('height', 0)
            rotate = dimension.get('rotate', 0)

            # Handle rotated video (width and height are swapped when rotate == 1)
            if rotate == 1:
                width, height = height, width

            # Compare resolutions by total pixel count
            if (width * height) > (max_width * max_height):
                max_width = width
                max_height = height

        # Format the resolution as a "width x height" string
        resolution_str = f"{max_width}x{max_height}" if max_width and max_height else "未知"

        # Video tags
        tag_url = f'https://api.bilibili.com/x/web-interface/view/detail/tag?bvid={self.bvid}'
        tag_resp = self._safe_request(tag_url)
        tag_data = []
        if tag_resp:
            tag_json = tag_resp.json()
            tag_data = [tag['tag_name'] for tag in tag_json.get('data', [])]

        info = {
            'title': video_data.get('title', ''),
            'up主': video_data.get('owner', {}).get('name', ''),
            '播放量': video_data.get('stat', {}).get('view', 0),
            '弹幕量': video_data.get('stat', {}).get('danmaku', 0),
            '点赞量': video_data.get('stat', {}).get('like', 0),
            '投币量': video_data.get('stat', {}).get('coin', 0),
            '收藏量': video_data.get('stat', {}).get('favorite', 0),
            '发布时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(video_data.get('pubdate', 0))),
            '分区': video_data.get('tname', ''),
            '标签': tag_data,
            '视频最高分辨率': resolution_str,
            '视频类型': video_data.get('copyright', 0),
            '视频分p数': len(video_data.get('pages', []))
        }

        return info

    def get_danmaku(self):
        """Fetch the danmaku (bullet comment) data."""
        if not self.bvid:
            print("未找到 BVID,无法获取弹幕数据。")
            return []

        url = f"https://api.bilibili.com/x/web-interface/view?bvid={self.bvid}"
        resp = self._safe_request(url)
        if not resp:
            return []

        data = resp.json()
        cid = data.get('data', {}).get('cid')
        if not cid:
            print("未找到 cid,无法获取弹幕数据。")
            return []

        danmaku_url = f'https://comment.bilibili.com/{cid}.xml'

        resp = self._safe_request(danmaku_url)
        if not resp:
            return []

        danmaku = []
        soup = BeautifulSoup(resp.content, 'lxml-xml')
        for d in soup.find_all('d'):
            attrs = d['p'].split(',')
            danmaku.append({
                '时间': float(attrs[0]),
                '模式': attrs[1],
                '颜色': f'#{int(attrs[3]):06X}',
                '弹幕内容': d.text
            })
        return danmaku

    def get_comments(self, max_pages=1000):
        """Fetch popular comments, including second-level replies."""
        if not self.aid:
            print("未找到视频 ID,无法获取评论数据。")
            return []
        comments = []
        for page in range(1, max_pages + 1):
            url = f'https://api.bilibili.com/x/v2/reply?pn={page}&type=1&oid={self.aid}&sort=2'
            try:
                response = self._safe_request(url)
                if response and response.status_code == 200:
                    response.encoding = 'utf-8'
                    data = response.json()
                    if data and data.get('data') and data['data'].get('replies'):
                        for comment in data['data']['replies']:
                            comment_info = {
                                '用户昵称': comment['member']['uname'],
                                '评论内容': comment['content']['message'],
                                '被回复用户': '',
                                '评论层级': '一级评论',
                                '性别': comment['member']['sex'],
                                '用户当前等级': comment['member']['level_info']['current_level'],
                                '点赞数量': comment['like'],
                                '回复时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(comment['ctime']))
                            }
                            comments.append(comment_info)
                            replies = self.fetch_comment_replies(comment['rpid'], comment['member']['uname'])
                            comments.extend(replies)
                    else:
                        # Stop when the current page has no comments
                        break
            except requests.RequestException as e:
                print(f"请求出错: {e}")
                break
            # Throttle requests
            sleep(1)
        return comments

    def fetch_comment_replies(self, comment_id, parent_user_name, max_pages=1000):
        """Fetch second-level replies for a given root comment."""
        replies = []
        for page in range(1, max_pages + 1):
            url = f'https://api.bilibili.com/x/v2/reply/reply?oid={self.aid}&type=1&root={comment_id}&ps=10&pn={page}'
            try:
                response = self._safe_request(url)
                if response and response.status_code == 200:
                    response.encoding = 'utf-8'
                    data = response.json()
                    if data and data.get('data') and data['data'].get('replies'):
                        for reply in data['data']['replies']:
                            reply_info = {
                                '用户昵称': reply['member']['uname'],
                                '评论内容': reply['content']['message'],
                                '被回复用户': parent_user_name,
                                '评论层级': '二级评论',
                                '性别': reply['member']['sex'],
                                '用户当前等级': reply['member']['level_info']['current_level'],
                                '点赞数量': reply['like'],
                                '回复时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(reply['ctime']))
                            }
                            replies.append(reply_info)
                    else:
                        # Stop when the current page has no more replies
                        break
            except requests.RequestException as e:
                print(f"请求二级评论出错: {e}")
                break
            # Throttle requests
            sleep(1)
        return replies

    def _parse_count(self, text):
        """Normalize a count string such as '1.2万' into an integer."""
        if '万' in text:
            return int(float(text.replace('万', '')) * 10000)
        return int(text)

    def save_to_csv(self, data, filename, mode='w'):
        """Save a list of dicts to a CSV file."""
        if not data:
            return
        keys = data[0].keys()
        with open(filename, mode, newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            if f.tell() == 0:  # write the header only for a new or empty file
                writer.writeheader()
            writer.writerows(data)

    def run(self):
        """Run the full crawl pipeline for one video."""
        print("正在获取视频基本信息...")
        video_info = self.get_video_info()
        if video_info:
            partition = video_info.get('分区', '其他')
            base_dir = os.path.join('data', partition)
            video_dir = os.path.join(base_dir, self.bvid)

            os.makedirs(base_dir, exist_ok=True)
            os.makedirs(video_dir, exist_ok=True)

            # Save the video info
            info_csv_path = os.path.join(base_dir, 'info.csv')
            self.save_to_csv([video_info], info_csv_path, mode='a')

            play_count = video_info.get('播放量', 0)
            video_info_filename = os.path.join(video_dir, f'{self.bvid}_{play_count}_info.csv')
            self.save_to_csv([video_info], video_info_filename)

            # Danmaku crawl
            print("正在抓取弹幕数据...")
            danmaku = self.get_danmaku()
            danmaku_filename = os.path.join(video_dir, f'{self.bvid}_{len(danmaku)}_danmaku.csv')
            self.save_to_csv(danmaku, danmaku_filename)

            # Comment crawl
            print("正在抓取评论数据...")
            comments = self.get_comments()
            comments_filename = os.path.join(video_dir, f'{self.bvid}_{len(comments)}_comments.csv')
            self.save_to_csv(comments, comments_filename)

            print(f"抓取完成!结果已保存到 {video_dir}/")
        else:
            print("未获取到视频信息,无法进行抓取。")


if __name__ == "__main__":
    # Process every BV id listed in targets.txt
    targets = load_targets()
    if not targets:
        print("未找到有效的BV号,程序退出")
        exit()

    for bvid in targets:
        print(f"\n{'=' * 30} 开始处理 {bvid} {'=' * 30}")
        crawler = BiliWebCrawler(f"https://www.bilibili.com/video/{bvid}")
        crawler.run()
readme.md (new file)
@@ -0,0 +1,39 @@
## License

This project is released under the Apache License 2.0, which permits:

- commercial use
- modification of the code
- patent use
- private use

See the [LICENSE](LICENSE) file for the full terms.

## Usage Guide

### Environment

The following third-party Python libraries need to be installed beforehand (the first three are listed in `require.txt`):

- requests: HTTP client for the API requests
- beautifulsoup4: parsing the danmaku XML
- fake-useragent: random User-Agent headers
- lxml: parser backend used by the `lxml-xml` parser in `main.py`

CSV output uses Python's built-in `csv` module, so no separate CSV library is required. A quick import check is sketched after this list.
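As a quick sanity check (an optional snippet, not part of `main.py`), every import below should succeed once the environment is set up:

```python
# Optional dependency check: each import matches a library used by main.py.
# If any line raises ImportError, install the corresponding package.
import csv              # standard library, used for the CSV output
import requests         # HTTP client
import bs4              # beautifulsoup4
import fake_useragent   # random User-Agent strings
import lxml             # backend required by BeautifulSoup's 'lxml-xml' parser

print("All dependencies are importable.")
```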
### Collecting Data

1. Create a text file named `targets.txt` in the project root.
2. Add the BV ids of the Bilibili videos to collect, one per line (the sample file in the repository shows the format).
3. Run the main script to start the automated crawl.
4. When the run finishes, the structured results are stored under the `data/` directory.

The crawler can also be driven from your own code, as in the sketch below.
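A minimal sketch of programmatic use, built on the `load_targets` and `BiliWebCrawler.run` entry points defined in `main.py` (the single BV id at the end is just a placeholder):

```python
from main import BiliWebCrawler, load_targets

# Crawl every BV id listed in targets.txt, exactly like running main.py directly.
for bvid in load_targets():
    BiliWebCrawler(f"https://www.bilibili.com/video/{bvid}").run()

# Or crawl a single video by URL (placeholder BV id, replace with a real one).
BiliWebCrawler("https://www.bilibili.com/video/BV1xxxxxxxxx").run()
```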
### Output Layout

Example of the directory structure produced by a crawl (file names follow the patterns used in `main.py`):

- data/ (data root)
  └─ video category
     ├─ info.csv (running summary of every video in that category)
     └─ folder named after the BV id
        ├─ <BV id>_<play count>_info.csv (video metadata)
        ├─ <BV id>_<danmaku count>_danmaku.csv (danmaku data)
        └─ <BV id>_<comment count>_comments.csv (comment data)
### Customization

To crawl videos that require a login, pass a valid site cookie when the crawler is constructed (see the sketch below). Other parameters, such as the request interval, can be adjusted directly in the source code.
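A minimal sketch of passing a cookie via the `cookie` parameter of `BiliWebCrawler.__init__`; the cookie string is a placeholder that would normally be copied from a logged-in browser session:

```python
from main import BiliWebCrawler

# Placeholder cookie string: replace the values with those from a logged-in session.
COOKIE = "SESSDATA=<your-sessdata>; bili_jct=<your-bili-jct>"

crawler = BiliWebCrawler("https://www.bilibili.com/video/BV1xxxxxxxxx", cookie=COOKIE)
crawler.run()
```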
### Acknowledgements

Thanks to the [bilibili-API](https://github.com/SocialSisterYi/bilibili-API) project for its API documentation, which made developing this project much easier.
require.txt (new file)
@@ -0,0 +1,3 @@
requests>=2.26.0
beautifulsoup4>=4.10.0
fake-useragent>=1.1.3
targets.txt (new file)
@@ -0,0 +1,4 @@
BV1a5Q3Y9EX5
BV1qQQiYvEhF
BV1PP411W7SG
BV1xr4y157BY