Compare commits


No commits in common. "ceaaed2ffa2a744831df4eafc25c57db7d7d05cb" and "3425c8e8ba3bf00a7c490601b4299a6f2a52f72e" have entirely different histories.

10 changed files with 2 additions and 80284 deletions

.idea/misc.xml (generated)

@@ -3,5 +3,5 @@
 <component name="Black">
 <option name="sdkName" value="SAM-bilibil" />
 </component>
-<component name="ProjectRootManager" version="2" project-jdk-name="SAM-bilibil" project-jdk-type="Python SDK" />
+<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12" project-jdk-type="Python SDK" />
 </project>

(project .iml module file, name not shown)

@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
 <component name="NewModuleRootManager">
 <content url="file://$MODULE_DIR$" />
-<orderEntry type="jdk" jdkName="SAM-bilibil" jdkType="Python SDK" />
+<orderEntry type="jdk" jdkName="Python 3.12" jdkType="Python SDK" />
 <orderEntry type="sourceFolder" forTests="false" />
 </component>
 <component name="PyDocumentationSettings">

FSR.xlsx (binary file not shown)


@@ -1,42 +0,0 @@
# Data Processing
## Merging the data files
### 1. Merge the hot data
- Data files
  - Videos: hot_data/partition/info
  - Uploaders: hot_data/partition/up_info
  - Danmaku/comments: hot_data/partition/BV-id/...
- Per-partition processing
  - Match each video's uploader metrics by BV号 and append them to info.csv
  - Read the danmaku in order and output sentiment scores; (if the order is unchanged) append them directly as a column to info.csv
- Merge: iterate over the per-partition info files to build a combined file, rename the 视频荣誉 (video honors) column to 是否热门 (is hot), and set it to 1
### 2. Merge the non-hot data
- Same as above, but set 是否热门 to 0
### 3. Merge the two files
- Fetch each cover from its URL (see the sketch after this list)
- Sort by publish time
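
The cover-download step is not in the tool.py shown later in this diff, although that file imports requests and PIL, which suggests something like the following existed. This is a minimal sketch, not the repository's actual code: the '封面' URL column and the covers/ output directory are assumptions.

```python
import os
from io import BytesIO

import pandas as pd
import requests
from PIL import Image

def download_covers(csv_file='data_all.csv', url_column='封面', out_dir='covers'):
    """Download each video's cover image, named by its BV号 (column names assumed)."""
    os.makedirs(out_dir, exist_ok=True)
    df = pd.read_csv(csv_file, encoding='utf-8-sig')
    for bv, url in zip(df['BV号'], df[url_column]):
        if not isinstance(url, str) or not url.startswith('http'):
            continue
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        # Decode with Pillow so corrupt downloads fail loudly, then save as JPEG
        img = Image.open(BytesIO(resp.content)).convert('RGB')
        img.save(os.path.join(out_dir, f'{bv}.jpg'))
```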
### 4. Merge the text data
- Comment text (hot videos only): merge directly into one column, then manually filter for high-quality text
- Merge the tags (dumping them into a single txt file is enough for a word cloud; see the sketch below)
- Merge the descriptions (same as above)
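
One plausible route from the merged txt files to a word cloud is jieba for segmentation plus the wordcloud package. This is a sketch under that assumption; the repo is not known to use these libraries, and the font path is hypothetical (a CJK-capable font is required for Chinese text).

```python
import jieba
from wordcloud import WordCloud

# tags.txt matches the output path of save_text_data in tool.py below
with open('data_text_all/tags.txt', encoding='utf-8') as f:
    # Word clouds need space-separated tokens, so segment the Chinese text first
    text = ' '.join(jieba.cut(f.read()))

wc = WordCloud(font_path='msyh.ttc', width=800, height=600,
               background_color='white').generate(text)
wc.to_file('tags_wordcloud.png')
```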
## Data preprocessing
Two files: the original merged file plus the quantified file
### Individual steps
- Whether the video is part of a series (keyword analysis on the title; Excel's FIND function is enough): 0 for no, 1 for yes
- Process and quantify the cover image
### Quantification (can be done in Excel; a pandas sketch follows this list)
- Uploader uid: output the number of digits in the uid (fewer digits means an older account)
- Uploader gender: male 0, female 1, undisclosed 2
- Log-transform the view count, output as logV
- Publish time: split into four slots (0-6, 6-12, 12-18, 18-24) and code them 1-4
- Map sub-partitions to major partitions; see [the mapping file](FSR.xlsx) for the naming rules
- Whether it is a joint submission: 0 for no, 1 for yes
- Video orientation: portrait 0, landscape 1
- Video resolution: 360, 720, 1080, 2K, 4K, 8K (approximate matching, coded 1-6)
- Video type: repost 0, original 1
- Subtitles: 0 for none, 1 otherwise
- Total duration: code videos under 60 s as 1, 60-600 s as 2, and over 600 s as 3 to simplify later descriptive analysis
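
These rules also map cleanly onto pandas. A minimal sketch of a few of them follows; only the '发布时间的timestamp' column is confirmed by tool.py, while the other column names ('up主uid', 'up主性别', '播放量', '总时长') and the raw gender labels are assumptions.

```python
import numpy as np
import pandas as pd

df = pd.read_csv('data_all.csv', encoding='utf-8-sig')

# uid digit count: fewer digits means an older account
df['uid位数'] = df['up主uid'].astype(str).str.len()

# Gender: male 0, female 1, undisclosed 2 (raw labels assumed)
df['up主性别'] = df['up主性别'].map({'男': 0, '女': 1, '保密': 2})

# Log-transform the view count (log1p avoids log(0) for zero-view videos)
df['logV'] = np.log1p(df['播放量'])

# Publish hour -> four 6-hour slots coded 1-4
hours = pd.to_datetime(df['发布时间的timestamp'], unit='s').dt.hour
df['发布时段'] = hours // 6 + 1

# Duration buckets: <=60 s -> 1, 60-600 s -> 2, >600 s -> 3
df['时长档'] = pd.cut(df['总时长'], bins=[0, 60, 600, float('inf')],
                      labels=[1, 2, 3])
```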
### Drop unused metrics
- Publish time and the other metrics processed above (keep the original view count)
- Video description and tags
- Reorder the columns (basic info first, then continuous variables, then categorical variables)

5 file diffs suppressed because they are too large or one or more lines are too long.

tool.py

@@ -1,160 +0,0 @@
import os
import ast

import pandas as pd
from tqdm import tqdm
import requests
from PIL import Image
import numpy as np


def load_partition_mapping(mapping_file='FSR.xlsx'):
    """Load the sub-partition -> major-partition mapping."""
    mapping_df = pd.read_excel(mapping_file)
    return dict(zip(mapping_df['分区名称'], mapping_df['大分区']))


def merge_data():
    # Load the partition mapping
    partition_mapping = load_partition_mapping()
    # Merge the hot and non-hot data
    hot_data = merge_partition_data("hot_data", is_hot=1, partition_mapping=partition_mapping)
    nohot_data = merge_partition_data("nohot_data", is_hot=0, partition_mapping=partition_mapping)
    # Combine everything
    all_data = pd.concat([hot_data, nohot_data], ignore_index=True)
    # Sort by publish time
    all_data['发布时间的timestamp'] = pd.to_numeric(all_data['发布时间的timestamp'])
    all_data = all_data.sort_values('发布时间的timestamp')
    # Save the non-text data
    all_data.to_csv('data_all.csv', index=False, encoding='utf-8-sig')
    # Extract and save the text data
    save_text_data(all_data, 'data_text_all')
    delete_comments_info()


def delete_comments_info():
    # Read ./data_all.csv
    data_all = pd.read_csv('data_all.csv', encoding='utf-8-sig')
    # Drop the comment-text column
    data_all = data_all.drop(columns=['评论内容'], errors='ignore')
    # Save the file
    data_all.to_csv('data_all.csv', index=False, encoding='utf-8-sig')


def merge_partition_data(base_path, is_hot, partition_mapping):
    all_data = []
    # Iterate over all partitions
    partitions = [d for d in os.listdir(base_path)
                  if os.path.isdir(os.path.join(base_path, d))]
    for partition in tqdm(partitions, desc=f"Processing {'hot' if is_hot else 'non-hot'} data"):
        partition_path = os.path.join(base_path, partition)
        # Read the info file
        info_file = os.path.join(partition_path, 'info.csv')
        if not os.path.exists(info_file):
            continue
        info_df = pd.read_csv(info_file, encoding='utf-8')
        # Add the major-partition mapping
        info_df['大分区'] = info_df['分区'].map(partition_mapping)
        # Read the up_info file and merge it in
        up_info_file = os.path.join(partition_path, 'up_info.csv')
        if os.path.exists(up_info_file):
            up_df = pd.read_csv(up_info_file, encoding='utf-8')
            # Drop the columns we do not need
            up_df = up_df.drop(columns=['uid', '昵称'], errors='ignore')
            # Prefix the uploader columns with up主
            up_df = up_df.rename(columns=lambda x: f'up主{x}' if x != 'BV号' else x)
            info_df = pd.merge(info_df, up_df, on='BV号', how='left')
        # Drop the video-honors column
        info_df = info_df.drop(columns=['视频荣誉'], errors='ignore')
        # Flag hot vs. non-hot
        info_df['是否热门'] = is_hot
        # Load the comment data, but keep it out of data_all.csv
        # (it is only used to build the separate comment files)
        comments_list = []
        for bv in info_df['BV号']:
            bv_dir = os.path.join(partition_path, bv)
            if os.path.exists(bv_dir):
                # Match comment files that carry a numeric suffix
                comment_files = [f for f in os.listdir(bv_dir)
                                 if f.startswith(f"{bv}_") and f.endswith("_comments.csv")]
                if comment_files:
                    comment_file = os.path.join(bv_dir, comment_files[0])
                    try:
                        # Read the first matching comment file
                        comments = pd.read_csv(comment_file, encoding='utf-8')
                        comments_text = "\n".join(comments['评论内容'].dropna().astype(str))
                        comments_list.append(comments_text)
                    except Exception as e:
                        print(f"Failed to read comment file {comment_file}: {e}")
                        comments_list.append("")
                else:
                    comments_list.append("")
            else:
                comments_list.append("")
        info_df['评论内容'] = comments_list
        all_data.append(info_df)
    return pd.concat(all_data, ignore_index=True)


def save_text_data(df, output_dir):
    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)
    # Text fields to export (column name -> output filename)
    text_fields = {
        '标题': 'title',
        '标签': 'tags',
        '简介': 'description',
        '弹幕内容': 'danmaku',
        '评论内容': 'comments'
    }
    # Save each text field
    for field, filename in text_fields.items():
        if field not in df.columns:
            continue
        # Special case: tags are stored as list literals, flatten them to plain text
        if field == '标签':
            tags_texts = []
            for tags_json in df[field].dropna():
                try:
                    # Parse the list literal safely instead of using eval()
                    tags = ast.literal_eval(tags_json) if isinstance(tags_json, str) else tags_json
                    tags_texts.extend([tag.strip("'\" ") for tag in tags])
                except (ValueError, SyntaxError):
                    continue
            with open(f'{output_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
                f.write('\n'.join(tags_texts))
        # Special case: comments are only kept for hot videos
        elif field == '评论内容' and '是否热门' in df.columns:
            hot_comments = df[df['是否热门'] == 1][field].dropna().astype(str).tolist()
            with open(f'{output_dir}/{filename}_hot.txt', 'w', encoding='utf-8') as f:
                f.write('\n'.join(hot_comments))
        # Plain text fields
        else:
            texts = df[field].dropna().astype(str).tolist()
            with open(f'{output_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
                f.write('\n'.join(texts))
    # Drop the comment column from the dataframe
    df = df.drop(columns=['评论内容'], errors='ignore')
    return df


if __name__ == '__main__':
    merge_data()
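
Step 1 of the deleted README calls for a sentiment score per video's danmaku, which tool.py does not compute. A minimal sketch of that step follows, assuming the SnowNLP library and a newline-joined '弹幕内容' column (the same column save_text_data exports); neither the library choice nor the '弹幕情感' output column is confirmed by the repo.

```python
import pandas as pd
from snownlp import SnowNLP

def danmaku_sentiment(text):
    """Mean SnowNLP sentiment (0 = negative, 1 = positive) over danmaku lines."""
    lines = [s for s in str(text).split('\n') if s.strip()]
    if not lines:
        return None
    return sum(SnowNLP(s).sentiments for s in lines) / len(lines)

df = pd.read_csv('data_all.csv', encoding='utf-8-sig')
df['弹幕情感'] = df['弹幕内容'].apply(danmaku_sentiment)
df.to_csv('data_all.csv', index=False, encoding='utf-8-sig')
```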