Compare commits
No commits in common. "ceaaed2ffa2a744831df4eafc25c57db7d7d05cb" and "3425c8e8ba3bf00a7c490601b4299a6f2a52f72e" have entirely different histories.
ceaaed2ffa...3425c8e8ba
.idea/misc.xml  (generated, 2 changed lines)
@@ -3,5 +3,5 @@
   <component name="Black">
     <option name="sdkName" value="SAM-bilibil" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="SAM-bilibil" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12" project-jdk-type="Python SDK" />
 </project>
.idea/statistics_model2025.iml  (generated, 2 changed lines)
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="SAM-bilibil" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.12" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyDocumentationSettings">
dataDeal.md  (deleted, 42 lines)
@@ -1,42 +0,0 @@

# Data processing

## Merging the data files

### 1. Merge the trending ("hot") data

- Data files
  - Videos: hot_data/分区/info
  - Uploaders: hot_data/分区/up_info
  - Danmaku / comments: hot_data/分区/BV号/...
- Per-partition processing
  - Match each video's uploader metrics by BV number and append them to info.csv
  - Read the danmaku in order and output a sentiment score; (if the row order has not changed) append it directly as a new column of info.csv (see the sketch after this list)
- Merge: iterate over the per-partition info files to build a combined file, then rename the 视频荣誉 (video honors) column to 是否热门 (is trending) and set it to 1
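The note does not say which sentiment model produces the score. A minimal sketch, assuming each video's danmaku sit in a CSV with a 弹幕内容 column (an assumption) and using SnowNLP as a stand-in scorer:

```python
import pandas as pd
from snownlp import SnowNLP  # stand-in scorer; the original model is not specified


def danmaku_sentiment(danmaku_csv: str, text_col: str = "弹幕内容") -> float:
    """Mean sentiment over one video's danmaku (0 = negative, 1 = positive)."""
    texts = pd.read_csv(danmaku_csv, encoding="utf-8")[text_col].dropna().astype(str)
    if texts.empty:
        return 0.5  # neutral fallback when a video has no danmaku
    return sum(SnowNLP(t).sentiments for t in texts) / len(texts)


# Appended in the same row order as info.csv, as the note assumes
# (情感评分 and danmaku_paths are illustrative names):
# info_df["情感评分"] = [danmaku_sentiment(p) for p in danmaku_paths]
```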
### 2. Merge the non-trending data

- Same as above, but set 是否热门 to 0

### 3. Merge the two files

- Fetch the cover image from its URL (a download sketch follows this section)
- Sort by publish time

### 4. Merge the text data

- Comment text (trending videos only): merge directly into a single column, then manually filter for high-quality text
- Tag merging (just throw everything into one txt file and run a word cloud over it)
- Description merging (same as above)
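The removed tool.py below already imports requests, PIL, and numpy without using them, but the cover step itself is not implemented there. A minimal sketch of the download, with 封面URL / 封面亮度 as assumed column names and mean grayscale brightness standing in for whatever quantization was intended:

```python
import io

import numpy as np
import requests
from PIL import Image


def cover_brightness(url: str) -> float:
    """Download a cover image and return its mean grayscale brightness (0-255)."""
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    img = Image.open(io.BytesIO(resp.content)).convert("L")
    return float(np.asarray(img).mean())


# df["封面亮度"] = df["封面URL"].map(cover_brightness)  # column names are assumptions
```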
## Data preprocessing

This stage produces two files: the original merged file and the quantized file.

### Individual handling

- Whether the video belongs to a series (keyword analysis of the title; Excel's FIND function is enough) (no = 0, yes = 1; a pandas sketch follows this list)
- Process and quantize the cover image
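The same title check can be done outside Excel. A minimal pandas sketch, where the keyword list and the 标题 / 是否系列 column names are illustrative assumptions:

```python
import pandas as pd

# Illustrative keywords; the note only says "keyword analysis of the title".
SERIES_KEYWORDS = ["第", "合集", "系列", "Part", "EP"]


def mark_series(df: pd.DataFrame, title_col: str = "标题") -> pd.DataFrame:
    """Flag titles containing any series keyword: 1 = series, 0 = standalone."""
    pattern = "|".join(SERIES_KEYWORDS)
    df["是否系列"] = df[title_col].astype(str).str.contains(pattern, case=False).astype(int)
    return df
```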
### Quantization (can all be done in Excel; a pandas sketch follows this list)

- Uploader uid: output the number of digits of the uid (fewer digits means an older account)
- Uploader gender: male 0, female 1, undisclosed 2
- Log-transform the view count, output as logV
- Publish time: split into the four slots 0-6, 6-12, 12-18, and 18-24 o'clock, coded 1-4 in that order
- Map each sub-partition to its major partition; the exact naming rules are in [this file](FSR.xlsx)
- Joint submission (联投): no 0, yes 1
- Video orientation: portrait 0, landscape 1
- Video resolution: 360, 720, 1080, 2K, 4K, 8K (approximate match), coded 1-6
- Video type: reposted 0, original 1
- Subtitles: no subtitles 0, everything else 1
- Total duration: under 60 s, 60-600 s, and over 600 s, coded 1, 2, 3 to simplify the later descriptive analysis
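Several of these rules map directly onto pandas. A minimal sketch; apart from 发布时间的timestamp (used in tool.py), all column names are assumptions, and the timestamp is assumed to be in seconds:

```python
import numpy as np
import pandas as pd

df = pd.read_csv("data_all.csv", encoding="utf-8-sig")

# View count: keep the raw column, add the log-transformed logV (log1p tolerates zero views).
df["logV"] = np.log1p(df["播放量"])

# Uploader uid: number of digits (fewer digits = older account).
df["uid位数"] = df["up主uid"].astype(str).str.len()

# Gender coding: male 0, female 1, undisclosed 2.
df["性别编码"] = df["up主性别"].map({"男": 0, "女": 1, "保密": 2})

# Publish time: four 6-hour slots coded 1-4.
hour = pd.to_datetime(df["发布时间的timestamp"], unit="s").dt.hour
df["发布时段"] = hour // 6 + 1

# Duration: <60 s, 60-600 s, >600 s coded 1-3.
df["时长档"] = pd.cut(df["时长"], bins=[-np.inf, 60, 600, np.inf], labels=[1, 2, 3])
```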
### Drop the unused metrics

- Publish time and the other metrics processed above (keep the raw view count)
- Video description and tags
- Reorder the columns (basic info first, then continuous variables, then categorical variables)
data_all.csv  (1137 lines)
File diff suppressed because it is too large and its lines are too long.
data_text_all/标题.txt  (1136 lines)
File diff suppressed because it is too large.
tool.py  (deleted, 160 lines)
@@ -1,160 +0,0 @@
import os
import pandas as pd
from tqdm import tqdm
import requests
from PIL import Image
import numpy as np


# Newly added: partition-mapping helper
def load_partition_mapping(mapping_file='FSR.xlsx'):
    """Load the sub-partition -> major-partition mapping."""
    mapping_df = pd.read_excel(mapping_file)
    return dict(zip(mapping_df['分区名称'], mapping_df['大分区']))


def merge_data():
    # Load the partition mapping
    partition_mapping = load_partition_mapping()

    # Merge the trending and non-trending data
    hot_data = merge_partition_data("hot_data", is_hot=1, partition_mapping=partition_mapping)
    nohot_data = merge_partition_data("nohot_data", is_hot=0, partition_mapping=partition_mapping)

    # Combine everything
    all_data = pd.concat([hot_data, nohot_data], ignore_index=True)

    # Sort by publish time
    all_data['发布时间的timestamp'] = pd.to_numeric(all_data['发布时间的timestamp'])
    all_data = all_data.sort_values('发布时间的timestamp')

    # Save the non-text data
    all_data.to_csv('data_all.csv', index=False, encoding='utf-8-sig')

    # Process and save the text data
    save_text_data(all_data, 'data_text_all')

    delete_comments_info()


def delete_comments_info():
    # Read ./data_all.csv
    data_all = pd.read_csv('data_all.csv', encoding='utf-8-sig')

    # Drop the comment-text column
    data_all = data_all.drop(columns=['评论内容'], errors='ignore')

    # Save the file again
    data_all.to_csv('data_all.csv', index=False, encoding='utf-8-sig')


def merge_partition_data(base_path, is_hot, partition_mapping):
    all_data = []

    # Walk every partition directory
    partitions = [d for d in os.listdir(base_path)
                  if os.path.isdir(os.path.join(base_path, d))]

    for partition in tqdm(partitions, desc=f"Processing {'hot' if is_hot else 'non-hot'} data"):
        partition_path = os.path.join(base_path, partition)

        # Read the info file
        info_file = os.path.join(partition_path, 'info.csv')
        if not os.path.exists(info_file):
            continue

        info_df = pd.read_csv(info_file, encoding='utf-8')

        # Add the major-partition mapping
        info_df['大分区'] = info_df['分区'].map(partition_mapping)

        # Read the up_info file and merge it in
        up_info_file = os.path.join(partition_path, 'up_info.csv')
        if os.path.exists(up_info_file):
            up_df = pd.read_csv(up_info_file, encoding='utf-8')
            # Drop columns that are not needed
            up_df = up_df.drop(columns=['uid', '昵称'], errors='ignore')
            # Prefix the uploader columns with "up主"
            up_df = up_df.rename(columns=lambda x: f'up主{x}' if x != 'BV号' else x)
            info_df = pd.merge(info_df, up_df, on='BV号', how='left')

        # Drop the video-honors column
        info_df = info_df.drop(columns=['视频荣誉'], errors='ignore')

        # Add the is-trending flag
        info_df['是否热门'] = is_hot

        # Load the comment data, but do not keep it in data_all.csv
        # (it is only used to build the separate comment text file)
        comments_list = []
        for bv in info_df['BV号']:
            bv_dir = os.path.join(partition_path, bv)
            if os.path.exists(bv_dir):
                # Match comment files that carry a numeric suffix
                comment_files = [f for f in os.listdir(bv_dir)
                                 if f.startswith(f"{bv}_") and f.endswith("_comments.csv")]

                if comment_files:
                    try:
                        # Read the first matching comment file
                        comment_file = os.path.join(bv_dir, comment_files[0])
                        comments = pd.read_csv(comment_file, encoding='utf-8')
                        comments_text = "\n".join(comments['评论内容'].dropna().astype(str))
                        comments_list.append(comments_text)
                    except Exception as e:
                        print(f"Failed to read comment file {comment_file}: {str(e)}")
                        comments_list.append("")
                else:
                    comments_list.append("")
            else:
                comments_list.append("")

        info_df['评论内容'] = comments_list

        all_data.append(info_df)

    return pd.concat(all_data, ignore_index=True)


def save_text_data(df, output_dir):
    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Text fields to export (DataFrame column -> output file name)
    text_fields = {
        'title': '标题',
        '标签': 'tags',
        '简介': 'description',
        '弹幕内容': 'danmaku',
        '评论内容': 'comments'
    }

    # Save each text field
    for field, filename in text_fields.items():
        if field in df.columns:
            # Special case: tags (convert the JSON-style list to plain text)
            if field == '标签':
                tags_texts = []
                for tags_json in df[field].dropna():
                    try:
                        # Parse the JSON-style tag list
                        # (eval assumes trusted input; ast.literal_eval would be safer)
                        tags = eval(tags_json) if isinstance(tags_json, str) else tags_json
                        tags_texts.extend([tag.strip("'\" ") for tag in tags])
                    except:
                        continue
                with open(f'{output_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
                    f.write('\n'.join(tags_texts))

            # Special case: comments (only keep comments from trending videos)
            elif field == '评论内容' and '是否热门' in df.columns:
                hot_comments = df[df['是否热门'] == 1][field].dropna().astype(str).tolist()
                with open(f'{output_dir}/{filename}_hot.txt', 'w', encoding='utf-8') as f:
                    f.write('\n'.join(hot_comments))

            # Plain text fields
            else:
                texts = df[field].dropna().astype(str).tolist()
                with open(f'{output_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
                    f.write('\n'.join(texts))

    # Drop the comment column from the DataFrame
    df = df.drop(columns=['评论内容'], errors='ignore')
    return df


if __name__ == '__main__':
    merge_data()