add:data_merge_tool
This commit is contained in:
parent
fe45cddb8c
commit
6a7d9e48a5
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
@ -3,5 +3,5 @@
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="SAM-bilibil" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="SAM-bilibil" project-jdk-type="Python SDK" />
|
||||
</project>
|
2
.idea/statistics_model2025.iml
generated
2
.idea/statistics_model2025.iml
generated
@ -2,7 +2,7 @@
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.12" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="SAM-bilibil" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
|
42
dataDeal.md
Normal file
42
dataDeal.md
Normal file
@ -0,0 +1,42 @@
|
||||
# 数据处理
|
||||
|
||||
## 合并数据文件
|
||||
### 1. 合并热门数据
|
||||
- 数据文件
|
||||
- 视频: hot_data/分区/info
|
||||
- up: hot_data/分区/up_info
|
||||
- 弹幕/评论: hot_data/分区/BV号/...
|
||||
- 单分区处理
|
||||
- 按bv号匹配视频对应up指标,添加到info.csv
|
||||
- 依序读取弹幕输出情感评分,(如果顺序没变的话) 直接添加一列到info.csv
|
||||
- 合并: 遍历分区info文件创建总文件,并给“视频荣誉”改名成“是否热门”并赋值为1
|
||||
### 2. 合并非热数据
|
||||
- 同上,并赋值为0
|
||||
### 3. 合并两个文件
|
||||
- 根据URL获取封面
|
||||
- 按发布时间排序
|
||||
### 4. 文本数据合并
|
||||
- 评论文本 (仅热门): 直接合并成列,人工筛选高质量文本
|
||||
- 标签合并 (放一起整一个txt即可,拉个词云了事)
|
||||
- 简介合并 (同上)
|
||||
## 数据预处理
|
||||
保留两个文件:原合并文件和量化后的文件
|
||||
### 单独处理
|
||||
- 是否为系列 (标题关键词分析)(excel使用find函数即可)(否0是1)
|
||||
- 封面处理并量化
|
||||
### 数据量化(可以用excel实现)
|
||||
- up主uid:输出uid位数(越小表示号越老)
|
||||
- up主性别: 男0,女1,保密2
|
||||
- 播放量对数转换, 输出logV
|
||||
- 发布时间: 区分为0-6点,6-12点,12-18点,18-24点四个时段,依次赋值为1-4
|
||||
- 小分区映射到大分区,具体命名规则见[文件](FSR.xlsx)
|
||||
- 是否为联投: 否0,是1
|
||||
- 视频方向:竖屏0,横屏1
|
||||
- 视频分辨率: 360、720、1080、2k、4k、8k(近似匹配),赋值为1-6
|
||||
- 视频类型:搬运0,自制1
|
||||
- 字幕: 无字幕为0,剩下为1
|
||||
- 视频总时长:分为小于60、60到600之间、大于600三档,依次赋值为1,2,3,方便后续描述性分析
|
||||
### 删除不用指标
|
||||
- 发布时间等上述被处理过的指标(原播放量要保留)
|
||||
- 视频简介、标签
|
||||
- 调整顺序(基础信息放前面,其次是连续型变量,最后是分类变量)
|
1137
data_all.csv
Normal file
1137
data_all.csv
Normal file
File diff suppressed because one or more lines are too long
68573
data_text_all/comments_hot.txt
Normal file
68573
data_text_all/comments_hot.txt
Normal file
File diff suppressed because it is too large
Load Diff
923
data_text_all/description.txt
Normal file
923
data_text_all/description.txt
Normal file
File diff suppressed because one or more lines are too long
8311
data_text_all/tags.txt
Normal file
8311
data_text_all/tags.txt
Normal file
File diff suppressed because it is too large
Load Diff
1136
data_text_all/标题.txt
Normal file
1136
data_text_all/标题.txt
Normal file
File diff suppressed because it is too large
Load Diff
160
tool.py
Normal file
160
tool.py
Normal file
@ -0,0 +1,160 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
import requests
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
# 新增分区映射函数
|
||||
def load_partition_mapping(mapping_file='FSR.xlsx'):
    """Load the sub-partition to major-partition lookup table.

    Reads the Excel workbook at ``mapping_file`` and pairs every value of its
    ``分区名称`` column with the value on the same row of its ``大分区`` column.

    Args:
        mapping_file: Path of the Excel workbook holding the mapping.

    Returns:
        dict: ``分区名称`` values as keys, ``大分区`` values as values. When a
        name occurs more than once, the last row wins.
    """
    table = pd.read_excel(mapping_file)
    names = table['分区名称']
    majors = table['大分区']
    return {name: major for name, major in zip(names, majors)}
|
||||
|
||||
def merge_data():
    """Build the merged dataset and export its text columns.

    Pipeline:
      1. Load the partition mapping.
      2. Merge every partition found under ``hot_data`` (flagged 1) and
         ``nohot_data`` (flagged 0).
      3. Concatenate both results and sort them ascending by
         ``发布时间的timestamp``.
      4. Write every column except ``评论内容`` to ``data_all.csv``.
      5. Export the text columns (comments included) to ``data_text_all``.

    Raises:
        ValueError: if ``发布时间的timestamp`` holds a non-numeric value.
    """
    partition_mapping = load_partition_mapping()

    hot_data = merge_partition_data("hot_data", is_hot=1, partition_mapping=partition_mapping)
    nohot_data = merge_partition_data("nohot_data", is_hot=0, partition_mapping=partition_mapping)

    all_data = pd.concat([hot_data, nohot_data], ignore_index=True)

    # Timestamps are made numeric so the sort is chronological, not lexical.
    all_data['发布时间的timestamp'] = pd.to_numeric(all_data['发布时间的timestamp'])
    all_data = all_data.sort_values('发布时间的timestamp')

    # Drop the comment text in memory and write the CSV exactly once.
    # The previous flow wrote the file with the comments, then re-read and
    # re-wrote it to remove them; besides the wasted I/O, that read_csv round
    # trip converts text such as "NA"/"null" to NaN and re-infers dtypes.
    tabular = all_data.drop(columns=['评论内容'], errors='ignore')
    tabular.to_csv('data_all.csv', index=False, encoding='utf-8-sig')

    # The text export still needs the comment column, so it gets the full frame.
    save_text_data(all_data, 'data_text_all')
|
||||
|
||||
def delete_comments_info():
    """Remove the ``评论内容`` column from ``data_all.csv`` in place.

    The file is read from and written back to the current working directory
    using the ``utf-8-sig`` encoding. A file without that column is rewritten
    unchanged.

    Raises:
        FileNotFoundError: if ``data_all.csv`` does not exist.
    """
    # Read every cell as literal text and disable NA detection so that this
    # column-removal pass cannot alter the remaining data: with the default
    # parser settings a title like "NA" would come back as an empty cell and
    # an id like "007" would come back as 7.
    data_all = pd.read_csv(
        'data_all.csv',
        encoding='utf-8-sig',
        dtype=str,
        keep_default_na=False,
    )

    data_all = data_all.drop(columns=['评论内容'], errors='ignore')

    data_all.to_csv('data_all.csv', index=False, encoding='utf-8-sig')
|
||||
|
||||
def _read_video_comments(partition_path, bv):
    """Return one video's comments joined by newlines, or "" when unavailable."""
    # A missing BV id is read by pandas as NaN (a float); it cannot name a directory.
    if not isinstance(bv, str):
        return ""

    bv_dir = os.path.join(partition_path, bv)
    # isdir rather than exists: a plain file named like the BV id must not reach listdir.
    if not os.path.isdir(bv_dir):
        return ""

    # Comment files are named "<BV>_<suffix>_comments.csv". os.listdir order is
    # arbitrary, so sort to make "the first match" reproducible between runs.
    comment_files = sorted(
        name for name in os.listdir(bv_dir)
        if name.startswith(f"{bv}_") and name.endswith("_comments.csv")
    )
    if not comment_files:
        return ""

    comment_file = os.path.join(bv_dir, comment_files[0])
    try:
        comments = pd.read_csv(comment_file, encoding='utf-8')
        return "\n".join(comments['评论内容'].dropna().astype(str))
    except Exception as e:
        # Deliberate best effort: one unreadable comment file must not abort the merge.
        print(f"读取评论文件 {comment_file} 失败: {str(e)}")
        return ""


def merge_partition_data(base_path, is_hot, partition_mapping):
    """Merge the per-partition CSV files under ``base_path`` into one DataFrame.

    For every sub-directory of ``base_path`` that contains an ``info.csv``:
      * a ``大分区`` column is derived by mapping ``分区`` through
        ``partition_mapping``;
      * if ``up_info.csv`` exists, its columns (minus ``uid`` and ``昵称``) are
        prefixed with ``up主`` and left-joined on ``BV号``;
      * the ``视频荣誉`` column is dropped and ``是否热门`` is set to ``is_hot``;
      * a ``评论内容`` column is filled from the first
        ``<BV>_*_comments.csv`` file (sorted by name) in the video's own
        directory, or "" when none can be read.

    Args:
        base_path: Directory whose sub-directories are the partitions.
        is_hot: Value stored in the ``是否热门`` column; its truthiness also
            selects the progress-bar label.
        partition_mapping: Mapping applied to the ``分区`` column.

    Returns:
        pandas.DataFrame: all partitions concatenated with a fresh index, or
        an empty DataFrame when no partition provides an ``info.csv``.

    Raises:
        FileNotFoundError: if ``base_path`` does not exist.
        KeyError: if an ``info.csv`` lacks the ``分区`` or ``BV号`` column.
    """
    all_data = []

    partitions = [
        entry for entry in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, entry))
    ]

    for partition in tqdm(partitions, desc=f"处理{'热门' if is_hot else '非热门'}数据"):
        partition_path = os.path.join(base_path, partition)

        info_file = os.path.join(partition_path, 'info.csv')
        if not os.path.exists(info_file):
            continue

        info_df = pd.read_csv(info_file, encoding='utf-8')
        info_df['大分区'] = info_df['分区'].map(partition_mapping)

        up_info_file = os.path.join(partition_path, 'up_info.csv')
        if os.path.exists(up_info_file):
            up_df = pd.read_csv(up_info_file, encoding='utf-8')
            up_df = up_df.drop(columns=['uid', '昵称'], errors='ignore')
            # Prefix uploader columns so they cannot collide with video columns;
            # the join key keeps its name.
            up_df = up_df.rename(columns=lambda x: f'up主{x}' if x != 'BV号' else x)
            info_df = pd.merge(info_df, up_df, on='BV号', how='left')

        info_df = info_df.drop(columns=['视频荣誉'], errors='ignore')
        info_df['是否热门'] = is_hot

        # One entry per row, in row order, so the list lines up with info_df.
        info_df['评论内容'] = [
            _read_video_comments(partition_path, bv) for bv in info_df['BV号']
        ]

        all_data.append(info_df)

    # pd.concat raises ValueError on an empty list; report "no data" instead.
    if not all_data:
        return pd.DataFrame()

    return pd.concat(all_data, ignore_index=True)
|
||||
|
||||
def save_text_data(df, output_dir):
    """Export the text columns of ``df`` to UTF-8 ``.txt`` files.

    For each known column that is present in ``df`` one file is written into
    ``output_dir`` (created if missing), entries separated by newlines:

      * ``title``    -> ``标题.txt``
      * ``标签``      -> ``tags.txt``; each cell is parsed as a Python list
        literal and flattened to one tag per line, unparseable cells are
        skipped
      * ``简介``      -> ``description.txt``
      * ``弹幕内容``  -> ``danmaku.txt``
      * ``评论内容``  -> ``comments_hot.txt`` holding only rows where
        ``是否热门 == 1``; if ``df`` has no ``是否热门`` column, every row
        goes to ``comments.txt`` instead

    Missing values are skipped in every column.

    Args:
        df: DataFrame holding the merged video data.
        output_dir: Directory that receives the text files.

    Returns:
        pandas.DataFrame: a copy of ``df`` without the ``评论内容`` column;
        ``df`` itself is not modified.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    os.makedirs(output_dir, exist_ok=True)

    # Source column -> output file stem.
    text_fields = {
        'title': '标题',
        '标签': 'tags',
        '简介': 'description',
        '弹幕内容': 'danmaku',
        '评论内容': 'comments'
    }

    for field, filename in text_fields.items():
        if field not in df.columns:
            continue

        if field == '标签':
            lines = []
            for raw in df[field].dropna():
                try:
                    # SECURITY: tag strings come from scraped data, so they are
                    # parsed with ast.literal_eval (literals only) rather than
                    # eval(), which would execute arbitrary expressions.
                    tags = ast.literal_eval(raw) if isinstance(raw, str) else raw
                    # Build the whole list first so a bad cell adds nothing.
                    lines.extend([tag.strip("'\" ") for tag in tags])
                except (ValueError, SyntaxError, TypeError, AttributeError,
                        RecursionError, MemoryError):
                    # Malformed cell: skip it, keep exporting the rest.
                    continue
            out_name = f'{filename}.txt'
        elif field == '评论内容' and '是否热门' in df.columns:
            lines = df[df['是否热门'] == 1][field].dropna().astype(str).tolist()
            out_name = f'{filename}_hot.txt'
        else:
            lines = df[field].dropna().astype(str).tolist()
            out_name = f'{filename}.txt'

        with open(os.path.join(output_dir, out_name), 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

    return df.drop(columns=['评论内容'], errors='ignore')
|
||||
|
||||
# Script entry point: run the full merge only when executed directly, not on import.
if __name__ == "__main__":
    merge_data()
|
BIN
~$FSR.xlsx
BIN
~$FSR.xlsx
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user