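"""Merge crawled Bilibili video data into a single dataset.

Combines the hot_data/ and nohot_data/ directory trees, maps each
sub-partition to its top-level partition via FSR.xlsx, writes the merged
table to data_all_second_ver.csv, and dumps the text columns (title, tags,
description, danmaku, comments) to data_text_all/.
"""
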
import ast
import os

import pandas as pd
from tqdm import tqdm


# Newly added: helper for the partition mapping
def load_partition_mapping(mapping_file='FSR.xlsx'):
    """Load the sub-partition -> top-level partition mapping from Excel."""
    mapping_df = pd.read_excel(mapping_file)
    return dict(zip(mapping_df['分区名称'], mapping_df['大分区']))


def merge_data():
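    """Merge trending and non-trending data into one dataset.

    Reads both directory trees, concatenates them, sorts by publish
    timestamp, writes data_all_second_ver.csv, dumps the text columns to
    data_text_all/, then strips the comment column from the CSV.
    """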
    # Load the partition mapping
    partition_mapping = load_partition_mapping()
    # Merge the trending and non-trending data
    hot_data = merge_partition_data("hot_data", is_hot=1, partition_mapping=partition_mapping)
    nohot_data = merge_partition_data("nohot_data", is_hot=0, partition_mapping=partition_mapping)
    # Concatenate everything
    all_data = pd.concat([hot_data, nohot_data], ignore_index=True)
    # Sort by publish timestamp
    all_data['发布时间的timestamp'] = pd.to_numeric(all_data['发布时间的timestamp'])
    all_data = all_data.sort_values('发布时间的timestamp')
    # Save the non-text data
    all_data.to_csv('data_all_second_ver.csv', index=False, encoding='utf-8-sig')
    # Extract and save the text data
    save_text_data(all_data, 'data_text_all')
    delete_comments_info()


def delete_comments_info():
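    """Drop the comment-text column from data_all_second_ver.csv in place."""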
    # Re-read data_all_second_ver.csv
    data_all = pd.read_csv('data_all_second_ver.csv', encoding='utf-8-sig')
    # Drop the comment-content column
    data_all = data_all.drop(columns=['评论内容'], errors='ignore')
    # Save the file back
    data_all.to_csv('data_all_second_ver.csv', index=False, encoding='utf-8-sig')


def merge_partition_data(base_path, is_hot, partition_mapping):
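    """Read every partition directory under base_path into one DataFrame.

    Each partition is expected to hold info.csv, optionally up_info.csv,
    and one sub-directory per BV number containing *_comments.csv files
    (layout inferred from the code below).
    """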
    all_data = []
    # Iterate over every partition directory
    partitions = [d for d in os.listdir(base_path)
                  if os.path.isdir(os.path.join(base_path, d))]
    for partition in tqdm(partitions, desc=f"Processing {'trending' if is_hot else 'non-trending'} data"):
        partition_path = os.path.join(base_path, partition)
        # Read the info file
        info_file = os.path.join(partition_path, 'info.csv')
        if not os.path.exists(info_file):
            continue
        info_df = pd.read_csv(info_file, encoding='utf-8')
        # Map each sub-partition onto its top-level partition
        info_df['大分区'] = info_df['分区'].map(partition_mapping)
        # Read the uploader info file and merge it in
        up_info_file = os.path.join(partition_path, 'up_info.csv')
        if os.path.exists(up_info_file):
            up_df = pd.read_csv(up_info_file, encoding='utf-8')
            # Drop columns we do not need
            up_df = up_df.drop(columns=['uid', '昵称'], errors='ignore')
            # Prefix the uploader columns with 'up主' (keep the BV-number key intact)
            up_df = up_df.rename(columns=lambda x: f'up主{x}' if x != 'BV号' else x)
            info_df = pd.merge(info_df, up_df, on='BV号', how='left')
        # Drop the video-honours column
        info_df = info_df.drop(columns=['视频荣誉'], errors='ignore')
        # Flag whether the video is trending
        info_df['是否热门'] = is_hot
        # Load the comment data, but keep it out of data_all.csv
        # (it is only used to build the separate comment text file)
        comments_list = []
        for bv in info_df['BV号']:
            bv_dir = os.path.join(partition_path, bv)
            if os.path.exists(bv_dir):
                # Match comment files that carry a numeric suffix
                comment_files = [f for f in os.listdir(bv_dir)
                                 if f.startswith(f"{bv}_") and f.endswith("_comments.csv")]
                if comment_files:
                    comment_file = os.path.join(bv_dir, comment_files[0])
                    try:
                        # Read the first matching comment file
                        comments = pd.read_csv(comment_file, encoding='utf-8')
                        comments_text = "\n".join(comments['评论内容'].dropna().astype(str))
                        comments_list.append(comments_text)
                    except Exception as e:
                        print(f"Failed to read comment file {comment_file}: {e}")
                        comments_list.append("")
                else:
                    comments_list.append("")
            else:
                comments_list.append("")
        info_df['评论内容'] = comments_list
        all_data.append(info_df)
    return pd.concat(all_data, ignore_index=True)


def save_text_data(df, output_dir):
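    """Dump the text columns of df into per-field .txt files under output_dir.

    Tags are flattened from their list literals; comments are kept only for
    trending videos. Returns df with the comment column removed.
    """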
    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)
    # Text fields to export (column name -> output filename)
    text_fields = {
        '标题': 'title',
        '标签': 'tags',
        '简介': 'description',
        '弹幕内容': 'danmaku',
        '评论内容': 'comments'
    }
    # Save each text field
    for field, filename in text_fields.items():
        if field in df.columns:
            # Special case: tags are stored as list literals, flatten to plain text
            if field == '标签':
                tags_texts = []
                for tags_json in df[field].dropna():
                    try:
                        # Parse the list literal safely instead of using bare eval()
                        tags = ast.literal_eval(tags_json) if isinstance(tags_json, str) else tags_json
                        tags_texts.extend([tag.strip("'\" ") for tag in tags])
                    except Exception:
                        continue
                with open(f'{output_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
                    f.write('\n'.join(tags_texts))
            # Special case: comments, save only those from trending videos
            elif field == '评论内容' and '是否热门' in df.columns:
                hot_comments = df[df['是否热门'] == 1][field].dropna().astype(str).tolist()
                with open(f'{output_dir}/{filename}_hot.txt', 'w', encoding='utf-8') as f:
                    f.write('\n'.join(hot_comments))
            # Plain text fields
            else:
                texts = df[field].dropna().astype(str).tolist()
                with open(f'{output_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
                    f.write('\n'.join(texts))
    # Drop the comment column from the DataFrame
    df = df.drop(columns=['评论内容'], errors='ignore')
    return df


if __name__ == '__main__':
    merge_data()
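
# Example usage, assuming this script is saved as merge_data.py (hypothetical
# name) and FSR.xlsx plus the hot_data/ and nohot_data/ trees sit in the
# working directory:
#   python merge_data.py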