import ast
import os

import pandas as pd
from tqdm import tqdm


def load_partition_mapping(mapping_file='FSR.xlsx'):
    """Load the partition -> top-level partition mapping from an Excel sheet."""
    mapping_df = pd.read_excel(mapping_file)
    return dict(zip(mapping_df['分区名称'], mapping_df['大分区']))


def merge_data():
    # Load the partition mapping
    partition_mapping = load_partition_mapping()

    # Merge the hot and non-hot data sets
    hot_data = merge_partition_data("hot_data", is_hot=1, partition_mapping=partition_mapping)
    nohot_data = merge_partition_data("nohot_data", is_hot=0, partition_mapping=partition_mapping)

    # Combine everything into one frame
    all_data = pd.concat([hot_data, nohot_data], ignore_index=True)

    # Sort by publish time
    all_data['发布时间的timestamp'] = pd.to_numeric(all_data['发布时间的timestamp'])
    all_data = all_data.sort_values('发布时间的timestamp')

    # Save the non-text data
    all_data.to_csv('data_all_second_ver.csv', index=False, encoding='utf-8-sig')

    # Extract and save the text data, then strip comments from the merged CSV
    save_text_data(all_data, 'data_text_all')
    delete_comments_info()


def delete_comments_info():
    # Re-read data_all_second_ver.csv, drop the comment column, and write it back
    data_all = pd.read_csv('data_all_second_ver.csv', encoding='utf-8-sig')
    data_all = data_all.drop(columns=['评论内容'], errors='ignore')
    data_all.to_csv('data_all_second_ver.csv', index=False, encoding='utf-8-sig')


def merge_partition_data(base_path, is_hot, partition_mapping):
    all_data = []

    # Iterate over all partition directories
    partitions = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]
    for partition in tqdm(partitions, desc=f"Processing {'hot' if is_hot else 'non-hot'} data"):
        partition_path = os.path.join(base_path, partition)

        # Read the info file
        info_file = os.path.join(partition_path, 'info.csv')
        if not os.path.exists(info_file):
            continue
        info_df = pd.read_csv(info_file, encoding='utf-8')

        # Map each partition to its top-level partition
        info_df['大分区'] = info_df['分区'].map(partition_mapping)

        # Read the up_info file and merge it in
        up_info_file = os.path.join(partition_path, 'up_info.csv')
        if os.path.exists(up_info_file):
            up_df = pd.read_csv(up_info_file, encoding='utf-8')
            # Drop columns we do not need
            up_df = up_df.drop(columns=['uid', '昵称'], errors='ignore')
            # Prefix uploader fields with 'up主', keeping the join key intact
            up_df = up_df.rename(columns=lambda x: f'up主{x}' if x != 'BV号' else x)
            info_df = pd.merge(info_df, up_df, on='BV号', how='left')

        # Drop the video-honors column
        info_df = info_df.drop(columns=['视频荣誉'], errors='ignore')

        # Flag whether this row comes from the hot set
        info_df['是否热门'] = is_hot

        # Load comment data; it is only used to build the separate text files,
        # not kept in the merged CSV
        comments_list = []
        for bv in info_df['BV号']:
            bv_dir = os.path.join(partition_path, bv)
            if os.path.exists(bv_dir):
                # Match comment files that carry a numeric suffix, e.g. <BV>_1_comments.csv
                comment_files = [f for f in os.listdir(bv_dir)
                                 if f.startswith(f"{bv}_") and f.endswith("_comments.csv")]
                if comment_files:
                    try:
                        # Read the first matching comment file
                        comment_file = os.path.join(bv_dir, comment_files[0])
                        comments = pd.read_csv(comment_file, encoding='utf-8')
                        comments_text = "\n".join(comments['评论内容'].dropna().astype(str))
                        comments_list.append(comments_text)
                    except Exception as e:
                        print(f"Failed to read comment file {comment_file}: {e}")
                        comments_list.append("")
                else:
                    comments_list.append("")
            else:
                comments_list.append("")
        info_df['评论内容'] = comments_list

        all_data.append(info_df)

    # Guard against an empty directory: pd.concat raises on an empty list
    if not all_data:
        return pd.DataFrame()
    return pd.concat(all_data, ignore_index=True)
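# A sketch of the on-disk layout this script assumes, inferred from the reads
# above; any file name not referenced directly in the code is an assumption:
#
#   FSR.xlsx                                          mapping sheet with columns '分区名称' and '大分区'
#   hot_data/<partition>/info.csv                     per-video metadata, keyed by 'BV号'
#   hot_data/<partition>/up_info.csv                  uploader metadata, keyed by 'BV号'
#   hot_data/<partition>/<BV>/<BV>_<n>_comments.csv   comment dump with a '评论内容' column
#   nohot_data/                                       same layout as hot_data/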
def save_text_data(df, output_dir):
    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Text fields to export: column name -> output file stem
    # (the original mapping had '标题'/'title' reversed, so titles were never written)
    text_fields = {
        '标题': 'title',
        '标签': 'tags',
        '简介': 'description',
        '弹幕内容': 'danmaku',
        '评论内容': 'comments'
    }

    # Save each text field
    for field, filename in text_fields.items():
        if field not in df.columns:
            continue

        # Special case: tags are stored as stringified Python lists
        if field == '标签':
            tags_texts = []
            for tags_json in df[field].dropna():
                try:
                    # Parse the list literal safely instead of using eval()
                    tags = ast.literal_eval(tags_json) if isinstance(tags_json, str) else tags_json
                    tags_texts.extend([str(tag).strip("'\" ") for tag in tags])
                except (ValueError, SyntaxError):
                    continue
            with open(f'{output_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
                f.write('\n'.join(tags_texts))

        # Special case: only save comments from hot videos
        elif field == '评论内容' and '是否热门' in df.columns:
            hot_comments = df[df['是否热门'] == 1][field].dropna().astype(str).tolist()
            with open(f'{output_dir}/{filename}_hot.txt', 'w', encoding='utf-8') as f:
                f.write('\n'.join(hot_comments))

        # Plain text fields
        else:
            texts = df[field].dropna().astype(str).tolist()
            with open(f'{output_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
                f.write('\n'.join(texts))

    # Return a copy without the comment column (the caller may ignore it)
    df = df.drop(columns=['评论内容'], errors='ignore')
    return df


if __name__ == '__main__':
    merge_data()
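# Minimal usage sketch, assuming this file is saved as merge_data.py (the file
# name is an assumption) and the inputs described above are in place:
#
#   $ python merge_data.py
#
# Outputs:
#   data_all_second_ver.csv          one row per video, comment text removed
#   data_text_all/title.txt          one title per line
#   data_text_all/tags.txt           one tag per line
#   data_text_all/description.txt    video descriptions
#   data_text_all/danmaku.txt        danmaku (on-screen comment) text
#   data_text_all/comments_hot.txt   comment text from hot videos only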