import pandas as pd
import numpy as np
from snownlp import SnowNLP
import os
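
# Third-party dependencies assumed by this script: pandas, numpy, snownlp
# (install with: pip install pandas numpy snownlp)

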
def load_data(file_path):
    """Load danmaku (bullet-comment) texts from a CSV file.

    Returns a list of strings, or an empty list if loading fails.
    """
    try:
        # '弹幕内容' is the danmaku-text column in the source CSV
        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
        return df['弹幕内容'].dropna().astype(str).tolist()
    except Exception as e:
        print(f"Failed to load data: {e}")
        return []
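
# The danmaku CSV needs only a '弹幕内容' column; any other columns are
# skipped by usecols. Illustrative file contents (not real data):
#
#   弹幕内容
#   前方高能
#   哈哈哈哈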


def analyze_sentiment(danmu_texts):
    """Return the mean SnowNLP sentiment score (0 = negative, 1 = positive)."""
    if not danmu_texts:
        return None  # avoid np.mean([]) returning nan with a warning
    sentiment_scores = []
    for item in danmu_texts:
        s = SnowNLP(item)
        sentiment_scores.append(s.sentiments)
    return np.mean(sentiment_scores)
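
# Note: SnowNLP's sentiment model is reportedly trained mostly on e-commerce
# review text, so danmaku scores are a rough signal rather than a calibrated
# probability. Rough expected behaviour on illustrative strings (not from
# the dataset):
#
#   analyze_sentiment(["这个视频太棒了"])    -> close to 1 (positive)
#   analyze_sentiment(["讲得很差,不想看"])  -> close to 0 (negative)
#   analyze_sentiment([])                    -> None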


def process_all_partitions(base_path):
    """Run process_partition on every subdirectory of base_path."""
    # Collect all partition directories under the base path
    partitions = [d for d in os.listdir(base_path)
                  if os.path.isdir(os.path.join(base_path, d))]
    for partition in partitions:
        partition_path = os.path.join(base_path, partition)
        print(f"Processing partition: {partition}")
        process_partition(partition_path)
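
# Expected on-disk layout, as implied by the lookups below (names illustrative):
#
#   <base_path>/                     e.g. "hot_data" or "nohot_data"
#       <partition>/                 one subdirectory per partition
#           info.csv                 must contain a 'BV号' (BV id) column
#           <BV号>/                  one directory per video
#               <BV号>*danmaku.csv   must contain a '弹幕内容' column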


def process_partition(partition_path):
    """Score each video's danmaku in one partition and write results to its info.csv."""
    info_file = os.path.join(partition_path, 'info.csv')
    if not os.path.exists(info_file):
        print(f"info file not found: {info_file}")
        return

    info_df = pd.read_csv(info_file, encoding='utf-8')
    # One score slot per row of info_df; None marks videos without usable danmaku
    scores = [None] * len(info_df)

    for idx, bv in enumerate(info_df['BV号']):
        # Danmaku files for a video live in a directory named after its BV id
        danmu_dir = os.path.join(partition_path, bv)
        if not os.path.exists(danmu_dir):
            continue  # leave the score as None

        # Find the danmaku file(s) matching this BV id
        danmu_files = [f for f in os.listdir(danmu_dir)
                       if f.startswith(bv) and f.endswith('danmaku.csv')]
        if not danmu_files:
            continue  # leave the score as None

        danmu_file = os.path.join(danmu_dir, danmu_files[0])
        danmu_texts = load_data(danmu_file)
        if not danmu_texts:
            continue  # leave the score as None

        # Store the score at the row index matching this video
        scores[idx] = analyze_sentiment(danmu_texts)

    info_df['弹幕情感评分snowNLP'] = scores
    # utf-8-sig writes a BOM so the Chinese headers open cleanly in Excel;
    # this overwrites info.csv in place
    info_df.to_csv(info_file, index=False, encoding='utf-8-sig')


# Example usage: score every partition in both datasets
if __name__ == "__main__":
    process_all_partitions("hot_data")
    process_all_partitions("nohot_data")