statistics_model2025/snowNLP_danmu sentiment_analyzer.py

122 lines
4.1 KiB
Python
Raw Permalink Normal View History

2025-03-29 14:02:40 +08:00
import pandas as pd
2025-03-29 15:41:31 +08:00
import numpy as np
2025-03-29 14:02:40 +08:00
from snownlp import SnowNLP
2025-03-29 15:41:31 +08:00
import os
2025-03-29 14:02:40 +08:00
def load_data(file_path):
    """Load the danmu text column of a CSV file as a list of strings.

    Only the ``弹幕内容`` column is read. Missing values are dropped and
    every remaining value is returned as ``str``.

    Args:
        file_path: Path of the danmu CSV file.

    Returns:
        A list of danmu strings in file order, or an empty list when the
        file cannot be read or parsed (the error is printed, not raised).
    """
    try:
        # dtype=str stops pandas from type-inferring the column: without it
        # a column of numeric-looking danmu with one blank row is parsed as
        # float64, so "666" would come back as "666.0" and "007" as "7.0".
        df = pd.read_csv(
            file_path,
            usecols=['弹幕内容'],
            dtype=str,
            engine='python',
        )
        return df['弹幕内容'].dropna().astype(str).tolist()
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the
        # whole batch, so report the problem and return "no data".
        print(f"数据加载失败: {str(e)}")
        return []
2025-03-29 15:41:31 +08:00
# Hand-assigned sentiment scores for danmu slang, used instead of SnowNLP
# when a danmu is exactly equal to one of these keys.  The values were
# chosen by hand against the stock dictionary, in which "很好" scores 0.78,
# "一般" scores 0.52 and "差" scores 0.14.
_SPECIAL_CASE_SCORES = {
    # Strongly positive slang
    "爷青回": 0.9,      # nostalgia ("my youth is back")
    "yyds": 0.9,        # "永远滴神" (the eternal god)
    "YYDS": 0.9,        # "永远滴神" (the eternal god)
    "kksk": 0.8,        # likes it a lot
    "awsl": 0.8,        # "啊我死了" (moved / overwhelmed)
    "阿伟死了": 0.8,    # homophone of the entry above (moved)
    "类目": 0.8,        # touching scene (homophone of "泪目")
    "排面": 0.8,        # impressive display
    "文艺复兴": 0.8,    # a classic brought back
    "绝绝子": 0.7,      # superb
    "双厨狂喜": 0.7,    # crossover
    "梦幻联动": 0.7,    # collaboration across works
    "注入灵魂": 0.7,    # high-energy clip
    # Meme / interaction phrases
    "下次一定": 0.55,   # "next time for sure" coin-procrastination meme
    "你币没了": 0.45,   # threat of withholding coins
    # Iconic high-energy scenes
    "名场面": 0.85,     # classic scene
    "神仙打架": 0.9,    # duel between masters
    "前方高能": 0.7,    # climax warning
    # Numbers
    "10": 0.85,         # rating on a ten-point scale
    "100分": 0.85,      # rating on a hundred-point scale
    "5": 0.85,          # rating on a five-point scale
    "666": 0.75,        # played impressively
    "999": 0.75,        # "666" flipped over
    "2333": 0.6,        # laughter
    # "Abstract" meme culture
    # NOTE(review): the original table had TWO entries whose key was the
    # empty string -- one scored 0.3 (annotated "classic repetition,
    # pejorative") and one scored 0.55 (annotated "laugh, neutral").  Their
    # real key text appears to have been lost.  Only the later value ever
    # took effect in a dict literal, so that one is kept here; restore the
    # intended keys if they can be recovered.
    "": 0.55,           # laugh (neutral)
    "生草": 0.6,        # funny scene
    # Emotional-breakdown scenes
    "破防了": 0.4,      # psychological defences collapsed
    "我裂开了": 0.3,    # mentally shattered
    # Domain-specific memes
    "奥利给": 0.8,      # cheering on
    "DNA动了": 0.8,     # triggers a memory
    "有内味了": 0.7,    # has the right flavour
    # Negative scenes
    "阴间": 0.3,        # creepy content
    "血压上来了": 0.3,  # irritating
}


def analyze_sentiment(danmu_texts):
    """Return the mean sentiment score of a collection of danmu texts.

    A text that is exactly equal to a key of ``_SPECIAL_CASE_SCORES`` gets
    that hand-assigned score; every other text is scored with
    ``SnowNLP(text).sentiments``.

    Args:
        danmu_texts: Iterable of danmu strings.

    Returns:
        The arithmetic mean of the per-text scores, or ``nan`` when
        ``danmu_texts`` yields nothing.
    """
    sentiment_scores = [
        _SPECIAL_CASE_SCORES[item]
        if item in _SPECIAL_CASE_SCORES
        else SnowNLP(item).sentiments
        for item in danmu_texts
    ]
    if not sentiment_scores:
        # np.mean([]) also yields nan but emits a RuntimeWarning; return the
        # same value explicitly and quietly.
        return float("nan")
    return np.mean(sentiment_scores)
def process_all_partitions(base_path):
    """Run ``process_partition`` on every immediate subdirectory of a root.

    Entries of ``base_path`` that are not directories are ignored.

    Args:
        base_path: Directory whose subdirectories are the partitions.

    Raises:
        OSError: If ``base_path`` cannot be listed (e.g. it does not exist).
    """
    # os.listdir returns entries in arbitrary order; sort so that the
    # processing order and the progress log are reproducible between runs.
    partitions = sorted(
        entry
        for entry in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, entry))
    )
    for partition in partitions:
        print(f"正在处理分区: {partition}")
        process_partition(os.path.join(base_path, partition))
2025-03-29 15:41:31 +08:00
# process_partition function
2025-03-29 15:41:31 +08:00
def process_partition(partition_path):
    """Score the danmu of every video listed in a partition's ``info.csv``.

    For each value in the ``BV号`` column of ``<partition_path>/info.csv``
    the directory ``<partition_path>/<BV>/`` is searched for a file whose
    name starts with the BV id and ends with ``danmaku.csv``.  The danmu in
    that file are scored with ``analyze_sentiment`` and the result is stored
    in a ``弹幕情感评分snowNLP`` column, after which ``info.csv`` is
    rewritten in place.  Rows for which no score could be computed keep an
    empty value.

    Args:
        partition_path: Directory of one partition.

    Returns:
        None.  If ``info.csv`` is absent a message is printed and nothing is
        written.
    """
    info_file = os.path.join(partition_path, 'info.csv')
    if not os.path.exists(info_file):
        print(f"未找到info文件: {info_file}")
        return
    info_df = pd.read_csv(info_file, encoding='utf-8')

    # One slot per row, filled by index so skipped rows stay None.
    scores = [None] * len(info_df)

    for idx, bv in enumerate(info_df['BV号']):
        # A blank BV cell is read as NaN; it cannot name a directory.
        if pd.isna(bv):
            continue
        bv = str(bv)

        # isdir (not exists): a regular file with the BV name would make
        # os.listdir raise NotADirectoryError.
        danmu_dir = os.path.join(partition_path, bv)
        if not os.path.isdir(danmu_dir):
            continue

        # Sorted so the choice is deterministic if several files match.
        candidates = sorted(
            name
            for name in os.listdir(danmu_dir)
            if name.startswith(bv) and name.endswith('danmaku.csv')
        )
        if not candidates:
            continue

        danmu_texts = load_data(os.path.join(danmu_dir, candidates[0]))
        if not danmu_texts:
            continue

        scores[idx] = analyze_sentiment(danmu_texts)

    info_df['弹幕情感评分snowNLP'] = scores
    info_df.to_csv(info_file, index=False, encoding='utf-8-sig')
2025-03-29 15:41:31 +08:00
# Example usage: process every partition under both dataset roots.
for _dataset_root in ("hot_data", "nohot_data"):
    process_all_partitions(_dataset_root)