statistics_model2025/BERT_danmu_sentiment_analyzer.py

49 lines
1.6 KiB
Python
Raw Normal View History

2025-03-29 16:18:20 +08:00
# 修改导入部分
from transformers import AutoModelForSequenceClassification, AutoTokenizer # 替换为 transformers 库
2025-03-29 14:02:40 +08:00
import pandas as pd
import torch
2025-03-29 15:46:52 +08:00
def load_data(file_path):
    """Load the danmaku text column from a CSV file.

    Only the ``弹幕内容`` column is read. Missing values are dropped and the
    remaining entries are converted to ``str``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of danmaku strings, or an empty list if reading or parsing
        fails for any reason (the failure is printed, not raised).
    """
    text_column = '弹幕内容'
    try:
        frame = pd.read_csv(file_path, usecols=[text_column], engine='python')
        non_missing = frame[text_column].dropna()
        as_text = non_missing.astype(str)
        return as_text.tolist()
    except Exception as err:
        # Best-effort loader: report the problem and let the caller see "no data".
        print(f"数据加载失败: {err}")
        return []
def analyze_sentiment(texts, batch_size=32):
    """Return the mean class-1 probability of ``texts`` under a sentiment model.

    The texts are scored with the HuggingFace checkpoint
    ``IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment``. Scoring is done in
    mini-batches so that memory use is bounded by ``batch_size`` rather than
    by the size of the whole corpus; the returned value is still the mean over
    every text.

    Args:
        texts: An iterable of strings to score. A single ``str`` is treated
            as one text.
        batch_size: Number of texts tokenized and passed through the model
            per forward pass. Must be at least 1.

    Returns:
        The average softmax probability at class index 1 across all texts
        (presumably the "positive" label -- confirm against the model's
        ``id2label`` config). Returns the neutral value ``0.5`` when ``texts``
        is empty or when loading/inference fails (the failure is printed).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    try:
        # Normalise the input so len() and slicing work for any iterable.
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # Nothing to score: skip loading the (large) checkpoint entirely.
        if not texts:
            return 0.5

        model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)

        class_one_total = 0.0
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt",
                )
                logits = model(**inputs).logits
                probs = torch.softmax(logits, dim=1)
                # Accumulate a sum (not per-batch means) so that a short
                # final batch does not skew the overall average.
                class_one_total += probs[:, 1].sum().item()

        return class_one_total / len(texts)
    except Exception as e:
        print(f"情感分析失败: {str(e)}")
        return 0.5  # neutral fallback on any failure
2025-03-29 15:41:31 +08:00
2025-03-29 14:02:40 +08:00
if __name__ == "__main__":
    # Sample danmaku CSV that is analysed when the module runs as a script.
    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"

    # Load the danmaku, then report either the score or the lack of data.
    danmu_texts = load_data(file_path)
    if not danmu_texts:
        print("未找到有效弹幕数据")
    else:
        final_score = analyze_sentiment(danmu_texts)
        print(f"B站弹幕情感评分{final_score:.4f}")