statistics_model2025/弹幕情感评分.py

48 lines
1.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer # 修改为从transformers导入
def load_data(file_path):
    """Load the danmaku text column from a CSV file as a list of strings.

    Only the ``弹幕内容`` column is read. Cells are kept as raw text
    (``dtype=str``) and only genuinely empty cells count as missing, so
    numeric-looking comments such as ``007`` and literal comments such as
    ``null`` or ``NA`` are preserved verbatim instead of being reformatted
    or dropped by pandas' default type/NA inference.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The non-empty cells of the column, in file order. If reading fails
        for any reason (e.g. the file or the column does not exist) the
        error is printed and an empty list is returned.
    """
    try:
        df = pd.read_csv(
            file_path,
            usecols=['弹幕内容'],
            engine='python',
            dtype=str,               # keep text verbatim, no numeric inference
            keep_default_na=False,   # do not treat "NA"/"null"/"None" as missing
            na_values=[''],          # only truly empty cells are missing
        )
        return df['弹幕内容'].dropna().astype(str).tolist()
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller
        # handle the "no data" case.
        print(f"数据加载失败: {e}")
        return []
def analyze_sentiment(texts, batch_size=32):
    """Return the mean class-1 probability of ``texts`` under the sentiment model.

    Loads the ``IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment`` tokenizer and
    sequence-classification model, scores the texts in mini-batches and
    averages the softmax probability at class index 1 over all texts.

    Args:
        texts: Strings to score.
        batch_size: Number of texts per forward pass. Scoring in chunks
            keeps peak memory bounded instead of growing with the total
            number of texts.

    Returns:
        The mean probability as a float. ``0.5`` (used as a neutral default)
        is returned when ``texts`` is empty or when loading the model or
        running inference raises.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = [] if texts is None else list(texts)
    if not texts:
        # Nothing to score: skip the expensive model load entirely.
        return 0.5

    model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)

        positive_total = 0.0
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt",
                )
                probs = torch.softmax(model(**inputs).logits, dim=1)
                # Assumes class index 1 is the positive label — TODO confirm
                # against model.config.id2label.
                positive_total += probs[:, 1].sum().item()

        # Sum over all texts divided by the total count, so the result is
        # the mean over texts rather than a mean of per-batch means.
        return positive_total / len(texts)
    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        return 0.5  # neutral score as the default
if __name__ == "__main__":
    # Sample danmaku CSV to score.
    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
    danmu_texts = load_data(file_path)
    # Guard first: without any loaded text there is nothing to score.
    if not danmu_texts:
        print("未找到有效弹幕数据")
    else:
        final_score = analyze_sentiment(danmu_texts)
        print(f"B站弹幕情感评分{final_score:.4f}")