2025-03-29 14:02:40 +08:00
|
|
|
|
import pandas as pd
|
|
|
|
|
import torch
|
2025-03-29 15:37:12 +08:00
|
|
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer # 修改为从transformers导入
|
2025-03-29 14:02:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_data(file_path):
    """Load the danmaku text column from a CSV file.

    Only the '弹幕内容' column is read. Missing values are dropped and the
    remaining cells are converted to ``str``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of strings, one per non-empty cell. On any failure (missing
        file, missing column, parse error) the error is printed and an
        empty list is returned instead of raising.
    """
    column = '弹幕内容'
    try:
        frame = pd.read_csv(file_path, engine='python', usecols=[column])
        comments = frame[column].dropna()
        return comments.astype(str).tolist()
    except Exception as err:
        # Best-effort loader: report the problem and let the caller treat
        # an empty list as "no data".
        print(f"数据加载失败: {str(err)}")
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_sentiment(texts, batch_size=64):
    """Return the mean positive-sentiment probability of ``texts``.

    Each text is scored with the
    "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment" sequence-classification
    model; the softmax probability at class index 1 is averaged over all
    texts.

    Args:
        texts: A list of strings to score. A single ``str`` is treated as a
            one-element list.
        batch_size: Number of texts per forward pass. The texts are run in
            chunks so memory use is bounded by the chunk size rather than
            by the whole corpus. Values below 1 are treated as 1.

    Returns:
        A float in [0, 1]. The neutral value 0.5 is returned when ``texts``
        is empty, when the model cannot be loaded, or when inference fails
        (the error is printed in the latter two cases).
    """
    neutral_score = 0.5  # Default returned whenever no real score exists.

    # Normalize the input so that chunking below slices texts, not characters.
    if isinstance(texts, str):
        texts = [texts]
    elif texts is not None:
        texts = list(texts)
    if not texts:
        # Nothing to score: skip the expensive model load entirely.
        return neutral_score

    batch_size = max(1, int(batch_size))
    model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        return neutral_score

    try:
        positive_total = 0.0
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                chunk = texts[start:start + batch_size]
                inputs = tokenizer(
                    chunk,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt",
                )
                logits = model(**inputs).logits
                probs = torch.softmax(logits, dim=1)
                # NOTE(review): index 1 is assumed to be the positive class
                # (carried over from the original code) - confirm against
                # the model's label mapping.
                positive_total += probs[:, 1].sum().item()
        # Sum-then-divide keeps the result equal to the mean over all
        # texts even when the last chunk is smaller than the others.
        return positive_total / len(texts)
    except Exception as e:
        # Reported separately so inference errors are not mislabeled as
        # model-loading failures.
        print(f"情感分析失败: {str(e)}")
        return neutral_score
|
2025-03-29 14:02:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Example input: danmaku CSV exported for a single video.
    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"

    # Load the comments, then score them only if any were found.
    comments = load_data(file_path)
    if not comments:
        print("未找到有效弹幕数据")
    else:
        score = analyze_sentiment(comments)
        print(f"B站弹幕情感评分：{score:.4f}")
|