statistics_model2025/BERT_danmu_sentiment_analyzer.py
2025-03-29 15:46:52 +08:00

152 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import torch
# NOTE: unresolved merge conflict, "HEAD" side starts below (original marker: <<<<<<< HEAD)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path
class SentimentAnalyzer:
    """Score texts with a sequence-classification sentiment model.

    The tokenizer and model are loaded once in ``__init__`` and reused by
    :meth:`analyze`, which returns one probability per input text.
    """

    def __init__(self, model_path: str = "uer/roberta-base-finetuned-dianping-chinese"):
        """Load the tokenizer and model and move the model to the device.

        :param model_path: local model directory or HuggingFace model name
        :raises RuntimeError: if the tokenizer or model cannot be loaded;
            the underlying exception is attached as ``__cause__``
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        try:
            # Prefer a local copy when the given path exists on disk.
            local_path = Path(model_path)
            if local_path.exists():
                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(local_path)
            else:
                # Otherwise download from HuggingFace, requesting a China mirror.
                # NOTE(review): the `mirror` keyword is kept unchanged here, but
                # recent transformers releases appear to ignore it; exporting
                # HF_ENDPOINT before the import may be required -- confirm.
                self.tokenizer = AutoTokenizer.from_pretrained(
                    model_path, mirror="https://hf-mirror.com")
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    model_path, mirror="https://hf-mirror.com")
            self.model = self.model.to(self.device)
            self.model.eval()
            print(f"成功加载模型: {model_path}")
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"模型加载失败: {str(e)}") from e

    def analyze(self, texts: list, batch_size: int = 32) -> list:
        """Run batched sentiment analysis.

        :param texts: list of texts to score
        :param batch_size: upper bound on the number of texts per forward pass
        :return: one float per input text, in input order: the softmax
            probability of class index 1 (treated as the positive class)
        :raises ValueError: if ``batch_size`` is smaller than 1
        :raises RuntimeError: re-raised from the model when it is not a CUDA
            out-of-memory error, or when it is one and the batch size is
            already 1
        """
        if not texts:
            return []
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Cap the batch size by free memory: one item per 1e7 bytes free on
        # the GPU, with a fixed 2e9-byte assumption when running on CPU.
        mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9
        safe_batch_size = min(batch_size, max(1, int(mem // 1e7)))

        all_probs = []
        start = 0
        # Index-driven loop: `start` only advances after a batch succeeds, so
        # a batch that hits OOM is retried with a smaller size instead of
        # being dropped from the result.
        while start < len(texts):
            batch = texts[start:start + safe_batch_size]
            try:
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt"
                ).to(self.device)
                with torch.no_grad():
                    outputs = self.model(**inputs)
                probs = torch.softmax(outputs.logits, dim=1)[:, 1]
            except RuntimeError as e:
                # Halve and retry on OOM; give up once a single item fails.
                if "CUDA out of memory" in str(e) and safe_batch_size > 1:
                    safe_batch_size = max(1, safe_batch_size // 2)
                    print(f"检测到显存不足,调整批大小为: {safe_batch_size}")
                    continue
                raise
            all_probs.extend(probs.cpu().tolist())
            start += len(batch)
        return all_probs
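# NOTE: unresolved merge conflict, "HEAD" side ends above and the incoming side starts below (original marker: =======)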
from transformers import AutoModelForSequenceClassification, AutoTokenizer # 修改为从transformers导入
def load_data(file_path):
    """Load the danmaku text column from a CSV file.

    Only the ``弹幕内容`` column is read (using pandas' python engine).
    Null entries are dropped and the remaining values are converted to
    strings.

    :param file_path: path of the CSV file to read
    :return: list of comment strings; an empty list if anything goes wrong
        while reading (the error is printed, not raised)
    """
    column = '弹幕内容'
    try:
        frame = pd.read_csv(file_path, usecols=[column], engine='python')
        comments = frame[column].dropna()
        return comments.astype(str).tolist()
    except Exception as e:
        # Best-effort loader: report the failure and hand back no data.
        print(f"数据加载失败: {str(e)}")
        return []
def analyze_sentiment(texts):
    """Return the mean class-1 probability over all given texts.

    The tokenizer and model for the hard-coded checkpoint are loaded on
    every call, and all texts are tokenized and scored in a single batch.

    :param texts: texts to score
    :return: mean softmax probability of class index 1 (assumed to be the
        positive class); ``0.5`` as a neutral default if any step fails
        (the error is printed, not raised)
    """
    checkpoint = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
    try:
        tok = AutoTokenizer.from_pretrained(checkpoint)
        classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        # One padded/truncated batch for the whole input.
        encoded = tok(
            texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        with torch.no_grad():
            logits = classifier(**encoded).logits
        # Column 1 is taken as the positive class.
        positive = torch.softmax(logits, dim=1)[:, 1]
        return positive.mean().item()
    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        return 0.5
# NOTE: unresolved merge conflict, incoming side ends above (original marker: >>>>>>> 0618b31b4ac22dbf912798c562bc80043e8e91b8)
# ----------------- Usage example -----------------
def load_danmu(file_path: str) -> list:
    """Read danmaku comments from a CSV file.

    :param file_path: path to a CSV file with a ``弹幕内容`` column
    :return: non-null values of that column as strings; an empty list if
        the file cannot be read or the column is missing (the error is
        printed, not raised)
    """
    try:
        df = pd.read_csv(file_path)
        return df['弹幕内容'].dropna().astype(str).tolist()
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip.
        print(f"文件读取失败: {str(e)}")
        return []


def main() -> None:
    """Score a few sample sentences, then a danmaku CSV, and print results."""
    # Build the analyzer; the local model directory is tried first.
    analyzer = SentimentAnalyzer("./local_models/sentiment")

    # Sample inputs.
    test_texts = [
        "这个视频真的太棒了!",
        "完全看不懂在讲什么",
        "浪费时间,不建议观看",
        "画面精美,内容有深度"
    ]

    # One probability per sample text.
    scores = analyzer.analyze(test_texts)
    for text, score in zip(test_texts, scores):
        # The opening bracket was missing, leaving an unbalanced quote mark.
        print(f"「{text}」=> 积极概率: {score:.4f}")

    # Real data: danmaku comments exported to CSV.
    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
    danmu_list = load_danmu(file_path)
    # Skip the average when nothing was loaded (avoids division by zero).
    if danmu_list:
        danmu_scores = analyzer.analyze(danmu_list)
        avg_score = sum(danmu_scores) / len(danmu_scores)
        print(f"\n弹幕平均情感评分:{avg_score:.4f}")


if __name__ == "__main__":
    main()