fix:修复了模型情感分析

This commit is contained in:
Sheyiyuan 2025-03-29 16:18:20 +08:00
parent 7153f7a097
commit 28a9620879

View File

@ -1,81 +1,7 @@
# 修改导入部分
from transformers import AutoModelForSequenceClassification, AutoTokenizer # 替换为 transformers 库
import pandas as pd
import torch
<<<<<<< HEAD
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path
class SentimentAnalyzer:
    """Batch sentiment scorer built on a sequence-classification model.

    The tokenizer and model are loaded once in ``__init__`` and moved to CUDA
    when ``torch.cuda.is_available()`` is true, otherwise they stay on the CPU.
    """

    def __init__(self, model_path: str = "uer/roberta-base-finetuned-dianping-chinese"):
        """Load the tokenizer and the classification model.

        :param model_path: local model directory or HuggingFace model name
        :raises RuntimeError: if loading fails for any reason; the underlying
            exception is chained as ``__cause__``
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        try:
            # Prefer a local copy when the given path exists on disk.
            local_path = Path(model_path)
            if local_path.exists():
                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(local_path)
            else:
                # Otherwise treat the argument as a hub model name and pass the mirror URL.
                # NOTE(review): confirm the installed transformers version still
                # honours the `mirror` keyword.
                self.tokenizer = AutoTokenizer.from_pretrained(
                    model_path, mirror="https://hf-mirror.com")
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    model_path, mirror="https://hf-mirror.com")
            self.model = self.model.to(self.device)
            self.model.eval()
            print(f"成功加载模型: {model_path}")
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"模型加载失败: {str(e)}") from e

    def analyze(self, texts: list, batch_size: int = 32) -> list:
        """Score texts in batches.

        :param texts: list of texts to analyse
        :param batch_size: upper bound on the batch size (values below 1 are
            treated as 1)
        :return: one float per input text, in input order: the softmax
            probability of class index 1
        :raises RuntimeError: any runtime error from the tokenizer/model that
            is not a recoverable CUDA out-of-memory condition, or an
            out-of-memory error that persists at batch size 1
        """
        if not texts:
            return []

        # Cap the batch size from free GPU memory (2e9 is the CPU fallback
        # budget); never let it drop below 1 so the loop always advances.
        mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9
        safe_batch_size = max(1, min(batch_size, max(1, int(mem // 1e7))))

        all_probs = []
        start = 0
        # Index-driven loop: after an out-of-memory error the SAME offset is
        # retried with a smaller batch, so no text is skipped and the output
        # stays aligned with the input.
        while start < len(texts):
            batch = texts[start:start + safe_batch_size]
            try:
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt"
                ).to(self.device)
                with torch.no_grad():
                    outputs = self.model(**inputs)
                probs = torch.softmax(outputs.logits, dim=1)[:, 1]
            except RuntimeError as e:
                # Only shrink while there is room to shrink; at batch size 1
                # the error is re-raised instead of looping forever.
                if "CUDA out of memory" in str(e) and safe_batch_size > 1:
                    safe_batch_size = max(1, safe_batch_size // 2)
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    print(f"检测到显存不足,调整批大小为: {safe_batch_size}")
                    continue
                raise
            all_probs.extend(probs.cpu().tolist())
            start += len(batch)
        return all_probs
=======
from transformers import AutoModelForSequenceClassification, AutoTokenizer # 修改为从transformers导入
def load_data(file_path):
@ -90,9 +16,9 @@ def load_data(file_path):
def analyze_sentiment(texts):
"""改进的情感分析函数"""
# 使用新的模型配置
model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
try:
# 使用 HuggingFace 的模型
model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
@ -103,50 +29,21 @@ def analyze_sentiment(texts):
# 调整概率计算方式
probs = torch.softmax(outputs.logits, dim=1)
return probs[:, 1].mean().item() # 假设正例在位置1
return probs[:, 1].mean().item()
except Exception as e:
print(f"模型加载失败: {str(e)}")
return 0.5 # 返回中性评分作为默认值
>>>>>>> 0618b31b4ac22dbf912798c562bc80043e8e91b8
print(f"情感分析失败: {str(e)}")
return 0.5 # 错误时返回中性值
# ----------------- Usage example -----------------
if __name__ == "__main__":
# Create the analyzer; the constructor loads from this path when it exists locally
analyzer = SentimentAnalyzer("./local_models/sentiment") # try the local model first
# Sample inputs
test_texts = [
"这个视频真的太棒了!",
"完全看不懂在讲什么",
"浪费时间,不建议观看",
"画面精美,内容有深度"
]
# Get one sentiment probability per text
scores = analyzer.analyze(test_texts)
# Print each text next to its score
for text, score in zip(test_texts, scores):
print(f"{text}」=> 积极概率: {score:.4f}")
# 从CSV文件读取弹幕
def load_danmu(file_path: str) -> list:
    """Read danmaku comments from a CSV file.

    Takes the ``弹幕内容`` column, drops missing entries and returns the
    remaining values as a list of strings. Any failure while reading or
    converting is printed to stdout and an empty list is returned instead.

    :param file_path: path of the CSV file to read
    :return: list of comment strings, or ``[]`` on failure
    """
    try:
        frame = pd.read_csv(file_path)
        present = frame['弹幕内容'].dropna()
        comments = present.astype(str).tolist()
    except Exception as err:
        print(f"文件读取失败: {str(err)}")
        comments = []
    return comments
# Actual usage
# NOTE(review): the statements below appear to interleave two revisions of the
# script (load_danmu/analyzer.analyze vs. load_data/analyze_sentiment) — confirm
# which pipeline is meant to remain.
# File path
# Example file path
file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
danmu_list = load_danmu(file_path)
if danmu_list:
danmu_scores = analyzer.analyze(danmu_list)
avg_score = sum(danmu_scores) / len(danmu_scores)
print(f"\n弹幕平均情感评分:{avg_score:.4f}")
# Run the analysis
danmu_texts = load_data(file_path)
if danmu_texts:
final_score = analyze_sentiment(danmu_texts)
print(f"B站弹幕情感评分{final_score:.4f}")
else:
print("未找到有效弹幕数据")