fix:修复了模型情感分析

2025-03-29 16:18:20 +08:00 · 2025-03-29 16:18:20 +08:00 · 28a9620879
commit 28a9620879
parent 7153f7a097
1 changed file with 16 additions and 119 deletions
--- a/BERT_danmu_sentiment_analyzer.py
+++ b/BERT_danmu_sentiment_analyzer.py
@@ -1,81 +1,7 @@
+# 修改导入部分
+from transformers import AutoModelForSequenceClassification, AutoTokenizer  # 替换为 transformers 库
 import pandas as pd
 import torch
-<<<<<<< HEAD
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from pathlib import Path
-
-
-class SentimentAnalyzer:
-    def __init__(self, model_path: str = "uer/roberta-base-finetuned-dianping-chinese"):
-        """
-        初始化情感分析模型
-        :param model_path: 本地模型路径或HuggingFace模型名称
-        """
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        try:
-            # 优先尝试加载本地模型
-            local_path = Path(model_path)
-            if local_path.exists():
-                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
-                self.model = AutoModelForSequenceClassification.from_pretrained(local_path)
-            else:
-                # 从HuggingFace加载（使用国内镜像）
-                self.tokenizer = AutoTokenizer.from_pretrained(model_path,
-                                                               mirror="https://hf-mirror.com")
-                self.model = AutoModelForSequenceClassification.from_pretrained(model_path,
-                                                                                mirror="https://hf-mirror.com")
-
-            self.model = self.model.to(self.device)
-            self.model.eval()
-            print(f"成功加载模型: {model_path}")
-
-        except Exception as e:
-            raise RuntimeError(f"模型加载失败: {str(e)}")
-
-    def analyze(self, texts: list, batch_size: int = 32) -> list:
-        """
-        批量情感分析
-        :param texts: 待分析文本列表
-        :param batch_size: 批处理大小
-        :return: 情感概率列表（0-1之间）
-        """
-        if not texts:
-            return []
-
-        # 自动调整批大小防止内存溢出
-        mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9
-        safe_batch_size = min(batch_size, max(1, int(mem // 1e7)))
-
-        all_probs = []
-        for i in range(0, len(texts), safe_batch_size):
-            batch = texts[i:i + safe_batch_size]
-
-            try:
-                inputs = self.tokenizer(
-                    batch,
-                    padding=True,
-                    truncation=True,
-                    max_length=128,
-                    return_tensors="pt"
-                ).to(self.device)
-
-                with torch.no_grad():
-                    outputs = self.model(**inputs)
-
-                probs = torch.softmax(outputs.logits, dim=1)[:, 1]
-                all_probs.extend(probs.cpu().numpy().tolist())
-
-            except RuntimeError as e:
-                if "CUDA out of memory" in str(e):
-                    safe_batch_size = max(1, safe_batch_size // 2)
-                    print(f"检测到显存不足，调整批大小为: {safe_batch_size}")
-                    continue
-                raise
-
-        return all_probs
-=======
-from transformers import AutoModelForSequenceClassification, AutoTokenizer  # 修改为从transformers导入


 def load_data(file_path):
@@ -90,9 +16,9 @@ def load_data(file_path):

 def analyze_sentiment(texts):
    """改进的情感分析函数"""
-    # 使用新的模型配置
-    model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
    try:
+        # 使用 HuggingFace 的模型
+        model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)

@@ -103,50 +29,21 @@ def analyze_sentiment(texts):

        # 调整概率计算方式
        probs = torch.softmax(outputs.logits, dim=1)
-        return probs[:, 1].mean().item()  # 假设正例在位置1
+        return probs[:, 1].mean().item()

    except Exception as e:
-        print(f"模型加载失败: {str(e)}")
-        return 0.5  # 返回中性评分作为默认值
->>>>>>> 0618b31b4ac22dbf912798c562bc80043e8e91b8
+        print(f"情感分析失败: {str(e)}")
+        return 0.5  # 错误时返回中性值


-# ----------------- 使用示例 -----------------
 if __name__ == "__main__":
-    # 初始化分析器（自动选择本地/在线模型）
-    analyzer = SentimentAnalyzer("./local_models/sentiment")  # 优先尝试本地模型
-
-    # 测试数据
-    test_texts = [
-        "这个视频真的太棒了！",
-        "完全看不懂在讲什么",
-        "浪费时间，不建议观看",
-        "画面精美，内容有深度"
-    ]
-
-    # 获取情感概率
-    scores = analyzer.analyze(test_texts)
-
-    # 输出结果
-    for text, score in zip(test_texts, scores):
-        print(f"「{text}」=> 积极概率: {score:.4f}")
-
-
-    # 从CSV文件读取弹幕
-    def load_danmu(file_path: str) -> list:
-        try:
-            df = pd.read_csv(file_path)
-            return df['弹幕内容'].dropna().astype(str).tolist()
-        except Exception as e:
-            print(f"文件读取失败: {str(e)}")
-            return []
-
-
-    # 实际使用
-    # 文件路径
+    # 示例文件路径
    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
-    danmu_list = load_danmu(file_path)
-    if danmu_list:
-        danmu_scores = analyzer.analyze(danmu_list)
-        avg_score = sum(danmu_scores) / len(danmu_scores)
-        print(f"\n弹幕平均情感评分：{avg_score:.4f}")
+
+    # 执行分析
+    danmu_texts = load_data(file_path)
+    if danmu_texts:
+        final_score = analyze_sentiment(danmu_texts)
+        print(f"B站弹幕情感评分：{final_score:.4f}")
+    else:
+        print("未找到有效弹幕数据")