# statistics_model2025/BERT_danmu_sentiment_analyzer.py

import pandas as pd
import torch
# <<<<<<< HEAD  -- NOTE(review): leftover merge-conflict marker, commented out; the HEAD side follows.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path


class SentimentAnalyzer:
    """Batch sentiment scorer built on a sequence-classification model.

    The tokenizer and model are loaded once in ``__init__``, moved to CUDA
    when it is available (CPU otherwise) and switched to eval mode.
    ``analyze`` returns, for every input text, the softmax probability of
    class index 1.
    """

    def __init__(self, model_path: str = "uer/roberta-base-finetuned-dianping-chinese"):
        """
        Load the tokenizer and the classification model.

        :param model_path: local model directory or HuggingFace model name
        :raises RuntimeError: if loading fails; the original exception is
            chained as ``__cause__``
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        try:
            # Prefer a local copy when the path exists on disk.
            local_path = Path(model_path)
            if local_path.exists():
                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(local_path)
            else:
                # Otherwise load by name from the hub through a mirror.
                # NOTE(review): confirm the installed transformers version
                # still honours the `mirror` keyword.
                self.tokenizer = AutoTokenizer.from_pretrained(model_path,
                                                               mirror="https://hf-mirror.com")
                self.model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                                                mirror="https://hf-mirror.com")

            self.model = self.model.to(self.device)
            self.model.eval()
            print(f"成功加载模型: {model_path}")

        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"模型加载失败: {str(e)}") from e

    def analyze(self, texts: list, batch_size: int = 32) -> list:
        """
        Score texts in batches.

        The result always has one entry per input text, in input order. When
        a batch fails with a CUDA out-of-memory error the batch size is
        halved and the same batch is retried, so no text is skipped.

        :param texts: list of texts to score
        :param batch_size: upper bound for the batch size (values below 1
            are treated as 1)
        :return: list of floats in [0, 1], the softmax probability of class
            index 1 for each text
        :raises RuntimeError: for any non-OOM runtime error, or when an
            out-of-memory error occurs even with a batch size of 1
        """
        if not texts:
            return []

        # Cap the batch size from the free CUDA memory (one sample per 1e7
        # free bytes); a fixed 2e9 budget is assumed when CUDA is absent.
        mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9
        safe_batch_size = max(1, min(batch_size, int(mem // 1e7)))

        all_probs = []
        start = 0
        # Index-driven loop: the position only advances after a batch has
        # succeeded, so a retry after OOM re-processes the same texts.
        while start < len(texts):
            batch = texts[start:start + safe_batch_size]

            try:
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt"
                ).to(self.device)

                with torch.no_grad():
                    outputs = self.model(**inputs)

                probs = torch.softmax(outputs.logits, dim=1)[:, 1]

            except RuntimeError as e:
                # Only OOM is recoverable, and only while the batch can shrink.
                if "CUDA out of memory" not in str(e) or safe_batch_size == 1:
                    raise
                safe_batch_size = max(1, safe_batch_size // 2)
                print(f"检测到显存不足，调整批大小为: {safe_batch_size}")
                if torch.cuda.is_available():
                    # Release cached blocks before retrying the smaller batch.
                    torch.cuda.empty_cache()
                continue

            all_probs.extend(probs.cpu().tolist())
            start += len(batch)

        return all_probs
# =======  -- NOTE(review): leftover merge-conflict separator, commented out; the incoming side follows.
from transformers import AutoModelForSequenceClassification, AutoTokenizer  # 修改为从transformers导入


def load_data(file_path):
    """Read the '弹幕内容' column of a CSV file as a list of strings.

    Only that column is parsed (python engine). Missing values are dropped
    and the remaining values are converted to ``str``. Any failure is
    printed and an empty list is returned instead of raising.
    """
    column = '弹幕内容'
    try:
        frame = pd.read_csv(file_path, usecols=[column], engine='python')
        non_null = frame[column].dropna()
        return non_null.astype(str).tolist()
    except Exception as e:
        print(f"数据加载失败: {str(e)}")
        return []


def analyze_sentiment(texts):
    """Return the mean class-1 softmax probability over ``texts``.

    The tokenizer and model for the hard-coded checkpoint are loaded on
    every call, and all texts are encoded and scored as one batch. Class
    index 1 is assumed to be the positive label. If anything in the
    pipeline raises, the error is printed and the neutral score 0.5 is
    returned.
    """
    checkpoint = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
    try:
        tok = AutoTokenizer.from_pretrained(checkpoint)
        classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)

        # Encode everything at once, padded/truncated to at most 128 tokens.
        encoded = tok(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            logits = classifier(**encoded).logits

        # Column 1 of the softmax is taken as the positive-class probability.
        positive = torch.softmax(logits, dim=1)[:, 1]
        return positive.mean().item()

    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        # Neutral default when scoring is not possible.
        return 0.5
# >>>>>>> 0618b31b4ac22dbf912798c562bc80043e8e91b8  -- NOTE(review): leftover merge-conflict marker, commented out.


# ----------------- Usage example -----------------
if __name__ == "__main__":
    # Build the analyzer from the local model directory path.
    analyzer = SentimentAnalyzer("./local_models/sentiment")

    # A few sample sentences for a quick smoke test.
    test_texts = [
        "这个视频真的太棒了！",
        "完全看不懂在讲什么",
        "浪费时间，不建议观看",
        "画面精美，内容有深度",
    ]

    # One probability per sample sentence.
    scores = analyzer.analyze(test_texts)

    # Print each sentence next to its score.
    for text, score in zip(test_texts, scores):
        print(f"「{text}」=> 积极概率: {score:.4f}")

    def load_danmu(file_path: str) -> list:
        """Return the non-null '弹幕内容' values of a CSV as strings, or [] on failure."""
        try:
            column = pd.read_csv(file_path)['弹幕内容']
            return column.dropna().astype(str).tolist()
        except Exception as e:
            print(f"文件读取失败: {str(e)}")
            return []

    # Real run: score every danmaku in the CSV and report the average.
    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
    danmu_list = load_danmu(file_path)
    if danmu_list:
        danmu_scores = analyzer.analyze(danmu_list)
        avg_score = sum(danmu_scores) / len(danmu_scores)
        print(f"\n弹幕平均情感评分：{avg_score:.4f}")