"""Sentiment scoring for Chinese text (e.g. bullet-screen comments) with HuggingFace models.

The module exposes:

* ``SentimentAnalyzer`` - loads a sequence-classification model once and scores
  texts in batches.
* ``load_data`` / ``load_danmu`` - read the ``弹幕内容`` column of a CSV file.
* ``analyze_sentiment`` - one-shot helper returning the mean positive
  probability of a list of texts.
"""

from pathlib import Path

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class SentimentAnalyzer:
    """Batch sentiment scorer built on a sequence-classification model."""

    def __init__(self, model_path: str = "uer/roberta-base-finetuned-dianping-chinese"):
        """Load the tokenizer and model and move the model to the best device.

        :param model_path: Local model directory or HuggingFace model name.
        :raises RuntimeError: If the tokenizer or model cannot be loaded; the
            original exception is attached as ``__cause__``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        try:
            # Prefer a local copy of the model when the path exists on disk.
            local_path = Path(model_path)
            if local_path.exists():
                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(local_path)
            else:
                # Otherwise download by name through the mirror endpoint.
                # NOTE(review): whether ``mirror=`` is still accepted depends on
                # the installed transformers version - confirm, or configure the
                # endpoint through the environment instead.
                self.tokenizer = AutoTokenizer.from_pretrained(
                    model_path, mirror="https://hf-mirror.com"
                )
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    model_path, mirror="https://hf-mirror.com"
                )

            self.model = self.model.to(self.device)
            self.model.eval()
            print(f"成功加载模型: {model_path}")
        except Exception as e:
            raise RuntimeError(f"模型加载失败: {str(e)}") from e

    def analyze(self, texts: list, batch_size: int = 32) -> list:
        """Score texts in batches and return one probability per input text.

        :param texts: Texts to score.
        :param batch_size: Upper bound on the number of texts per forward pass.
        :return: Softmax probability of class index 1 for every text, in the
            same order as ``texts``; an empty list for empty input.
        :raises RuntimeError: Re-raised from the model when it is not a CUDA
            out-of-memory error, or when out-of-memory persists at batch size 1.
        """
        if not texts:
            return []

        # Cap the batch size from free GPU memory (one text per 1e7 bytes);
        # without CUDA a fixed 2e9-byte budget is assumed. Always at least 1.
        mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9
        safe_batch_size = max(1, min(batch_size, int(mem // 1e7)))

        all_probs = []
        start = 0
        total = len(texts)

        # An explicit index (instead of range()) lets a failed batch be retried
        # at the same offset with a smaller size, so no text is ever skipped and
        # the output stays aligned with the input.
        while start < total:
            batch = texts[start:start + safe_batch_size]

            try:
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt",
                ).to(self.device)

                with torch.no_grad():
                    outputs = self.model(**inputs)

                probs = torch.softmax(outputs.logits, dim=1)[:, 1]
                all_probs.extend(probs.cpu().numpy().tolist())
            except RuntimeError as e:
                # Only out-of-memory is recoverable, and only while the batch
                # can still shrink; anything else propagates to the caller.
                if "CUDA out of memory" not in str(e) or safe_batch_size == 1:
                    raise
                safe_batch_size = max(1, safe_batch_size // 2)
                print(f"检测到显存不足,调整批大小为: {safe_batch_size}")
                if torch.cuda.is_available():
                    # Release cached blocks before retrying the same offset.
                    torch.cuda.empty_cache()
                continue

            start += len(batch)

        return all_probs


def load_data(file_path):
    """Read the ``弹幕内容`` column of a CSV file as a list of strings.

    Only that column is parsed, using pandas' Python engine. Missing values
    are dropped.

    :param file_path: Path of the CSV file.
    :return: Column values as strings, or an empty list when reading fails
        (the error is printed, not raised).
    """
    try:
        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
        return df['弹幕内容'].dropna().astype(str).tolist()
    except Exception as e:
        print(f"数据加载失败: {str(e)}")
        return []


def analyze_sentiment(texts):
    """Return the mean class-1 probability over ``texts`` in a single pass.

    The model ``IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment`` is loaded on every
    call and all texts are scored in one batch.

    :param texts: Texts to score.
    :return: Mean softmax probability at class index 1 (assumed to be the
        positive class), or the neutral value 0.5 when ``texts`` is empty or
        loading/inference fails (the error is printed, not raised).
    """
    if not texts:
        # Nothing to score: skip the model download and return the neutral default.
        return 0.5

    model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Tokenize everything at once so a single forward pass covers all texts.
        inputs = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.softmax(outputs.logits, dim=1)
        return probs[:, 1].mean().item()  # assumes the positive class is index 1
    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        return 0.5  # neutral score as the fallback


def load_danmu(file_path: str) -> list:
    """Read bullet-screen comments from the ``弹幕内容`` column of a CSV file.

    :param file_path: Path of the CSV file.
    :return: Non-missing column values as strings, or an empty list when
        reading fails (the error is printed, not raised).
    """
    try:
        df = pd.read_csv(file_path)
        return df['弹幕内容'].dropna().astype(str).tolist()
    except Exception as e:
        print(f"文件读取失败: {str(e)}")
        return []


def main():
    """Run the usage example: score sample sentences, then a comment CSV file."""
    # The local directory is tried first; the name is resolved online otherwise.
    analyzer = SentimentAnalyzer("./local_models/sentiment")

    test_texts = [
        "这个视频真的太棒了!",
        "完全看不懂在讲什么",
        "浪费时间,不建议观看",
        "画面精美,内容有深度",
    ]

    scores = analyzer.analyze(test_texts)

    for text, score in zip(test_texts, scores):
        print(f"「{text}」=> 积极概率: {score:.4f}")

    # Score the comments of one video and report the average.
    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
    danmu_list = load_danmu(file_path)

    if danmu_list:
        danmu_scores = analyzer.analyze(danmu_list)
        avg_score = sum(danmu_scores) / len(danmu_scores)
        print(f"\n弹幕平均情感评分:{avg_score:.4f}")


# ----------------- Usage example -----------------
if __name__ == "__main__":
    main()