diff --git a/BERT_danmu_sentiment_analyzer.py b/BERT_danmu_sentiment_analyzer.py index 5d3e9db..1f78095 100644 --- a/BERT_danmu_sentiment_analyzer.py +++ b/BERT_danmu_sentiment_analyzer.py @@ -1,81 +1,7 @@ +# 修改导入部分 +from transformers import AutoModelForSequenceClassification, AutoTokenizer # 替换为 transformers 库 import pandas as pd import torch -<<<<<<< HEAD -from transformers import AutoTokenizer, AutoModelForSequenceClassification -from pathlib import Path - - -class SentimentAnalyzer: - def __init__(self, model_path: str = "uer/roberta-base-finetuned-dianping-chinese"): - """ - 初始化情感分析模型 - :param model_path: 本地模型路径或HuggingFace模型名称 - """ - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - try: - # 优先尝试加载本地模型 - local_path = Path(model_path) - if local_path.exists(): - self.tokenizer = AutoTokenizer.from_pretrained(local_path) - self.model = AutoModelForSequenceClassification.from_pretrained(local_path) - else: - # 从HuggingFace加载(使用国内镜像) - self.tokenizer = AutoTokenizer.from_pretrained(model_path, - mirror="https://hf-mirror.com") - self.model = AutoModelForSequenceClassification.from_pretrained(model_path, - mirror="https://hf-mirror.com") - - self.model = self.model.to(self.device) - self.model.eval() - print(f"成功加载模型: {model_path}") - - except Exception as e: - raise RuntimeError(f"模型加载失败: {str(e)}") - - def analyze(self, texts: list, batch_size: int = 32) -> list: - """ - 批量情感分析 - :param texts: 待分析文本列表 - :param batch_size: 批处理大小 - :return: 情感概率列表(0-1之间) - """ - if not texts: - return [] - - # 自动调整批大小防止内存溢出 - mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9 - safe_batch_size = min(batch_size, max(1, int(mem // 1e7))) - - all_probs = [] - for i in range(0, len(texts), safe_batch_size): - batch = texts[i:i + safe_batch_size] - - try: - inputs = self.tokenizer( - batch, - padding=True, - truncation=True, - max_length=128, - return_tensors="pt" - ).to(self.device) - - with torch.no_grad(): - outputs = self.model(**inputs) - - probs = torch.softmax(outputs.logits, dim=1)[:, 1] - all_probs.extend(probs.cpu().numpy().tolist()) - - except RuntimeError as e: - if "CUDA out of memory" in str(e): - safe_batch_size = max(1, safe_batch_size // 2) - print(f"检测到显存不足,调整批大小为: {safe_batch_size}") - continue - raise - - return all_probs -======= -from transformers import AutoModelForSequenceClassification, AutoTokenizer # 修改为从transformers导入 def load_data(file_path): @@ -90,9 +16,9 @@ def load_data(file_path): def analyze_sentiment(texts): """改进的情感分析函数""" - # 使用新的模型配置 - model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment" try: + # 使用 HuggingFace 的模型 + model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForSequenceClassification.from_pretrained(model_name) @@ -103,50 +29,21 @@ def analyze_sentiment(texts): # 调整概率计算方式 probs = torch.softmax(outputs.logits, dim=1) - return probs[:, 1].mean().item() # 假设正例在位置1 + return probs[:, 1].mean().item() except Exception as e: - print(f"模型加载失败: {str(e)}") - return 0.5 # 返回中性评分作为默认值 ->>>>>>> 0618b31b4ac22dbf912798c562bc80043e8e91b8 + print(f"情感分析失败: {str(e)}") + return 0.5 # 错误时返回中性值 -# ----------------- 使用示例 ----------------- if __name__ == "__main__": - # 初始化分析器(自动选择本地/在线模型) - analyzer = SentimentAnalyzer("./local_models/sentiment") # 优先尝试本地模型 - - # 测试数据 - test_texts = [ - "这个视频真的太棒了!", - "完全看不懂在讲什么", - "浪费时间,不建议观看", - "画面精美,内容有深度" - ] - - # 获取情感概率 - scores = analyzer.analyze(test_texts) - - # 输出结果 - for text, score in zip(test_texts, scores): - print(f"「{text}」=> 积极概率: {score:.4f}") - - - # 从CSV文件读取弹幕 - def load_danmu(file_path: str) -> list: - try: - df = pd.read_csv(file_path) - return df['弹幕内容'].dropna().astype(str).tolist() - except Exception as e: - print(f"文件读取失败: {str(e)}") - return [] - - - # 实际使用 - # 文件路径 + # 示例文件路径 file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv" - danmu_list = load_danmu(file_path) - if danmu_list: - danmu_scores = analyzer.analyze(danmu_list) - avg_score = sum(danmu_scores) / len(danmu_scores) - print(f"\n弹幕平均情感评分:{avg_score:.4f}") \ No newline at end of file + + # 执行分析 + danmu_texts = load_data(file_path) + if danmu_texts: + final_score = analyze_sentiment(danmu_texts) + print(f"B站弹幕情感评分:{final_score:.4f}") + else: + print("未找到有效弹幕数据") \ No newline at end of file