From 3a996704a160818ce900ef40b7ba0efbf7e5fc04 Mon Sep 17 00:00:00 2001
From: Bairly <2652270566@qq.com>
Date: Sat, 29 Mar 2025 15:41:31 +0800
Subject: [PATCH] Add per-partition danmaku sentiment scoring and more robust model loading

---
 弹幕情感评分-字典法.py |  65 +++++++++++----
 弹幕情感评分.py        | 143 ++++++++++++++++++++++++--------
 2 files changed, 157 insertions(+), 51 deletions(-)

diff --git a/弹幕情感评分-字典法.py b/弹幕情感评分-字典法.py
index beb51d7..0af89eb 100644
--- a/弹幕情感评分-字典法.py
+++ b/弹幕情感评分-字典法.py
@@ -1,6 +1,7 @@
 import pandas as pd
+import numpy as np
 from snownlp import SnowNLP
-import matplotlib.pyplot as plt
+import os
 
 def load_data(file_path):
     try:
@@ -10,21 +11,53 @@ def load_data(file_path):
         print(f"Failed to load data: {str(e)}")
         return []
 
-# Example file path
-file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
-# Run the analysis
-danmu_texts = load_data(file_path)
+def analyze_sentiment(danmu_texts):
+    """Bucket each danmaku as positive/negative/neutral and return the mean score."""
+    emotions = {'positive': 0, 'negative': 0, 'neutral': 0}
+    sentiment_scores = []
 
-emotions = {'positive': 0,
-            'negative': 0,
-            'neutral': 0}
+    for item in danmu_texts:
+        # SnowNLP's sentiments is the probability (0-1) that the text is positive
+        s = SnowNLP(item)
+        score = s.sentiments
+        sentiment_scores.append(score)
+        if score > 0.6:
+            emotions['positive'] += 1
+        elif score < 0.4:
+            emotions['negative'] += 1
+        else:
+            emotions['neutral'] += 1
 
-for item in danmu_texts:
-    s = SnowNLP(item)
-    if s.sentiments > 0.6:
-        emotions['positive'] += 1
-    elif s.sentiments < 0.4:
-        emotions['negative'] += 1
-    else:
-        emotions['neutral'] += 1
\ No newline at end of file
+    avg_score = np.mean(sentiment_scores)
+    return emotions, avg_score
+
+
+def process_partition(partition_path):
+    """Score every video listed in a partition's info.csv and write the column back."""
+    info_file = os.path.join(partition_path, 'info.csv')
+    if not os.path.exists(info_file):
+        print(f"info file not found: {info_file}")
+        return
+
+    info_df = pd.read_csv(info_file, encoding='utf-8')
+    scores = []
+
+    for bv in info_df['BV号']:
+        danmu_file = os.path.join(partition_path, bv, f"{bv}_273_danmaku.csv")
+        if not os.path.exists(danmu_file):
+            scores.append(None)
+            continue
+
+        danmu_texts = load_data(danmu_file)
+        if not danmu_texts:
+            scores.append(None)
+            continue
+
+        _, avg_score = analyze_sentiment(danmu_texts)
+        scores.append(avg_score)
+
+    info_df['情感评分'] = scores
+    info_df.to_csv(info_file, index=False, encoding='utf-8')
+
+
+# Usage example: process the GMV partition
+partition_path = "hot_data/GMV"
+process_partition(partition_path)
\ No newline at end of file
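
Note on the dictionary method: SnowNLP's sentiments score is the probability that a text is positive, and the 0.6/0.4 cut-offs above turn that score into positive/negative/neutral buckets. A minimal standalone sketch of the same mapping (the sample strings are invented for illustration; only snownlp is required):

    from snownlp import SnowNLP

    # Same thresholds as analyze_sentiment() in the patch above
    def bucket(score, hi=0.6, lo=0.4):
        if score > hi:
            return 'positive'
        if score < lo:
            return 'negative'
        return 'neutral'

    for text in ["这个视频太棒了", "还行吧", "浪费时间"]:
        score = SnowNLP(text).sentiments  # P(positive), in [0, 1]
        print(f"{text}: {score:.3f} -> {bucket(score)}")
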
torch.device("cuda" if torch.cuda.is_available() else "cpu") + + try: + # 优先尝试加载本地模型 + local_path = Path(model_path) + if local_path.exists(): + self.tokenizer = AutoTokenizer.from_pretrained(local_path) + self.model = AutoModelForSequenceClassification.from_pretrained(local_path) + else: + # 从HuggingFace加载(使用国内镜像) + self.tokenizer = AutoTokenizer.from_pretrained(model_path, + mirror="https://hf-mirror.com") + self.model = AutoModelForSequenceClassification.from_pretrained(model_path, + mirror="https://hf-mirror.com") + + self.model = self.model.to(self.device) + self.model.eval() + print(f"成功加载模型: {model_path}") + + except Exception as e: + raise RuntimeError(f"模型加载失败: {str(e)}") + + def analyze(self, texts: list, batch_size: int = 32) -> list: + """ + 批量情感分析 + :param texts: 待分析文本列表 + :param batch_size: 批处理大小 + :return: 情感概率列表(0-1之间) + """ + if not texts: + return [] + + # 自动调整批大小防止内存溢出 + mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9 + safe_batch_size = min(batch_size, max(1, int(mem // 1e7))) + + all_probs = [] + for i in range(0, len(texts), safe_batch_size): + batch = texts[i:i + safe_batch_size] + + try: + inputs = self.tokenizer( + batch, + padding=True, + truncation=True, + max_length=128, + return_tensors="pt" + ).to(self.device) + + with torch.no_grad(): + outputs = self.model(**inputs) + + probs = torch.softmax(outputs.logits, dim=1)[:, 1] + all_probs.extend(probs.cpu().numpy().tolist()) + + except RuntimeError as e: + if "CUDA out of memory" in str(e): + safe_batch_size = max(1, safe_batch_size // 2) + print(f"检测到显存不足,调整批大小为: {safe_batch_size}") + continue + raise + + return all_probs +# ----------------- 使用示例 ----------------- if __name__ == "__main__": - # 示例文件路径 - file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv" + # 初始化分析器(自动选择本地/在线模型) + analyzer = SentimentAnalyzer("./local_models/sentiment") # 优先尝试本地模型 - # 执行分析 - danmu_texts = load_data(file_path) - if danmu_texts: - final_score = analyze_sentiment(danmu_texts) - print(f"B站弹幕情感评分:{final_score:.4f}") - else: - print("未找到有效弹幕数据") \ No newline at end of file + # 测试数据 + test_texts = [ + "这个视频真的太棒了!", + "完全看不懂在讲什么", + "浪费时间,不建议观看", + "画面精美,内容有深度" + ] + + # 获取情感概率 + scores = analyzer.analyze(test_texts) + + # 输出结果 + for text, score in zip(test_texts, scores): + print(f"「{text}」=> 积极概率: {score:.4f}") + + + # 从CSV文件读取弹幕 + def load_danmu(file_path: str) -> list: + try: + df = pd.read_csv(file_path) + return df['弹幕内容'].dropna().astype(str).tolist() + except Exception as e: + print(f"文件读取失败: {str(e)}") + return [] + + + # 实际使用 + # 文件路径 + file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv" + danmu_list = load_danmu(file_path) + if danmu_list: + danmu_scores = analyzer.analyze(danmu_list) + avg_score = sum(danmu_scores) / len(danmu_scores) + print(f"\n弹幕平均情感评分:{avg_score:.4f}") \ No newline at end of file