modify

2025-03-29 15:41:31 +08:00 · 2025-03-29 15:41:31 +08:00 · 3a996704a1
commit 3a996704a1
parent c94cfe4ab0
2 changed files with 157 additions and 51 deletions
--- a/弹幕情感评分-字典法.py
+++ b/弹幕情感评分-字典法.py
@ -1,6 +1,7 @@
 import pandas as pd
+import numpy as np
 from snownlp import SnowNLP
-import matplotlib.pyplot as plt
+import os

 def load_data(file_path):
    try:
@ -10,21 +11,53 @@ def load_data(file_path):
        print(f"数据加载失败: {str(e)}")
        return []

-# 示例文件路径
-file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"

-# 执行分析
-danmu_texts = load_data(file_path)
+def analyze_sentiment(danmu_texts, pos_threshold=0.6, neg_threshold=0.4):
+    """Score every danmaku with SnowNLP and summarise the results.
+
+    :param danmu_texts: iterable of danmaku strings to score
+    :param pos_threshold: scores strictly above this count as positive
+    :param neg_threshold: scores strictly below this count as negative
+    :return: tuple ``(emotions, avg_score)``; ``emotions`` maps 'positive',
+        'negative' and 'neutral' to counts, ``avg_score`` is the mean of all
+        scores, or NaN when no text was given
+    """
+    emotions = {'positive': 0, 'negative': 0, 'neutral': 0}
+    sentiment_scores = []

-emotions = {'positive': 0,
-            'negative': 0,
-            'neutral': 0}
+    for item in danmu_texts:
+        s = SnowNLP(item)
+        score = s.sentiments
+        sentiment_scores.append(score)
+        if score > pos_threshold:
+            emotions['positive'] += 1
+        elif score < neg_threshold:
+            emotions['negative'] += 1
+        else:
+            emotions['neutral'] += 1

-for item in danmu_texts:
-    s = SnowNLP(item)
-    if s.sentiments > 0.6:
-        emotions['positive'] += 1
-    elif s.sentiments < 0.4:
-        emotions['negative'] += 1
-    else:
-        emotions['neutral'] += 1
+    # np.mean([]) emits a RuntimeWarning before yielding NaN; return NaN
+    # explicitly so an empty input stays quiet.
+    if not sentiment_scores:
+        return emotions, float('nan')
+    avg_score = np.mean(sentiment_scores)
+    return emotions, avg_score
+
+
+def process_partition(partition_path):
+    """Add an average sentiment score column to a partition's info.csv.
+
+    Reads ``info.csv`` under ``partition_path``, looks up
+    ``<BV>/<BV>_273_danmaku.csv`` for every value in the 'BV号' column,
+    scores its danmaku, and writes the scores back to ``info.csv`` in the
+    '情感评分' column. Rows with a blank BV, a missing danmaku file or no
+    usable danmaku get an empty score.
+
+    :param partition_path: directory holding info.csv and one folder per video
+    :return: None
+    """
+    info_file = os.path.join(partition_path, 'info.csv')
+    if not os.path.exists(info_file):
+        print(f"未找到info文件: {info_file}")
+        return
+
+    info_df = pd.read_csv(info_file,encoding='utf-8')
+    scores = []
+
+    for bv in info_df['BV号']:
+        # A blank BV cell is read as NaN (a float); os.path.join would raise
+        # TypeError on it and abort the whole partition.
+        if pd.isna(bv):
+            scores.append(None)
+            continue
+
+        bv = str(bv).strip()
+        danmu_file = os.path.join(partition_path, bv, f"{bv}_273_danmaku.csv")
+        if not os.path.exists(danmu_file):
+            scores.append(None)
+            continue
+
+        danmu_texts = load_data(danmu_file)
+        if not danmu_texts:
+            scores.append(None)
+            continue
+
+        _, avg_score = analyze_sentiment(danmu_texts)
+        scores.append(avg_score)
+
+    # One entry was appended per row, so the list lines up with the frame.
+    info_df['情感评分'] = scores
+    info_df.to_csv(info_file, index=False)
+
+
+# Usage example - process the GMV partition
+# NOTE: there is no `if __name__ == "__main__"` guard here, so this call runs
+# whenever the module is executed or imported.
+partition_path = "hot_data/GMV"
+process_partition(partition_path)
--- a/弹幕情感评分.py
+++ b/弹幕情感评分.py
@ -1,43 +1,116 @@
 import pandas as pd
 import torch
-from modelscope import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from pathlib import Path


-def load_data(file_path):
-    """优化后的数据加载函数"""
-    try:
-        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
-        return df['弹幕内容'].dropna().astype(str).tolist()
-    except Exception as e:
-        print(f"数据加载失败: {str(e)}")
-        return []
-
-
-def analyze_sentiment(texts):
-    """改进的情感分析函数"""
-    # 使用模型配置
-    model_name = "damo/nlp_structbert_sentiment-classification_chinese-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
-    # 批量处理提升效率
-    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    # 调整概率计算方式
-    probs = torch.softmax(outputs.logits, dim=1)
-    return probs[:, 1].mean().item()
+class SentimentAnalyzer:
+    """Sentiment scorer backed by a sequence-classification model."""
+
+    def __init__(self, model_path: str = "uer/roberta-base-finetuned-dianping-chinese"):
+        """
+        Load the tokenizer and model and move the model to the active device.
+
+        :param model_path: local model directory or HuggingFace model name
+        :raises RuntimeError: if the tokenizer or the model cannot be loaded
+        """
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        try:
+            # Prefer a local copy when the path exists on disk
+            local_path = Path(model_path)
+            if local_path.exists():
+                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
+                self.model = AutoModelForSequenceClassification.from_pretrained(local_path)
+            else:
+                # Otherwise treat model_path as a hub name and request a mirror
+                # NOTE(review): recent transformers releases may ignore the
+                # `mirror` argument - confirm whether HF_ENDPOINT is needed.
+                self.tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                                               mirror="https://hf-mirror.com")
+                self.model = AutoModelForSequenceClassification.from_pretrained(model_path,
+                                                                                mirror="https://hf-mirror.com")
+
+            self.model = self.model.to(self.device)
+            self.model.eval()
+            print(f"成功加载模型: {model_path}")
+
+        except Exception as e:
+            # Chain the cause so the underlying error stays in the traceback
+            raise RuntimeError(f"模型加载失败: {str(e)}") from e
+
+    def analyze(self, texts: list, batch_size: int = 32) -> list:
+        """
+        Score texts in batches.
+
+        :param texts: list of texts to analyse
+        :param batch_size: upper bound on the number of texts per forward pass
+        :return: one softmax probability (0-1) of class index 1 per input
+            text, in input order
+        :raises ValueError: if batch_size is smaller than 1
+        :raises RuntimeError: if inference fails, including a CUDA
+            out-of-memory error that persists at a batch size of 1
+        """
+        if not texts:
+            return []
+        if batch_size < 1:
+            raise ValueError("batch_size must be >= 1")
+
+        # Cap the batch size by free device memory (heuristic: 1e7 bytes per text)
+        mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9
+        safe_batch_size = min(batch_size, max(1, int(mem // 1e7)))
+
+        all_probs = []
+        start = 0
+        # Index-driven loop instead of range(): a shrunken batch size takes
+        # effect at once and the failed batch is retried, so every text gets
+        # exactly one score and the output stays aligned with the input.
+        while start < len(texts):
+            batch = texts[start:start + safe_batch_size]
+
+            try:
+                inputs = self.tokenizer(
+                    batch,
+                    padding=True,
+                    truncation=True,
+                    max_length=128,
+                    return_tensors="pt"
+                ).to(self.device)
+
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+
+                probs = torch.softmax(outputs.logits, dim=1)[:, 1]
+
+            except RuntimeError as e:
+                # Only retry while the batch can still shrink; otherwise the
+                # error is re-raised rather than looping forever.
+                if "CUDA out of memory" in str(e) and safe_batch_size > 1:
+                    safe_batch_size = max(1, safe_batch_size // 2)
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+                    print(f"检测到显存不足，调整批大小为: {safe_batch_size}")
+                    continue
+                raise
+
+            all_probs.extend(probs.cpu().numpy().tolist())
+            start += len(batch)
+
+        return all_probs


+# ----------------- Usage example -----------------
 if __name__ == "__main__":
-    # 示例文件路径
-    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
+    # Initialise the analyzer (it loads from disk when the path exists)
+    # NOTE(review): if ./local_models/sentiment does not exist, this string is
+    # passed on as a hub model name rather than falling back to the default.
+    analyzer = SentimentAnalyzer("./local_models/sentiment")  # try the local model first

-    # 执行分析
-    danmu_texts = load_data(file_path)
-    if danmu_texts:
-        final_score = analyze_sentiment(danmu_texts)
-        print(f"B站弹幕情感评分：{final_score:.4f}")
-    else:
-        print("未找到有效弹幕数据")
+    # Test data
+    test_texts = [
+        "这个视频真的太棒了！",
+        "完全看不懂在讲什么",
+        "浪费时间，不建议观看",
+        "画面精美，内容有深度"
+    ]
+
+    # Get the sentiment probabilities
+    scores = analyzer.analyze(test_texts)
+
+    # Print the results
+    for text, score in zip(test_texts, scores):
+        print(f"「{text}」=> 积极概率: {score:.4f}")
+
+
+    # Read danmaku from a CSV file
+    def load_danmu(file_path: str) -> list:
+        """Return the non-null '弹幕内容' values of a CSV as strings.
+
+        Any read error is printed and an empty list is returned instead.
+        """
+        try:
+            df = pd.read_csv(file_path)
+            return df['弹幕内容'].dropna().astype(str).tolist()
+        except Exception as e:
+            print(f"文件读取失败: {str(e)}")
+            return []
+
+
+    # Actual usage
+    # File path
+    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
+    danmu_list = load_danmu(file_path)
+    if danmu_list:
+        danmu_scores = analyzer.analyze(danmu_list)
+        avg_score = sum(danmu_scores) / len(danmu_scores)
+        print(f"\n弹幕平均情感评分：{avg_score:.4f}")