From 2107d9e1895ec8763165aeca74ef63b4b1f703e6 Mon Sep 17 00:00:00 2001
From: Sheyiyuan <2125107118@qq.com>
Date: Sun, 30 Mar 2025 12:06:31 +0800
Subject: [PATCH] add:tqdm

---
 RoBERTa_danmu_sentiment_analyzer.py | 52 ++++++++++++++++++++++---------
 build.spec                          |  0
 2 files changed, 30 insertions(+), 22 deletions(-)
 create mode 100644 build.spec

diff --git a/RoBERTa_danmu_sentiment_analyzer.py b/RoBERTa_danmu_sentiment_analyzer.py
index c6fd93c..b3ca6b6 100644
--- a/RoBERTa_danmu_sentiment_analyzer.py
+++ b/RoBERTa_danmu_sentiment_analyzer.py
@@ -3,24 +3,28 @@
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 # import pandas as pd
 import torch
 import os
+# Added import for the progress bar
+from tqdm import tqdm
 
 def load_data(file_path):
     """Optimized data loading function"""
     try:
-        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
+        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python', encoding='utf-8')
         return df['弹幕内容'].dropna().astype(str).tolist()
     except Exception as e:
         print(f"Data loading failed: {str(e)}")
         return []
 
+# Model path handling added in analyze_sentiment
 def analyze_sentiment(texts):
-    """Improved sentiment analysis function"""
     try:
-        # Use the HuggingFace model
+        # Changed to prefer the packaged model path
+        model_path = os.path.join(os.path.dirname(__file__), '.cache/huggingface/hub')
         model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=model_path)
 
         # Batch processing for efficiency
         inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
@@ -53,27 +57,31 @@ def process_partition(partition_path):
         return
 
     info_df = pd.read_csv(info_file, encoding='utf-8')
-    # Create a list with as many entries as info_df, initialized to None
     scores = [None] * len(info_df)
 
-    for idx, bv in enumerate(info_df['BV号']):
-        # Build the path of the danmaku directory
-        danmu_dir = os.path.join(partition_path, bv)
-        if not os.path.exists(danmu_dir):
-            continue  # keep the None value
+    # Add a progress bar
+    with tqdm(total=len(info_df), desc=f"Processing partition {os.path.basename(partition_path)}") as pbar:
+        for idx, bv in enumerate(info_df['BV号']):
+            danmu_dir = os.path.join(partition_path, bv)
+            if not os.path.exists(danmu_dir):
+                pbar.update(1)
+                continue
 
-        # Find the matching danmaku files
-        danmu_files = [f for f in os.listdir(danmu_dir)
-                       if f.startswith(bv) and f.endswith('danmaku.csv')]
+            danmu_files = [f for f in os.listdir(danmu_dir)
+                           if f.startswith(bv) and f.endswith('danmaku.csv')]
 
-        if not danmu_files:
-            continue  # keep the None value
+            if not danmu_files:
+                pbar.update(1)
+                continue
 
-        danmu_file = os.path.join(danmu_dir, danmu_files[0])
-        danmu_texts = load_data(danmu_file)
+            danmu_file = os.path.join(danmu_dir, danmu_files[0])
+            danmu_texts = load_data(danmu_file)
 
-        if not danmu_texts:
-            continue  # keep the None value
+            if not danmu_texts:
+                pbar.update(1)
+                continue
+
+            scores[idx] = analyze_sentiment(danmu_texts)
+            pbar.update(1)
+            pbar.set_postfix({'BV': bv, 'score': scores[idx]})
 
-        # Store the result at the matching index
-        scores[idx] = analyze_sentiment(danmu_texts)
diff --git a/build.spec b/build.spec
new file mode 100644
index 0000000..e69de29