add:tqdm
parent 11104e6d5a
commit 2107d9e189
@@ -3,24 +3,28 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer #
 import pandas as pd
 import torch
 import os
+# Import added at the top of the file
+from tqdm import tqdm

 def load_data(file_path):
     """Optimized data-loading function"""
     try:
-        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
+        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python', encoding='utf-8')
         return df['弹幕内容'].dropna().astype(str).tolist()
     except Exception as e:
         print(f"数据加载失败: {str(e)}")
         return []


+# Add model-path handling inside analyze_sentiment
 def analyze_sentiment(texts):
     """Improved sentiment-analysis function"""
     try:
-        # Use the Hugging Face hosted model
+        # Changed to prefer the packaged model path
+        model_path = os.path.join(os.path.dirname(__file__), '.cache/huggingface/hub')
         model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=model_path)

         # Batch processing for efficiency
         inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
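For reference, a minimal standalone sketch of the loading path this hunk introduces. The model name and the .cache/huggingface/hub layout are taken from the diff; the local_files_only remark below is an assumption about intent, not something the commit does.

    import os
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    # Cache directory shipped next to the script, as in the diff
    model_path = os.path.join(os.path.dirname(__file__), '.cache/huggingface/hub')
    model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"

    # cache_dir tells transformers to download into (and later reuse) this folder,
    # so a packaged build can ship the weights next to the executable
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=model_path)

If the packaged build should never reach the network, from_pretrained also accepts local_files_only=True once the weights are already present in that cache.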
@@ -53,27 +57,33 @@ def process_partition(partition_path):
         return

     info_df = pd.read_csv(info_file, encoding='utf-8')
-    # Create a list with one slot per row of info_df, initial value None
     scores = [None] * len(info_df)

-    for idx, bv in enumerate(info_df['BV号']):
-        # Build the path of the danmaku directory
-        danmu_dir = os.path.join(partition_path, bv)
-        if not os.path.exists(danmu_dir):
-            continue  # keep the None value
+    # Add a progress bar
+    with tqdm(total=len(info_df), desc=f"处理分区 {os.path.basename(partition_path)}") as pbar:
+        for idx, bv in enumerate(info_df['BV号']):
+            danmu_dir = os.path.join(partition_path, bv)
+            if not os.path.exists(danmu_dir):
+                pbar.update(1)
+                continue

-        # Find the matching danmaku file
-        danmu_files = [f for f in os.listdir(danmu_dir)
-                       if f.startswith(bv) and f.endswith('danmaku.csv')]
+            danmu_files = [f for f in os.listdir(danmu_dir)
+                           if f.startswith(bv) and f.endswith('danmaku.csv')]

-        if not danmu_files:
-            continue  # keep the None value
+            if not danmu_files:
+                pbar.update(1)
+                continue

-        danmu_file = os.path.join(danmu_dir, danmu_files[0])
-        danmu_texts = load_data(danmu_file)
+            danmu_file = os.path.join(danmu_dir, danmu_files[0])
+            danmu_texts = load_data(danmu_file)

-        if not danmu_texts:
-            continue  # keep the None value
+            if not danmu_texts:
+                pbar.update(1)
+                continue

+            scores[idx] = analyze_sentiment(danmu_texts)
+            pbar.update(1)
+            pbar.set_postfix({'当前BV号': bv, '评分': scores[idx]})
+
         # Put the result at the matching index
         scores[idx] = analyze_sentiment(danmu_texts)
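For reference, a minimal sketch of the progress-bar pattern the second hunk introduces. The item list and the scoring stand-in below are placeholders, not code from the repository.

    from tqdm import tqdm

    items = ["BV1xxx", "BV1yyy", "BV1zzz"]   # placeholder IDs
    scores = [None] * len(items)

    with tqdm(total=len(items), desc="processing") as pbar:
        for idx, item in enumerate(items):
            scores[idx] = len(item)          # stand-in for analyze_sentiment(...)
            pbar.update(1)                   # advance the bar exactly once per item
            pbar.set_postfix({'current': item, 'score': scores[idx]})

Because the real loop exits early with continue in several places, each of those branches calls pbar.update(1) first so that skipped videos still advance the bar; wrapping the iterable directly, for example tqdm(enumerate(...), total=len(info_df)), would avoid that manual bookkeeping at the cost of the explicit update points.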
|
0
build.spec
Normal file
0
build.spec
Normal file