From 2107d9e1895ec8763165aeca74ef63b4b1f703e6 Mon Sep 17 00:00:00 2001
From: Sheyiyuan <2125107118@qq.com>
Date: Sun, 30 Mar 2025 12:06:31 +0800
Subject: [PATCH] add:tqdm

---
 RoBERTa_danmu_sentiment_analyzer.py | 52 ++++++++++++++++++++++---------
 build.spec                          |  0
 2 files changed, 30 insertions(+), 22 deletions(-)
 create mode 100644 build.spec

diff --git a/RoBERTa_danmu_sentiment_analyzer.py b/RoBERTa_danmu_sentiment_analyzer.py
index c6fd93c..b3ca6b6 100644
--- a/RoBERTa_danmu_sentiment_analyzer.py
+++ b/RoBERTa_danmu_sentiment_analyzer.py
@@ -3,24 +3,28 @@
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 # import pandas as pd
 import torch
 import os
+# Added import for the progress bar
+from tqdm import tqdm
 
 def load_data(file_path):
     """Optimized data loading function"""
     try:
-        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
+        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python', encoding='utf-8')
         return df['弹幕内容'].dropna().astype(str).tolist()
     except Exception as e:
         print(f"Data loading failed: {str(e)}")
         return []
 
+# Model path handling added in analyze_sentiment
 def analyze_sentiment(texts):
-    """Improved sentiment analysis function"""
     try:
-        # Use the HuggingFace model
+        # Changed to prefer the packaged model path
+        model_path = os.path.join(os.path.dirname(__file__), '.cache/huggingface/hub')
         model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=model_path)
 
         # Batch processing for efficiency
         inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
@@ -53,27 +57,31 @@ def process_partition(partition_path):
         return
 
     info_df = pd.read_csv(info_file, encoding='utf-8')
-    # Create a list with as many entries as info_df, initialized to None
     scores = [None] * len(info_df)
 
-    for idx, bv in enumerate(info_df['BV号']):
-        # Build the path of the danmaku directory
-        danmu_dir = os.path.join(partition_path, bv)
-        if not os.path.exists(danmu_dir):
-            continue  # keep the None value
+    # Add a progress bar
+    with tqdm(total=len(info_df), desc=f"Processing partition {os.path.basename(partition_path)}") as pbar:
+        for idx, bv in enumerate(info_df['BV号']):
+            danmu_dir = os.path.join(partition_path, bv)
+            if not os.path.exists(danmu_dir):
+                pbar.update(1)
+                continue
 
-        # Find the matching danmaku files
-        danmu_files = [f for f in os.listdir(danmu_dir)
-                       if f.startswith(bv) and f.endswith('danmaku.csv')]
+            danmu_files = [f for f in os.listdir(danmu_dir)
+                           if f.startswith(bv) and f.endswith('danmaku.csv')]
 
-        if not danmu_files:
-            continue  # keep the None value
+            if not danmu_files:
+                pbar.update(1)
+                continue
 
-        danmu_file = os.path.join(danmu_dir, danmu_files[0])
-        danmu_texts = load_data(danmu_file)
+            danmu_file = os.path.join(danmu_dir, danmu_files[0])
+            danmu_texts = load_data(danmu_file)
 
-        if not danmu_texts:
-            continue  # keep the None value
+            if not danmu_texts:
+                pbar.update(1)
+                continue
+
+            scores[idx] = analyze_sentiment(danmu_texts)
+            pbar.update(1)
+            pbar.set_postfix({'BV': bv, 'score': scores[idx]})
 
-        # Store the result at the matching index
-        scores[idx] = analyze_sentiment(danmu_texts)
diff --git a/build.spec b/build.spec
new file mode 100644
index 0000000..e69de29