add:tqdm
parent 11104e6d5a
commit 2107d9e189
@@ -3,24 +3,28 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer #
 import pandas as pd
 import torch
 import os
+# Import added at the top of the file
+from tqdm import tqdm

 def load_data(file_path):
     """Optimized data-loading function"""
     try:
-        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
+        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python', encoding='utf-8')
         return df['弹幕内容'].dropna().astype(str).tolist()
     except Exception as e:
         print(f"数据加载失败: {str(e)}")
         return []


+# Add model-path handling inside analyze_sentiment
 def analyze_sentiment(texts):
     """Improved sentiment-analysis function"""
     try:
-        # Use the Hugging Face hosted model
+        # Changed to prefer the packaged model path
+        model_path = os.path.join(os.path.dirname(__file__), '.cache/huggingface/hub')
         model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=model_path)

         # Batch processing for efficiency
         inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
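For reference, a minimal standalone sketch of the loading path this hunk introduces. The model name and the .cache/huggingface/hub layout are taken from the diff; the local_files_only remark below is an assumption about intent, not something the commit does.

    import os
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    # Cache directory shipped next to the script, as in the diff
    model_path = os.path.join(os.path.dirname(__file__), '.cache/huggingface/hub')
    model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"

    # cache_dir tells transformers to download into (and later reuse) this folder,
    # so a packaged build can ship the weights next to the executable
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=model_path)

If the packaged build should never reach the network, from_pretrained also accepts local_files_only=True once the weights are already present in that cache.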
@@ -53,27 +57,33 @@ def process_partition(partition_path):
         return

     info_df = pd.read_csv(info_file, encoding='utf-8')
-    # Create a list with one slot per row of info_df, initial value None
     scores = [None] * len(info_df)

-    for idx, bv in enumerate(info_df['BV号']):
-        # Build the path of the danmaku directory
-        danmu_dir = os.path.join(partition_path, bv)
-        if not os.path.exists(danmu_dir):
-            continue  # keep the None value
+    # Add a progress bar
+    with tqdm(total=len(info_df), desc=f"处理分区 {os.path.basename(partition_path)}") as pbar:
+        for idx, bv in enumerate(info_df['BV号']):
+            danmu_dir = os.path.join(partition_path, bv)
+            if not os.path.exists(danmu_dir):
+                pbar.update(1)
+                continue

-        # Find the matching danmaku file
-        danmu_files = [f for f in os.listdir(danmu_dir)
-                       if f.startswith(bv) and f.endswith('danmaku.csv')]
+            danmu_files = [f for f in os.listdir(danmu_dir)
+                           if f.startswith(bv) and f.endswith('danmaku.csv')]

-        if not danmu_files:
-            continue  # keep the None value
+            if not danmu_files:
+                pbar.update(1)
+                continue

-        danmu_file = os.path.join(danmu_dir, danmu_files[0])
-        danmu_texts = load_data(danmu_file)
+            danmu_file = os.path.join(danmu_dir, danmu_files[0])
+            danmu_texts = load_data(danmu_file)

-        if not danmu_texts:
-            continue  # keep the None value
+            if not danmu_texts:
+                pbar.update(1)
+                continue

+            scores[idx] = analyze_sentiment(danmu_texts)
+            pbar.update(1)
+            pbar.set_postfix({'当前BV号': bv, '评分': scores[idx]})
+
         # Put the result at the matching index
         scores[idx] = analyze_sentiment(danmu_texts)
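For reference, a minimal sketch of the progress-bar pattern the second hunk introduces. The item list and the scoring stand-in below are placeholders, not code from the repository.

    from tqdm import tqdm

    items = ["BV1xxx", "BV1yyy", "BV1zzz"]   # placeholder IDs
    scores = [None] * len(items)

    with tqdm(total=len(items), desc="processing") as pbar:
        for idx, item in enumerate(items):
            scores[idx] = len(item)          # stand-in for analyze_sentiment(...)
            pbar.update(1)                   # advance the bar exactly once per item
            pbar.set_postfix({'current': item, 'score': scores[idx]})

Because the real loop exits early with continue in several places, each of those branches calls pbar.update(1) first so that skipped videos still advance the bar; wrapping the iterable directly, for example tqdm(enumerate(...), total=len(info_df)), would avoid that manual bookkeeping at the cost of the explicit update points.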
|
0
build.spec
Normal file
0
build.spec
Normal file