import os

# Use the HuggingFace mirror endpoint for mainland China.
# This must be set before transformers is imported, so it sits at the very top of the file.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

import time

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import torch
from tqdm import tqdm


def load_data(file_path):
    """Load danmaku (bullet-comment) texts from a CSV file, returning a list of strings."""
    try:
        # Only the '弹幕内容' (danmaku content) column is needed.
        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python', encoding='utf-8')
        print(f"Debug - columns in {file_path}: {list(df.columns)}")
        return df['弹幕内容'].dropna().astype(str).tolist()
    except Exception as e:
        print(f"Failed to load data: {e}")
        return []
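
# Illustrative input/output for load_data (the file contents below are hypothetical;
# only the '弹幕内容' column is actually required by the code above):
#
#     弹幕内容,出现时间
#     前排!,1.2
#     哈哈哈哈,3.5
#
# load_data on such a file would return ['前排!', '哈哈哈哈'].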


def analyze_sentiment(texts):
    """Score a list of danmaku texts; returns the mean positive-class probability."""
    try:
        # If there are more than 500 danmaku, sample them evenly to cap the batch size.
        if len(texts) > 500:
            step = len(texts) // 500
            texts = texts[::step][:500]

        # Chinese sentiment model from HuggingFace. Note that the tokenizer and model
        # are re-loaded on every call; see the cached variant sketched after this function.
        model_path = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)

        # Move the model to the GPU when one is available.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        # Tokenize the whole sample as one batch for efficiency.
        inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}  # move inputs to the same device

        with torch.no_grad():
            outputs = model(**inputs)

        # Softmax over the logits; index 1 is treated as the positive class,
        # and the scores are averaged over the batch.
        probs = torch.softmax(outputs.logits, dim=1)
        return probs[:, 1].mean().item()

    except Exception as e:
        print(f"Sentiment analysis failed: {e}")
        return 0.5  # return a neutral score on error
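

# A minimal sketch (not wired into the script as written): hoisting the model load
# out of analyze_sentiment so the 330M-parameter weights are read once per process
# instead of once per video. The cache dict and the helper name get_sentiment_model
# are illustrative additions, not part of the original pipeline.
_MODEL_CACHE = {}


def get_sentiment_model(model_path="IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"):
    """Load the tokenizer/model pair once and reuse it on subsequent calls."""
    if model_path not in _MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        _MODEL_CACHE[model_path] = (tokenizer, model.to(device).eval(), device)
    return _MODEL_CACHE[model_path]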


def process_all_partitions(base_path):
    """Run sentiment scoring over every partition directory under base_path."""
    # Collect all partition directories.
    partitions = [d for d in os.listdir(base_path)
                  if os.path.isdir(os.path.join(base_path, d))]

    for partition in partitions:
        partition_path = os.path.join(base_path, partition)
        print(f"Processing partition: {partition}")
        process_partition(partition_path)


def process_partition(partition_path):
    """Score every video listed in a partition's info.csv and write the results back."""
    info_file = os.path.join(partition_path, 'info.csv')
    if not os.path.exists(info_file):
        print(f"info file not found: {info_file}")
        return

    info_df = pd.read_csv(info_file, encoding='utf-8')
    scores = [None] * len(info_df)

    # Progress bar over all videos in the partition.
    with tqdm(total=len(info_df), desc=f"Partition {os.path.basename(partition_path)}") as pbar:
        for idx, bv in enumerate(info_df['BV号']):
            danmu_dir = os.path.join(partition_path, bv)
            if not os.path.exists(danmu_dir):
                pbar.update(1)
                continue

            # Danmaku exports are named '<BV号>...danmaku.csv' inside the video's directory.
            danmu_files = [f for f in os.listdir(danmu_dir)
                           if f.startswith(bv) and f.endswith('danmaku.csv')]
            if not danmu_files:
                pbar.update(1)
                continue

            danmu_file = os.path.join(danmu_dir, danmu_files[0])
            danmu_texts = load_data(danmu_file)
            if not danmu_texts:
                pbar.update(1)
                continue

            scores[idx] = analyze_sentiment(danmu_texts)
            pbar.update(1)
            pbar.set_postfix({'BV': bv, 'score': scores[idx]})

    # utf-8-sig adds a BOM so the Chinese column names open correctly in Excel.
    info_df['弹幕情感评分RoBERTa'] = scores
    info_df.to_csv(info_file, index=False, encoding='utf-8-sig')
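
# Directory layout assumed by process_partition, reconstructed from the code above
# (the names are illustrative):
#
#     <base_path>/
#         <partition>/
#             info.csv                     # one row per video, includes a 'BV号' column
#             <BV号>/
#                 <BV号>...danmaku.csv     # per-video danmaku export with a '弹幕内容' column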


# Usage example - process all partitions.
if __name__ == "__main__":
    # process_all_partitions("hot_data")
    process_all_partitions("nohot_data")

    # Keep the process alive after finishing (presumably so the hosting
    # container/session does not exit); sleep instead of busy-waiting so an
    # idle CPU core is not pegged at 100%.
    while True:
        time.sleep(60)