statistics_model2025/RoBERTa_danmu_sentiment_analyzer.py

86 lines
2.9 KiB
Python
Raw Normal View History

# 修改导入部分
from transformers import AutoModelForSequenceClassification, AutoTokenizer # 替换为 transformers 库
import pandas as pd
import torch
import os
def load_data(file_path):
    """Load the danmaku text column from a CSV file.

    Only the '弹幕内容' column is read. Missing values are dropped and the
    remaining values are converted to ``str``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of strings, one per non-missing cell. An empty list is
        returned when reading or converting fails for any reason (the error
        is printed, not raised).
    """
    text_column = '弹幕内容'
    try:
        frame = pd.read_csv(file_path, usecols=[text_column], engine='python')
        non_missing = frame[text_column].dropna()
        as_text = non_missing.astype(str)
        return as_text.tolist()
    except Exception as e:
        # Best effort: callers treat an empty list as "nothing to analyze".
        print(f"数据加载失败: {str(e)}")
        return []
# Loaded (tokenizer, model) pairs keyed by model name, so the weights are
# read once per process instead of once per analyze_sentiment() call.
_SENTIMENT_MODEL_CACHE = {}


def _get_sentiment_model(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _SENTIMENT_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        _SENTIMENT_MODEL_CACHE[model_name] = (tokenizer, model)
    return _SENTIMENT_MODEL_CACHE[model_name]


def analyze_sentiment(texts, batch_size=64):
    """Return the mean class-1 probability of ``texts`` under the sentiment model.

    Each text is tokenized (truncated to 128 tokens), run through
    "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment", and soft-maxed; the value
    returned is the average of column 1 of the soft-max output over all
    texts. This script treats column 1 as the positive-sentiment
    probability (NOTE(review): confirm the label order against the model
    card).

    Args:
        texts: Iterable of strings to score.
        batch_size: Number of texts per forward pass. Processing in
            mini-batches keeps memory bounded when a video has many
            danmaku; the overall mean is still taken over every text.

    Returns:
        A float. 0.5 is returned for empty input and whenever loading or
        inference fails (the error is printed, not raised).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    try:
        texts = list(texts)
        if not texts:
            # Nothing to score: skip the (expensive) model load entirely.
            return 0.5

        tokenizer, model = _get_sentiment_model(
            "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
        )

        class1_total = 0.0
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt",
            )
            with torch.no_grad():
                outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            # Accumulate the sum so the final mean weights every text equally,
            # regardless of how the last (possibly short) batch is sized.
            class1_total += probs[:, 1].sum().item()

        return class1_total / len(texts)
    except Exception as e:
        print(f"情感分析失败: {str(e)}")
        return 0.5  # neutral fallback on error
def process_all_partitions(base_path):
    """Run ``process_partition`` on every sub-directory of ``base_path``.

    Args:
        base_path: Directory whose immediate sub-directories are partitions.
            Non-directory entries are ignored.

    Returns:
        None. If ``base_path`` is not an existing directory a message is
        printed and nothing is processed, mirroring how a missing info file
        is handled per partition.
    """
    if not os.path.isdir(base_path):
        print(f"未找到数据目录: {base_path}")
        return

    # Sorted so the processing order (and the log output) is reproducible;
    # os.listdir returns entries in arbitrary order.
    partitions = sorted(
        entry for entry in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, entry))
    )

    for partition in partitions:
        partition_path = os.path.join(base_path, partition)
        print(f"正在处理分区: {partition}")
        process_partition(partition_path)
# process_partition函数
def process_partition(partition_path):
    """Score the danmaku of every video listed in a partition's info.csv.

    For each value in the 'BV号' column, the directory
    ``<partition_path>/<BV>`` is searched for a file whose name starts with
    the BV id and ends with 'danmaku.csv'. Its texts are loaded with
    ``load_data`` and scored with ``analyze_sentiment``. The scores are
    stored in a '弹幕情感评分RoBERTa' column and info.csv is rewritten in
    place.

    Rows keep an empty score when the BV id is missing, the directory or
    danmaku file does not exist, or no text could be loaded.

    Args:
        partition_path: Directory containing info.csv and one sub-directory
            per video.

    Returns:
        None. If info.csv is absent a message is printed and nothing is
        written.
    """
    info_file = os.path.join(partition_path, 'info.csv')
    if not os.path.exists(info_file):
        print(f"未找到info文件: {info_file}")
        return

    # The file is written back below as utf-8-sig; reading with the same
    # codec accepts input both with and without a BOM.
    info_df = pd.read_csv(info_file, encoding='utf-8-sig')

    # One slot per row, left as None when a row cannot be scored.
    scores = [None] * len(info_df)

    for idx, bv in enumerate(info_df['BV号']):
        if pd.isna(bv):
            continue  # blank BV cell: no directory to look up
        bv = str(bv)

        danmu_dir = os.path.join(partition_path, bv)
        # isdir (not exists): a regular file with this name cannot be listed.
        if not os.path.isdir(danmu_dir):
            continue

        # Sorted so the chosen file does not depend on os.listdir order.
        danmu_files = sorted(
            name for name in os.listdir(danmu_dir)
            if name.startswith(bv) and name.endswith('danmaku.csv')
        )
        if not danmu_files:
            continue

        danmu_texts = load_data(os.path.join(danmu_dir, danmu_files[0]))
        if not danmu_texts:
            continue

        scores[idx] = analyze_sentiment(danmu_texts)

    info_df['弹幕情感评分RoBERTa'] = scores
    info_df.to_csv(info_file, index=False, encoding='utf-8-sig')
# Example usage: process every partition under both dataset roots, in order.
for data_root in ("hot_data", "nohot_data"):
    process_all_partitions(data_root)