# statistics_model2025/RoBERTa_danmu_sentiment_analyzer.py

# 修改导入部分
from transformers import AutoModelForSequenceClassification, AutoTokenizer  # 替换为 transformers 库
import pandas as pd
import torch
import os
# 在文件开头添加导入
from tqdm import tqdm

def load_data(file_path):
    """Read the '弹幕内容' column of a CSV file as a list of strings.

    Missing values in that column are dropped and the remaining values are
    converted to ``str``.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        The column values as a list of strings. If anything goes wrong
        (missing file, missing column, parse error, ...) the error is
        printed and an empty list is returned instead of raising.
    """
    try:
        frame = pd.read_csv(
            file_path,
            usecols=['弹幕内容'],
            engine='python',
            encoding='utf-8',
        )
        danmu_column = frame['弹幕内容']
        non_missing = danmu_column.dropna()
        return non_missing.astype(str).tolist()
    except Exception as err:
        # Best-effort loader: report the problem and let the caller skip the file.
        print(f"数据加载失败: {str(err)}")
        return []


# Loaded (tokenizer, model) pairs keyed by model name. Loading the checkpoint
# is the most expensive step, so it is done once per process rather than once
# per call to analyze_sentiment.
_SENTIMENT_MODEL_CACHE = {}


def _get_sentiment_model(model_name, cache_dir):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _SENTIMENT_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, cache_dir=cache_dir
        )
        _SENTIMENT_MODEL_CACHE[model_name] = (tokenizer, model)
    return _SENTIMENT_MODEL_CACHE[model_name]


def analyze_sentiment(texts, batch_size=64):
    """Return the mean class-1 probability of the sentiment model over ``texts``.

    Each text is tokenized (truncated to 128 tokens), run through the
    "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment" classifier, and the softmax
    probability at index 1 is averaged over all texts.
    NOTE(review): index 1 is presumably the "positive" label -- confirm
    against the model card.

    Args:
        texts: Iterable of strings to score.
        batch_size: Number of texts per forward pass. Inference is done in
            mini-batches so that a video with many danmaku does not have to
            fit into a single padded tensor.

    Returns:
        The average class-1 probability as a float. ``0.5`` (neutral) is
        returned for empty input or when loading/inference fails; failures
        are printed rather than raised.
    """
    try:
        texts = list(texts)
        if not texts:
            # Nothing to score: use the same neutral fallback as the error path.
            return 0.5

        step = max(1, int(batch_size))

        # Prefer the model files bundled next to this script.
        model_path = os.path.join(os.path.dirname(__file__), '.cache/huggingface/hub')
        model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
        tokenizer, model = _get_sentiment_model(model_name, model_path)

        class1_total = 0.0
        with torch.no_grad():
            for start in range(0, len(texts), step):
                batch = texts[start:start + step]
                inputs = tokenizer(batch, padding=True, truncation=True,
                                   max_length=128, return_tensors="pt")
                logits = model(**inputs).logits
                class1_total += torch.softmax(logits, dim=1)[:, 1].sum().item()

        # Sum over batches divided by the total count equals the overall mean.
        return class1_total / len(texts)

    except Exception as e:
        print(f"情感分析失败: {str(e)}")
        return 0.5  # neutral value on error

def process_all_partitions(base_path):
    """Run ``process_partition`` on every immediate subdirectory of ``base_path``.

    Plain files directly under ``base_path`` are ignored. The directory
    listing is collected in full before any partition is processed.

    Args:
        base_path: Directory whose subdirectories are the partitions.
    """
    partition_dirs = []
    for entry in os.listdir(base_path):
        entry_path = os.path.join(base_path, entry)
        if os.path.isdir(entry_path):
            partition_dirs.append((entry, entry_path))

    for partition_name, partition_dir in partition_dirs:
        print(f"正在处理分区: {partition_name}")
        process_partition(partition_dir)

def _score_video_danmu(partition_path, bv):
    """Return the sentiment score for one video's danmaku CSV, or None if there is nothing to score."""
    danmu_dir = os.path.join(partition_path, bv)
    # isdir (not exists): a plain file with the BV name cannot be listed.
    if not os.path.isdir(danmu_dir):
        return None

    danmu_files = [f for f in os.listdir(danmu_dir)
                   if f.startswith(bv) and f.endswith('danmaku.csv')]
    if not danmu_files:
        return None

    danmu_texts = load_data(os.path.join(danmu_dir, danmu_files[0]))
    if not danmu_texts:
        return None

    return analyze_sentiment(danmu_texts)


def process_partition(partition_path):
    """Score the danmaku of every video listed in a partition's ``info.csv``.

    For each value in the 'BV号' column, the subdirectory of that name is
    searched for a file that starts with the BV id and ends with
    'danmaku.csv'. Its texts are scored with ``analyze_sentiment`` and the
    result is stored in a new '弹幕情感评分RoBERTa' column. Rows whose
    directory, file or texts are missing keep an empty score. ``info.csv`` is
    then overwritten in place (UTF-8 with BOM).

    Args:
        partition_path: Directory containing ``info.csv`` and one
            subdirectory per video.

    Returns:
        None. If ``info.csv`` does not exist, a message is printed and
        nothing is written.
    """
    info_file = os.path.join(partition_path, 'info.csv')
    if not os.path.exists(info_file):
        print(f"未找到info文件: {info_file}")
        return

    info_df = pd.read_csv(info_file, encoding='utf-8')
    scores = [None] * len(info_df)

    # Progress bar over the rows of info.csv.
    with tqdm(total=len(info_df), desc=f"处理分区 {os.path.basename(partition_path)}") as pbar:
        for idx, bv in enumerate(info_df['BV号']):
            # str() guards against non-string cells (e.g. NaN for a blank row),
            # which would otherwise make os.path.join raise.
            scores[idx] = _score_video_danmu(partition_path, str(bv))
            pbar.update(1)
            if scores[idx] is not None:
                pbar.set_postfix({'当前BV号': bv, '评分': scores[idx]})

    # Each score is written exactly once, inside the loop. Nothing is
    # recomputed after the loop, so a skipped last row can never pick up
    # another video's score.
    info_df['弹幕情感评分RoBERTa'] = scores
    info_df.to_csv(info_file, index=False, encoding='utf-8-sig')

# Example usage: process every partition under both data roots, in this order.
for _data_root in ("hot_data", "nohot_data"):
    process_all_partitions(_data_root)