# statistics_model2025/title_analyzer.py
#
# Title sentiment analyzer: computes emotional valence (PosTe) and
# arousal (ActTe) for post titles using the DUT (大连理工) emotion
# lexicon, NLPIR segmentation and the cnsenti sentiment counter.

import pandas as pd
from cnsenti import Sentiment
import pynlpir
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
# ------------------ Text arousal: initialization config ---------------------
# Arousal strength per major emotion category (adapted from 廖圣清's method).
# NOTE(review): the original single-character category keys were stripped by
# the text extraction ("ambiguous Unicode" warning); they are reconstructed
# here from the DUT lexicon's seven major categories. The 7..1 ranking is a
# plausible reconstruction — TODO confirm the exact ordering against
# 廖圣清's published method.
AROUSAL_STRENGTH = {
    '惊': 7,  # surprise — highest arousal
    '怒': 6,  # anger
    '惧': 5,  # fear
    '恶': 4,  # disgust
    '乐': 3,  # joy
    '哀': 2,  # sadness
    '好': 1,  # fondness/praise — lowest arousal
}
# Sub-category -> major-category map (per the DUT emotion lexicon spec).
# Major categories reconstructed from the preserved sub-category comments,
# which match the official DUT lexicon taxonomy one-to-one.
EMOTION_CATEGORY_MAP = {
    'PA': '乐',  # 快乐 joy
    'PE': '乐',  # 安心 contentment
    'PD': '好',  # 尊敬 respect
    'PH': '好',  # 赞扬 praise
    'PG': '好',  # 相信 trust
    'PB': '好',  # 喜爱 fondness
    'PK': '好',  # 祝愿 well-wishing
    'NA': '怒',  # 愤怒 anger
    'NB': '哀',  # 悲伤 sadness
    'NJ': '哀',  # 失望 disappointment
    'NH': '哀',  # 疚 guilt
    'PF': '哀',  # 思 longing
    'NI': '惧',  # 慌 panic
    'NC': '惧',  # 恐惧 fear
    'NG': '惧',  # 羞 shame
    'NE': '恶',  # 烦闷 vexation
    'ND': '恶',  # 憎恶 loathing
    'NN': '恶',  # 贬责 censure
    'NK': '恶',  # 妒忌 envy
    'NL': '恶',  # 怀疑 suspicion
    'PC': '惊',  # 惊奇 surprise
}
# ------------------ Text processing ---------------------
def load_emotion_dict(excel_path):
    """Load the DUT (大连理工) emotion lexicon from an Excel file.

    Reads the '词语' (word) and '情感分类' (emotion class) columns and
    returns a defaultdict mapping each word to a de-duplicated list of
    major emotion categories via EMOTION_CATEGORY_MAP. On any failure the
    error is printed and the (possibly empty) dict is returned — callers
    treat a missing lexicon as "no emotion words", not a crash.

    Fixes vs. original: a NaN word cell produced the literal string 'nan'
    via str(), slipping past the empty check; set()-based dedup made the
    category order nondeterministic across runs.
    """
    emotion_dict = defaultdict(list)
    try:
        df = pd.read_excel(excel_path)
        for _, row in df.iterrows():
            # Skip missing cells before str(): str(nan) == 'nan' is truthy.
            if pd.isna(row['词语']):
                continue
            word = str(row['词语']).strip()
            if not word:
                continue
            # Only the main "情感分类" column is used; '/'-separated values
            # mean a word belongs to several sub-categories.
            main_cls = str(row['情感分类']).strip() if pd.notna(row['情感分类']) else ''
            for cls in main_cls.split('/'):
                cls = cls.strip()
                if cls in EMOTION_CATEGORY_MAP:
                    emotion_dict[word].append(EMOTION_CATEGORY_MAP[cls])
        # De-duplicate while preserving first-seen order (list(set(...))
        # would be nondeterministic).
        for word in emotion_dict:
            emotion_dict[word] = list(dict.fromkeys(emotion_dict[word]))
    except Exception as e:
        print(f"情感词典加载错误: {str(e)}")
    return emotion_dict
# Initialize the emotion dictionary at import time. The path is relative to
# the working directory; running the script from elsewhere makes the loader
# print an error and return an empty dict (all titles then score ActTe=0).
EMOTION_DICT = load_emotion_dict('情感词汇本体/情感词汇本体.xlsx')
def analyze_text(file_path):
    """Score valence (PosTe) and arousal (ActTe) for each 'title' in a CSV.

    Reads the CSV at *file_path* (the original docstring said Excel, but
    the code reads CSV), scores every row's 'title', appends 'PosTe' and
    'ActTe' columns, and rewrites the file in place (utf-8-sig). Returns
    the list of per-row result dicts; on failure prints the error and
    returns defaults matching the number of rows read so far (0 if the
    read itself failed).

    Fixes vs. original: pynlpir.open() was called once per row; Sentiment()
    was re-instantiated (reloading its lexicon) per row; a second, unused
    pynlpir.segment() call was dead code; the except handler referenced
    `df` before assignment when read_csv itself raised.
    """
    df = None
    # Heavy resources are created once, outside the per-row loop.
    senti = Sentiment()
    try:
        df = pd.read_csv(file_path)
        pynlpir.open()
        results = []
        for idx, row in df.iterrows():
            text = str(row['title']).strip() if pd.notna(row['title']) else ''
            if not text:
                results.append({'PosTe': 0.0, 'ActTe': 0.0})
                continue
            # --- segmentation (POS tagging on: results arrive as tuples) ---
            segments = pynlpir.segment(text, pos_tagging=True)
            word_list = []
            for seg in segments:
                if isinstance(seg, tuple) and len(seg) == 2:
                    word_list.append(seg[0])
                else:
                    print(f"异常分词结果: {seg}")
            # --- count DUT-lexicon emotion hits per major category ---
            # One increment per (word, category) pair, so the maximum
            # possible arousal sum is exactly 7 * total_emotion_words.
            category_counts = defaultdict(int)
            total_emotion_words = 0
            for word in word_list:
                if word in EMOTION_DICT:
                    for category in EMOTION_DICT[word]:
                        category_counts[category] += 1
                        total_emotion_words += 1
            # --- valence: PosTe = pos / (pos + neg); 0.5 when neutral ---
            counts = senti.sentiment_count(text)
            positive = counts['pos']
            negative = counts['neg']
            total = positive + negative
            pos_te = positive / total if total > 0 else 0.5
            # --- arousal: strength-weighted mean, normalized to [0, 1]
            # (max possible value per word is the top strength, 7) ---
            if total_emotion_words == 0:
                act_te = 0.0
            else:
                total_arousal = sum(
                    cnt * AROUSAL_STRENGTH[cat]
                    for cat, cnt in category_counts.items()
                )
                act_te = total_arousal / (7 * total_emotion_words)
            results.append({
                'PosTe': round(pos_te, 3),
                'ActTe': round(act_te, 3),
            })
            # Progress report every 100 rows.
            if (idx + 1) % 100 == 0:
                print(f"已处理 {idx + 1}/{len(df)} 条数据")
        # Write scores back into the same file.
        df['PosTe'] = [r['PosTe'] for r in results]
        df['ActTe'] = [r['ActTe'] for r in results]
        df.to_csv(file_path, index=False, encoding='utf-8-sig')
        return results
    except Exception as e:
        print(f"文本分析错误:{str(e)}")
        # df is None when read_csv itself failed — return an empty default
        # list instead of raising NameError inside the handler.
        n_rows = len(df) if df is not None else 0
        return [{'PosTe': 0.0, 'ActTe': 0.0} for _ in range(n_rows)]
    finally:
        pynlpir.close()  # release NLPIR resources
if __name__ == "__main__":
    # Script entry point: score the default dataset in place.
    analyze_text('data_all_second_ver.csv')