import warnings
from collections import defaultdict

import pandas as pd
import pynlpir
from cnsenti import Sentiment

warnings.filterwarnings('ignore')

# ------------------ Text arousal: initial configuration ------------------
# Arousal strength assigned to each top-level emotion category
# (adjusted following Liao Shengqing's method).
AROUSAL_STRENGTH = {
    '惊': 7,  # surprise - highest arousal
    '惧': 6,  # fear
    '怒': 5,  # anger
    '乐': 4,  # joy
    '恶': 3,  # disgust
    '好': 2,  # liking
    '哀': 1,  # sadness - lowest arousal
}

# Map from emotion sub-category code to top-level category
# (based on the Dalian University of Technology lexicon conventions).
EMOTION_CATEGORY_MAP = {
    'PA': '乐',  # joy
    'PE': '乐',  # ease
    'PD': '好',  # respect
    'PH': '好',  # praise
    'PG': '好',  # trust
    'PB': '好',  # fondness
    'PK': '好',  # wishing
    'NA': '怒',  # anger
    'NB': '哀',  # sadness
    'NJ': '哀',  # disappointment
    'NH': '哀',  # guilt
    'PF': '哀',  # longing
    'NI': '惧',  # panic
    'NC': '惧',  # fear
    'NG': '惧',  # shame
    'NE': '恶',  # annoyance
    'ND': '恶',  # hatred
    'NN': '恶',  # blame
    'NK': '恶',  # envy
    'NL': '恶',  # suspicion
    'PC': '惊',  # surprise
}


# ------------------ Text processing ------------------
def load_emotion_dict(excel_path):
    """Load the emotion lexicon, using only the main category column.

    Args:
        excel_path: Path of the lexicon workbook. It must contain the
            columns '词语' (word) and '情感分类' (emotion category code).

    Returns:
        A ``defaultdict(list)`` mapping each word to the list of distinct
        top-level categories (keys of ``AROUSAL_STRENGTH``) it belongs to,
        in first-seen order. Words whose codes are not in
        ``EMOTION_CATEGORY_MAP`` are omitted.

    Any error while reading or parsing is printed and whatever was loaded
    up to that point is returned (best effort, never raises).
    """
    emotion_dict = defaultdict(list)
    try:
        # keep_default_na=False is essential: pandas' default NA parsing
        # turns the literal string 'NA' into NaN, and 'NA' is the lexicon's
        # code for anger, so every anger word would otherwise be dropped.
        df = pd.read_excel(excel_path, keep_default_na=False)
        for _, row in df.iterrows():
            raw_word = row['词语']
            if pd.isna(raw_word):
                # Guard against a missing cell becoming the word 'nan'.
                continue
            word = str(raw_word).strip()
            if not word:
                continue

            # Only the main "情感分类" column is processed.
            raw_cls = row['情感分类']
            main_cls = str(raw_cls).strip() if pd.notna(raw_cls) else ''
            for cls in main_cls.split('/'):
                category = EMOTION_CATEGORY_MAP.get(cls.strip())
                if category is None:
                    continue
                # De-duplicate while keeping a stable order.
                if category not in emotion_dict[word]:
                    emotion_dict[word].append(category)
    except Exception as e:
        print(f"情感词典加载错误: {str(e)}")
    return emotion_dict


# Lexicon loaded once at import time.
EMOTION_DICT = load_emotion_dict('情感词汇本体/情感词汇本体.xlsx')


def _segment_words(text):
    """Segment *text* with NLPIR and return the list of word strings."""
    # POS tagging stays enabled so every segment is a (word, tag) pair.
    segments = pynlpir.segment(text, pos_tagging=True)
    words = []
    for seg in segments:
        if isinstance(seg, tuple) and len(seg) == 2:
            words.append(seg[0])
        else:
            print(f"异常分词结果: {seg}")
    return words


def _count_emotion_categories(words):
    """Count, per top-level category, the lexicon hits among *words*."""
    category_counts = defaultdict(int)
    for word in words:
        # Membership test first so the defaultdict lexicon is not grown.
        if word in EMOTION_DICT:
            for category in EMOTION_DICT[word]:
                category_counts[category] += 1
    return category_counts


def _arousal_score(category_counts):
    """Return ActTe: weighted arousal normalised to the [0, 1] range."""
    total_hits = sum(category_counts.values())
    if total_hits == 0:
        return 0.0
    total_arousal = sum(
        count * AROUSAL_STRENGTH[cat]
        for cat, count in category_counts.items()
    )
    # Largest possible value is (max strength) * (number of hits).
    return total_arousal / (max(AROUSAL_STRENGTH.values()) * total_hits)


def _valence_score(text, senti):
    """Return PosTe = positive / (positive + negative); 0.5 if no hits."""
    counts = senti.sentiment_count(text)
    positive = counts['pos']
    negative = counts['neg']
    total = positive + negative
    return positive / total if total > 0 else 0.5


def _score_title(text, senti):
    """Compute the rounded PosTe / ActTe pair for one non-empty title."""
    category_counts = _count_emotion_categories(_segment_words(text))
    return {
        'PosTe': round(_valence_score(text, senti), 3),
        'ActTe': round(_arousal_score(category_counts), 3),
    }


def analyze_text(file_path):
    """Score every title in a CSV file and write the scores back to it.

    Reads *file_path* as CSV, computes for each row's 'title' a valence
    score (PosTe) and an arousal score (ActTe), stores them in columns
    'PosTe' and 'ActTe', and overwrites *file_path* (UTF-8 with BOM).

    Args:
        file_path: Path of the CSV file; it must contain a 'title' column.

    Returns:
        A list with one ``{'PosTe': float, 'ActTe': float}`` dict per row.
        Rows with a missing or blank title score 0.0 / 0.0. If an error
        occurs, it is printed and a list of 0.0 / 0.0 defaults is returned,
        one per row that was read (empty if the CSV could not be read).
    """
    df = None
    nlpir_opened = False
    try:
        # 1. Read the input CSV.
        df = pd.read_csv(file_path)
        total_rows = len(df)

        # Initialise the segmenter and sentiment counter once, not per row.
        pynlpir.open()
        nlpir_opened = True
        senti = Sentiment()

        # 2. Process each title in turn.
        results = []
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            title = row['title']
            text = str(title).strip() if pd.notna(title) else ''
            if text:
                results.append(_score_title(text, senti))
            else:
                results.append({'PosTe': 0.0, 'ActTe': 0.0})

            # Report progress every 100 rows.
            if position % 100 == 0:
                print(f"已处理 {position}/{total_rows} 条数据")

        # 3. Write the scores back to the same file.
        df['PosTe'] = [r['PosTe'] for r in results]
        df['ActTe'] = [r['ActTe'] for r in results]
        df.to_csv(file_path, index=False, encoding='utf-8-sig')

        return results

    except Exception as e:
        print(f"文本分析错误:{str(e)}")
        # Defaults matching the input length; df is None when the CSV
        # itself could not be read.
        row_count = len(df) if df is not None else 0
        return [{'PosTe': 0.0, 'ActTe': 0.0} for _ in range(row_count)]
    finally:
        if nlpir_opened:
            pynlpir.close()  # always release NLPIR resources


if __name__ == "__main__":
    file_path = 'data_all_second_ver.csv'
    analyze_text(file_path)