# statistics_model2025/title_analyzer.py
#
# Title sentiment analyzer: computes emotional valence (PosTe) and
# arousal (ActTe) for post titles using the DUT (大连理工) emotion
# lexicon, NLPIR segmentation and the cnsenti sentiment counter.

import pandas as pd
from cnsenti import Sentiment
import pynlpir
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
# ------------------ Text arousal: initialization config ---------------------
# Arousal strength per major emotion category (adapted from 廖圣清's method).
# NOTE(review): the original single-character category keys were stripped by
# the text extraction ("ambiguous Unicode" warning); they are reconstructed
# here from the DUT lexicon's seven major categories. The 7..1 ranking is a
# plausible reconstruction — TODO confirm the exact ordering against
# 廖圣清's published method.
AROUSAL_STRENGTH = {
    '惊': 7,  # surprise — highest arousal
    '怒': 6,  # anger
    '惧': 5,  # fear
    '恶': 4,  # disgust
    '乐': 3,  # joy
    '哀': 2,  # sadness
    '好': 1,  # fondness/praise — lowest arousal
}
# Sub-category -> major-category map (per the DUT emotion lexicon spec).
# Major categories reconstructed from the preserved sub-category comments,
# which match the official DUT lexicon taxonomy one-to-one.
EMOTION_CATEGORY_MAP = {
    'PA': '乐',  # 快乐 joy
    'PE': '乐',  # 安心 contentment
    'PD': '好',  # 尊敬 respect
    'PH': '好',  # 赞扬 praise
    'PG': '好',  # 相信 trust
    'PB': '好',  # 喜爱 fondness
    'PK': '好',  # 祝愿 well-wishing
    'NA': '怒',  # 愤怒 anger
    'NB': '哀',  # 悲伤 sadness
    'NJ': '哀',  # 失望 disappointment
    'NH': '哀',  # 疚 guilt
    'PF': '哀',  # 思 longing
    'NI': '惧',  # 慌 panic
    'NC': '惧',  # 恐惧 fear
    'NG': '惧',  # 羞 shame
    'NE': '恶',  # 烦闷 vexation
    'ND': '恶',  # 憎恶 loathing
    'NN': '恶',  # 贬责 censure
    'NK': '恶',  # 妒忌 envy
    'NL': '恶',  # 怀疑 suspicion
    'PC': '惊',  # 惊奇 surprise
}
# ------------------ Text processing ---------------------
def load_emotion_dict(excel_path):
    """Load the DUT (大连理工) emotion lexicon from an Excel file.

    Reads the '词语' (word) and '情感分类' (emotion class) columns and
    returns a defaultdict mapping each word to a de-duplicated list of
    major emotion categories via EMOTION_CATEGORY_MAP. On any failure the
    error is printed and the (possibly empty) dict is returned — callers
    treat a missing lexicon as "no emotion words", not a crash.

    Fixes vs. original: a NaN word cell produced the literal string 'nan'
    via str(), slipping past the empty check; set()-based dedup made the
    category order nondeterministic across runs.
    """
    emotion_dict = defaultdict(list)
    try:
        df = pd.read_excel(excel_path)
        for _, row in df.iterrows():
            # Skip missing cells before str(): str(nan) == 'nan' is truthy.
            if pd.isna(row['词语']):
                continue
            word = str(row['词语']).strip()
            if not word:
                continue
            # Only the main "情感分类" column is used; '/'-separated values
            # mean a word belongs to several sub-categories.
            main_cls = str(row['情感分类']).strip() if pd.notna(row['情感分类']) else ''
            for cls in main_cls.split('/'):
                cls = cls.strip()
                if cls in EMOTION_CATEGORY_MAP:
                    emotion_dict[word].append(EMOTION_CATEGORY_MAP[cls])
        # De-duplicate while preserving first-seen order (list(set(...))
        # would be nondeterministic).
        for word in emotion_dict:
            emotion_dict[word] = list(dict.fromkeys(emotion_dict[word]))
    except Exception as e:
        print(f"情感词典加载错误: {str(e)}")
    return emotion_dict
# Initialize the emotion dictionary at import time. The path is relative to
# the working directory; running the script from elsewhere makes the loader
# print an error and return an empty dict (all titles then score ActTe=0).
EMOTION_DICT = load_emotion_dict('情感词汇本体/情感词汇本体.xlsx')
def analyze_text(file_path):
    """Score valence (PosTe) and arousal (ActTe) for each 'title' in a CSV.

    Reads the CSV at *file_path* (the original docstring said Excel, but
    the code reads CSV), scores every row's 'title', appends 'PosTe' and
    'ActTe' columns, and rewrites the file in place (utf-8-sig). Returns
    the list of per-row result dicts; on failure prints the error and
    returns defaults matching the number of rows read so far (0 if the
    read itself failed).

    Fixes vs. original: pynlpir.open() was called once per row; Sentiment()
    was re-instantiated (reloading its lexicon) per row; a second, unused
    pynlpir.segment() call was dead code; the except handler referenced
    `df` before assignment when read_csv itself raised.
    """
    df = None
    # Heavy resources are created once, outside the per-row loop.
    senti = Sentiment()
    try:
        df = pd.read_csv(file_path)
        pynlpir.open()
        results = []
        for idx, row in df.iterrows():
            text = str(row['title']).strip() if pd.notna(row['title']) else ''
            if not text:
                results.append({'PosTe': 0.0, 'ActTe': 0.0})
                continue
            # --- segmentation (POS tagging on: results arrive as tuples) ---
            segments = pynlpir.segment(text, pos_tagging=True)
            word_list = []
            for seg in segments:
                if isinstance(seg, tuple) and len(seg) == 2:
                    word_list.append(seg[0])
                else:
                    print(f"异常分词结果: {seg}")
            # --- count DUT-lexicon emotion hits per major category ---
            # One increment per (word, category) pair, so the maximum
            # possible arousal sum is exactly 7 * total_emotion_words.
            category_counts = defaultdict(int)
            total_emotion_words = 0
            for word in word_list:
                if word in EMOTION_DICT:
                    for category in EMOTION_DICT[word]:
                        category_counts[category] += 1
                        total_emotion_words += 1
            # --- valence: PosTe = pos / (pos + neg); 0.5 when neutral ---
            counts = senti.sentiment_count(text)
            positive = counts['pos']
            negative = counts['neg']
            total = positive + negative
            pos_te = positive / total if total > 0 else 0.5
            # --- arousal: strength-weighted mean, normalized to [0, 1]
            # (max possible value per word is the top strength, 7) ---
            if total_emotion_words == 0:
                act_te = 0.0
            else:
                total_arousal = sum(
                    cnt * AROUSAL_STRENGTH[cat]
                    for cat, cnt in category_counts.items()
                )
                act_te = total_arousal / (7 * total_emotion_words)
            results.append({
                'PosTe': round(pos_te, 3),
                'ActTe': round(act_te, 3),
            })
            # Progress report every 100 rows.
            if (idx + 1) % 100 == 0:
                print(f"已处理 {idx + 1}/{len(df)} 条数据")
        # Write scores back into the same file.
        df['PosTe'] = [r['PosTe'] for r in results]
        df['ActTe'] = [r['ActTe'] for r in results]
        df.to_csv(file_path, index=False, encoding='utf-8-sig')
        return results
    except Exception as e:
        print(f"文本分析错误:{str(e)}")
        # df is None when read_csv itself failed — return an empty default
        # list instead of raising NameError inside the handler.
        n_rows = len(df) if df is not None else 0
        return [{'PosTe': 0.0, 'ActTe': 0.0} for _ in range(n_rows)]
    finally:
        pynlpir.close()  # release NLPIR resources
if __name__ == "__main__":
    # Script entry point: score the default dataset in place.
    analyze_text('data_all_second_ver.csv')