statistics_model2025/title_analyzer.py

161 lines
5.1 KiB
Python
Raw Permalink Normal View History

2025-04-01 11:20:27 +08:00
import pandas as pd
from cnsenti import Sentiment
import pynlpir
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
# ------------------ Text arousal: initial configuration ---------------------
# Arousal strength assigned to each major emotion class (adapted from
# Liao Shengqing's method).
# NOTE(review): every key below is the empty string as the file stands, and a
# dict literal with repeated keys keeps only the last value, so this currently
# evaluates to {'': 1}. Presumably the seven keys were distinct major-class
# labels that were lost somewhere upstream -- TODO restore them from the
# original file before trusting any ActTe value.
AROUSAL_STRENGTH = {
'': 7, # highest arousal
'': 6,
'': 5,
'': 4,
'': 3,
'': 2,
'': 1 # lowest arousal
}
# Mapping from emotion sub-class code to major class (following the Dalian
# University of Technology lexicon convention).
# NOTE(review): every value below is the empty string as the file stands, so
# all sub-classes map to the same (empty) major class. Presumably each value
# was one of the major-class labels used as keys of AROUSAL_STRENGTH -- TODO
# restore them from the original file.
EMOTION_CATEGORY_MAP = {
'PA': '', # joy
'PE': '', # ease
'PD': '', # respect
'PH': '', # praise
'PG': '', # trust
'PB': '', # fondness
'PK': '', # well-wishing
'NA': '', # anger
'NB': '', # sadness
'NJ': '', # disappointment
'NH': '', # guilt
'PF': '', # longing
'NI': '', # panic
'NC': '', # fear
'NG': '', # shame
'NE': '', # vexation
'ND': '', # loathing
'NN': '', # reproach
'NK': '', # envy
'NL': '', # suspicion
'PC': '' # surprise
}
# ------------------处理文本 ---------------------
def load_emotion_dict(excel_path):
    """Load the Dalian University of Technology emotion lexicon.

    Only the primary classification column ('情感分类') is used. A cell may
    hold several sub-class codes separated by '/'; each code is translated
    through ``EMOTION_CATEGORY_MAP`` and unknown codes are ignored.

    Args:
        excel_path: Path of the lexicon workbook. It must provide the
            columns '词语' (word) and '情感分类' (emotion sub-class).

    Returns:
        A ``defaultdict(list)`` mapping each word to its major-class labels,
        without duplicates and in first-seen order. Words with no recognised
        sub-class are not added. If loading fails, the error is printed and
        whatever was collected so far (possibly nothing) is returned.
    """
    emotion_dict = defaultdict(list)
    try:
        df = pd.read_excel(excel_path)
        for raw_word, raw_cls in zip(df['词语'], df['情感分类']):
            # A missing word cell would otherwise be stringified to the
            # literal 'nan' and registered as a lexicon entry.
            if pd.isna(raw_word) or pd.isna(raw_cls):
                continue
            word = str(raw_word).strip()
            if not word:
                continue
            for cls in str(raw_cls).split('/'):
                category = EMOTION_CATEGORY_MAP.get(cls.strip())
                # Check ``category`` first so the defaultdict entry is only
                # created for words that actually have a recognised class.
                if category is not None and category not in emotion_dict[word]:
                    emotion_dict[word].append(category)
    except Exception as e:
        # Best-effort load: report the problem and keep what was read.
        print(f"情感词典加载错误: {str(e)}")
    return emotion_dict
# Build the emotion lexicon once, at import time. The workbook path is
# relative, so it is resolved against the current working directory.
EMOTION_DICT = load_emotion_dict('情感词汇本体/情感词汇本体.xlsx')
def _valence_score(senti, text):
    """Return positive words / (positive + negative words) for *text*."""
    counts = senti.sentiment_count(text)
    positive = counts['pos']
    negative = counts['neg']
    total = positive + negative
    # No sentiment-bearing word at all: report the neutral midpoint.
    return positive / total if total > 0 else 0.5


def _arousal_score(words):
    """Return the strength-weighted arousal of *words*, scaled by 7 per hit."""
    category_counts = defaultdict(int)
    for word in words:
        # Test membership first: indexing the defaultdict would insert the word.
        if word in EMOTION_DICT:
            for category in EMOTION_DICT[word]:
                category_counts[category] += 1

    total_hits = sum(category_counts.values())
    if total_hits == 0:
        return 0.0

    total_arousal = sum(
        count * AROUSAL_STRENGTH[category]
        for category, count in category_counts.items()
    )
    # Divide by the largest attainable total (strength 7 for every hit).
    return total_arousal / (7 * total_hits)


def analyze_text(file_path):
    """Score every title of a CSV file for sentiment valence and arousal.

    The file is read with pandas and must contain a 'title' column. For each
    title two values are computed and rounded to three decimals:

    * ``PosTe``: share of positive words among the positive and negative
      words counted by cnsenti (0.5 when neither kind occurs).
    * ``ActTe``: arousal derived from the NLPIR-segmented words found in
      ``EMOTION_DICT``, weighted by ``AROUSAL_STRENGTH`` and divided by
      7 times the number of matches (0.0 when nothing matches).

    Rows whose title is missing or blank get 0.0 for both values. On success
    the two columns are added to the table and the input file is overwritten
    in place (UTF-8 with BOM, no index column).

    Args:
        file_path: Path of the CSV file to read and overwrite.

    Returns:
        A list with one ``{'PosTe': float, 'ActTe': float}`` dict per row.
        If processing fails, the error is printed, the file is left
        untouched and a list of zero-valued dicts of the same length is
        returned; if the file cannot be read at all the list is empty.
    """
    # Read the input outside the main guard so the fallback below can always
    # size its result from ``df``.
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"文本分析错误：{str(e)}")
        return []

    nlpir_opened = False
    senti = None
    try:
        results = []
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            text = str(row['title']).strip() if pd.notna(row['title']) else ''
            if not text:
                results.append({'PosTe': 0.0, 'ActTe': 0.0})
                continue

            if senti is None:
                # First non-empty title: start NLPIR and build the cnsenti
                # analyser once instead of once per row.
                pynlpir.open()
                nlpir_opened = True
                senti = Sentiment()

            # POS tagging must stay enabled so every segment is a
            # (word, tag) pair; anything else is reported and skipped.
            word_list = []
            for seg in pynlpir.segment(text, pos_tagging=True):
                if isinstance(seg, tuple) and len(seg) == 2:
                    word_list.append(seg[0])
                else:
                    print(f"异常分词结果: {seg}")

            results.append({
                'PosTe': round(_valence_score(senti, text), 3),
                'ActTe': round(_arousal_score(word_list), 3),
            })

            # Progress report every 100 rows, counted by position so it does
            # not depend on the DataFrame's index labels.
            if position % 100 == 0:
                print(f"已处理 {position}/{len(df)} 条数据")

        # Write the scores back into the source file.
        df['PosTe'] = [r['PosTe'] for r in results]
        df['ActTe'] = [r['ActTe'] for r in results]
        df.to_csv(file_path, index=False, encoding='utf-8-sig')
        return results
    except Exception as e:
        print(f"文本分析错误：{str(e)}")
        # Default results matching the number of input rows.
        return [{'PosTe': 0.0, 'ActTe': 0.0} for _ in range(len(df))]
    finally:
        # Release NLPIR only if this call actually initialised it.
        if nlpir_opened:
            pynlpir.close()
if __name__ == "__main__":
    # Script entry point: score the titles of the default data file in place.
    file_path = 'data_all_second_ver.csv'
    analyze_text(file_path)