# (file header metadata removed: scrape artifact "161 lines / 5.1 KiB / Python")
import pandas as pd
|
||
from cnsenti import Sentiment
|
||
import pynlpir
|
||
from collections import defaultdict
|
||
import warnings
|
||
|
||
# Silence noisy library warnings (pandas / pynlpir deprecation chatter).
warnings.filterwarnings('ignore')

# ------------------ Text arousal - initial configuration ---------------------
# Arousal intensity assigned to each of the seven major emotion categories
# (adapted from Liao Shengqing's method); higher value = stronger arousal.
AROUSAL_STRENGTH = {
    '惊': 7,  # surprise — highest arousal
    '惧': 6,  # fear
    '怒': 5,  # anger
    '乐': 4,  # joy
    '恶': 3,  # disgust
    '好': 2,  # fondness
    '哀': 1   # sorrow — lowest arousal
}

# Mapping from fine-grained emotion codes to the seven major categories
# (per the Dalian University of Technology emotion-lexicon specification).
EMOTION_CATEGORY_MAP = {
    'PA': '乐',  # happiness
    'PE': '乐',  # peace of mind
    'PD': '好',  # respect
    'PH': '好',  # praise
    'PG': '好',  # trust
    'PB': '好',  # fondness
    'PK': '好',  # wish / blessing
    'NA': '怒',  # anger
    'NB': '哀',  # sadness
    'NJ': '哀',  # disappointment
    'NH': '哀',  # guilt
    'PF': '哀',  # longing
    'NI': '惧',  # panic
    'NC': '惧',  # fear
    'NG': '惧',  # shame
    'NE': '恶',  # boredom / annoyance
    'ND': '恶',  # loathing
    'NN': '恶',  # blame
    'NK': '恶',  # jealousy
    'NL': '恶',  # suspicion
    'PC': '惊'   # surprise
}
|
||
|
||
# ------------------处理文本 ---------------------
|
||
# ------------------ Text processing ---------------------
def load_emotion_dict(excel_path):
    """Load the DUT emotion lexicon (main-category column only).

    Each fine-grained code in the '情感分类' column is mapped to one of the
    seven major categories via EMOTION_CATEGORY_MAP; multiple codes may be
    separated by '/'.

    Parameters
    ----------
    excel_path : str
        Path to the lexicon Excel file (expects '词语' and '情感分类' columns).

    Returns
    -------
    defaultdict
        word -> sorted list of unique major-category labels.
        Empty (but usable) on load failure, so module import never crashes.
    """
    emotion_dict = defaultdict(list)
    # Collect into sets so duplicates are discarded as we go, instead of a
    # second list(set(...)) pass whose order was unstable across runs.
    collected = defaultdict(set)
    try:
        df = pd.read_excel(excel_path)
        for _, row in df.iterrows():
            word = str(row['词语']).strip()
            if not word:
                continue

            # Only the main "情感分类" column is considered.
            main_cls = str(row['情感分类']).strip() if pd.notna(row['情感分类']) else ''
            for cls in main_cls.split('/'):
                cls = cls.strip()
                if cls in EMOTION_CATEGORY_MAP:
                    collected[word].add(EMOTION_CATEGORY_MAP[cls])

        # Sorted for deterministic output order.
        for word, cats in collected.items():
            emotion_dict[word] = sorted(cats)

    except Exception as e:
        # Best-effort: report and return whatever was collected (possibly
        # nothing) rather than aborting the whole module import.
        print(f"情感词典加载错误: {str(e)}")
    return emotion_dict


# Module-level lexicon used by analyze_text (empty if loading failed).
EMOTION_DICT = load_emotion_dict('情感词汇本体/情感词汇本体.xlsx')
|
||
|
||
def analyze_text(file_path):
    """Score each row's 'title' for sentiment valence (PosTe) and arousal (ActTe).

    Reads a CSV with a 'title' column, computes per-row scores, appends the
    'PosTe' and 'ActTe' columns, and writes the result back to the same file.

    PosTe = positive / (positive + negative) word counts (0.5 when neither).
    ActTe = sum(count * AROUSAL_STRENGTH[cat]) / (7 * emotion-word count),
    i.e. normalized to [0, 1]; 0.0 when no lexicon word matched.

    Parameters
    ----------
    file_path : str
        Path to the input CSV; the file is overwritten with the scores added.

    Returns
    -------
    list[dict]
        One {'PosTe': float, 'ActTe': float} per input row; on failure,
        zero-valued dicts for each row read (empty list if the read failed).
    """
    df = None  # guarded so the except path never hits NameError
    try:
        # 1. Load the data, and initialize NLPIR / the cnsenti analyzer ONCE
        #    (the original re-opened NLPIR and rebuilt Sentiment per row).
        df = pd.read_csv(file_path)
        pynlpir.open()
        senti = Sentiment()
        results = []

        # 2. Process every title row.
        for idx, row in df.iterrows():
            text = str(row['title']).strip() if pd.notna(row['title']) else ''

            if not text:
                results.append({'PosTe': 0.0, 'ActTe': 0.0})
                continue

            # Tokenize with POS tagging enabled; tolerate malformed segments.
            segments = pynlpir.segment(text, pos_tagging=True)
            word_list = []
            for seg in segments:
                if isinstance(seg, tuple) and len(seg) == 2:
                    word_list.append(seg[0])
                else:
                    print(f"异常分词结果: {seg}")

            # 3. Count lexicon hits per major emotion category.
            category_counts = defaultdict(int)
            total_emotion_words = 0
            for word in word_list:
                if word in EMOTION_DICT:
                    for category in EMOTION_DICT[word]:
                        category_counts[category] += 1
                    total_emotion_words += 1

            # 4. Valence (PosTe): pos / (pos + neg), neutral 0.5 fallback.
            #    (A redundant second pynlpir.segment call whose result was
            #    never used has been removed here.)
            result = senti.sentiment_count(text)
            positive = result['pos']
            negative = result['neg']
            total = positive + negative
            pos_te = positive / total if total > 0 else 0.5

            # 5. Arousal (ActTe): weighted category counts normalized by the
            #    maximum possible weight (7 per matched emotion word).
            if total_emotion_words == 0:
                act_te = 0.0
            else:
                total_arousal = sum(
                    count * AROUSAL_STRENGTH[cat]
                    for cat, count in category_counts.items()
                )
                act_te = total_arousal / (7 * total_emotion_words)

            results.append({
                'PosTe': round(pos_te, 3),
                'ActTe': round(act_te, 3),
            })

            # Progress report every 100 rows.
            if (idx + 1) % 100 == 0:
                print(f"已处理 {idx + 1}/{len(df)} 条数据")

        # 6. Persist the scores back into the source CSV.
        df['PosTe'] = [r['PosTe'] for r in results]
        df['ActTe'] = [r['ActTe'] for r in results]
        df.to_csv(file_path, index=False, encoding='utf-8-sig')

        return results

    except Exception as e:
        print(f"文本分析错误:{str(e)}")
        # df is None when read_csv itself failed; the original raised a
        # NameError on len(df) in that case.
        n_rows = len(df) if df is not None else 0
        return [{'PosTe': 0.0, 'ActTe': 0.0} for _ in range(n_rows)]

    finally:
        # Release NLPIR resources; ignore close errors (NLPIR may never
        # have been opened if the CSV read failed).
        try:
            pynlpir.close()
        except Exception:
            pass
|
||
|
||
if __name__ == "__main__":
    # Entry point: score the dataset CSV in place (adds PosTe/ActTe columns).
    file_path = 'data_all_second_ver.csv'
    analyze_text(file_path)