# Sentiment valence/arousal annotation for news titles (cnsenti + NLPIR + DUT affective lexicon)
|
|
|
|
import pandas as pd
|
|
|
|
|
from cnsenti import Sentiment
|
|
|
|
|
import pynlpir
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
import warnings
|
|
|
|
|
|
|
|
|
|
# Silence third-party warnings (pandas / NLPIR wrappers are chatty).
warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
|
|
# ------------------ Text arousal — initial configuration ---------------------
|
|
|
|
|
# Arousal-intensity weight per major emotion category
# (adapted from Liao Shengqing's method): higher value = more arousing.
AROUSAL_STRENGTH = {
    '惊': 7,  # surprise — highest arousal
    '惧': 6,  # fear
    '怒': 5,  # anger
    '乐': 4,  # joy
    '恶': 3,  # disgust
    '好': 2,  # fondness
    '哀': 1   # sadness — lowest arousal
}
|
|
|
|
|
|
|
|
|
|
# Mapping from fine-grained emotion codes (Dalian University of Technology
# affective lexicon convention) to the seven major categories above.
EMOTION_CATEGORY_MAP = {
    'PA': '乐',  # joy
    'PE': '乐',  # peace of mind
    'PD': '好',  # respect
    'PH': '好',  # praise
    'PG': '好',  # belief
    'PB': '好',  # fondness
    'PK': '好',  # wish
    'NA': '怒',  # anger
    'NB': '哀',  # sadness
    'NJ': '哀',  # disappointment
    'NH': '哀',  # guilt
    'PF': '哀',  # longing
    'NI': '惧',  # panic
    'NC': '惧',  # fear
    'NG': '惧',  # shame
    'NE': '恶',  # annoyance
    'ND': '恶',  # loathing
    'NN': '恶',  # censure
    'NK': '恶',  # jealousy
    'NL': '恶',  # suspicion
    'PC': '惊'   # surprise
}
|
|
|
|
|
|
|
|
|
|
# ------------------ Text processing ---------------------
def load_emotion_dict(excel_path):
    """Load the DUT affective lexicon, using only the main-category column.

    Args:
        excel_path: path to the lexicon Excel file; expects columns
            '词语' (word) and '情感分类' (emotion classification).

    Returns:
        defaultdict(list) mapping word -> list of unique major emotion
        categories (values of EMOTION_CATEGORY_MAP). Empty on load failure.
    """
    # Collect into sets so categories are deduplicated as we go
    # (the original built lists and deduplicated with list(set(...)) at the end).
    collected = defaultdict(set)
    try:
        df = pd.read_excel(excel_path)
        for _, row in df.iterrows():
            # Skip missing words explicitly: str(NaN) would otherwise yield
            # the literal string 'nan' and pollute the lexicon with a bogus key.
            if pd.isna(row['词语']):
                continue
            word = str(row['词语']).strip()
            if not word:
                continue

            # Only the main '情感分类' column is used; some entries carry
            # several codes separated by '/'.
            main_cls = str(row['情感分类']).strip() if pd.notna(row['情感分类']) else ''
            for cls in main_cls.split('/'):
                cls = cls.strip()
                if cls in EMOTION_CATEGORY_MAP:
                    collected[word].add(EMOTION_CATEGORY_MAP[cls])
    except Exception as e:
        # Best-effort: a missing/corrupt lexicon file yields an empty dict.
        print(f"情感词典加载错误: {str(e)}")

    # Convert back to the original return type (word -> list of categories).
    emotion_dict = defaultdict(list)
    for word, cats in collected.items():
        emotion_dict[word] = list(cats)
    return emotion_dict
|
|
|
|
|
|
|
|
|
|
# Module-level lexicon, loaded once at import time; empty dict if the file
# is missing (load_emotion_dict swallows the error and prints it).
EMOTION_DICT = load_emotion_dict('情感词汇本体/情感词汇本体.xlsx')
|
|
|
|
|
|
|
|
|
|
def analyze_text(file_path):
    """Compute sentiment valence (PosTe) and arousal (ActTe) per 'title' row.

    Reads a CSV with a 'title' column, appends 'PosTe'/'ActTe' columns, and
    writes the result back to the same file (utf-8-sig).

    Args:
        file_path: path to the input/output CSV file.

    Returns:
        list of dicts [{'PosTe': float, 'ActTe': float}, ...], one per row;
        a list of zero-valued defaults on failure.
    """
    results = []
    nlpir_opened = False
    try:
        # 1. Load rows from the CSV file.
        df = pd.read_csv(file_path)

        # Hoist per-call setup out of the row loop: NLPIR only needs to be
        # opened once per run, and Sentiment() reloads its dictionaries on
        # every instantiation (the original did both inside the loop).
        pynlpir.open()
        nlpir_opened = True
        senti = Sentiment()

        # 2. Process each title row.
        for idx, row in df.iterrows():
            text = str(row['title']).strip() if pd.notna(row['title']) else ''

            if not text:
                results.append({'PosTe': 0.0, 'ActTe': 0.0})
                continue

            # Segment with POS tagging enabled; keep only the word tokens.
            segments = pynlpir.segment(text, pos_tagging=True)
            word_list = []
            for seg in segments:
                if isinstance(seg, tuple) and len(seg) == 2:
                    word_list.append(seg[0])
                else:
                    print(f"异常分词结果: {seg}")

            # 3. Match tokens against the emotion lexicon and count the
            # major categories they map to.
            category_counts = defaultdict(int)
            total_emotion_words = 0
            for word in word_list:
                if word in EMOTION_DICT:
                    for category in EMOTION_DICT[word]:
                        category_counts[category] += 1
                    total_emotion_words += 1

            # 4. Valence: PosTe = pos / (pos + neg); 0.5 = neutral fallback
            # when no sentiment words were found.
            # (A second, unused pynlpir.segment(..., pos_tagging=False) call
            # in the original has been removed — its result was dead code.)
            result = senti.sentiment_count(text)
            positive = result['pos']
            negative = result['neg']
            total = positive + negative
            pos_te = positive / total if total > 0 else 0.5

            # 5. Arousal: ActTe normalized to [0, 1] by the maximum possible
            # arousal 7 * N (7 is the top AROUSAL_STRENGTH weight).
            if total_emotion_words == 0:
                act_te = 0.0
            else:
                total_arousal = sum(
                    count * AROUSAL_STRENGTH[cat]
                    for cat, count in category_counts.items()
                )
                act_te = total_arousal / (7 * total_emotion_words)

            results.append({
                'PosTe': round(pos_te, 3),
                'ActTe': round(act_te, 3),
            })

            # Progress report every 100 rows.
            if (idx + 1) % 100 == 0:
                print(f"已处理 {idx + 1}/{len(df)} 条数据")

        # Persist results back into the same CSV.
        df['PosTe'] = [r['PosTe'] for r in results]
        df['ActTe'] = [r['ActTe'] for r in results]
        df.to_csv(file_path, index=False, encoding='utf-8-sig')

        return results

    except Exception as e:
        print(f"文本分析错误:{str(e)}")
        # The original returned len(df) defaults here, which raised NameError
        # when read_csv itself failed; fall back to 0 rows in that case.
        try:
            n_rows = len(df)
        except NameError:
            n_rows = 0
        return [{'PosTe': 0.0, 'ActTe': 0.0} for _ in range(n_rows)]

    finally:
        # Release NLPIR resources only if open() actually succeeded.
        if nlpir_opened:
            pynlpir.close()
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: annotate the default dataset in place.
    file_path = 'data_all_second_ver.csv'
    analyze_text(file_path)
|