Compare commits
11 Commits
Author | SHA1 | Date | |
---|---|---|---|
aa9563b289 | |||
8bb89eff36 | |||
991232b1ec | |||
![]() |
4babdcdb54 | ||
![]() |
6afdec65df | ||
![]() |
097eac7444 | ||
0a480de300 | |||
206f6d233a | |||
f399d06390 | |||
5887b83698 | |||
29cadea1a4 |
1
.idea/.name
generated
Normal file
1
.idea/.name
generated
Normal file
@ -0,0 +1 @@
|
||||
readme.md
|
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
@ -3,5 +3,5 @@
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="SAM-bilibil" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="SAM-bilibil" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12" project-jdk-type="Python SDK" />
|
||||
</project>
|
2
.idea/statistics_model2025.iml
generated
2
.idea/statistics_model2025.iml
generated
@ -2,7 +2,7 @@
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="SAM-bilibil" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.12" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
|
BIN
Final_Data.xlsx
Normal file
BIN
Final_Data.xlsx
Normal file
Binary file not shown.
BIN
Final_Data_Quantificated.xlsx
Normal file
BIN
Final_Data_Quantificated.xlsx
Normal file
Binary file not shown.
@ -5,40 +5,17 @@ import requests
|
||||
from io import BytesIO
|
||||
from PIL import Image
|
||||
import os
|
||||
from colorthief import ColorThief
|
||||
import pytesseract
|
||||
from multiprocessing import Pool
|
||||
from cnsenti import Sentiment
|
||||
import pynlpir
|
||||
from collections import defaultdict
|
||||
from sklearn.cluster import MiniBatchKMeans
|
||||
from tqdm import tqdm
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
#设置OCR路径
|
||||
pytesseract.pytesseract.tesseract_cmd = r'D:Program files\Tesseract-OCR\tesseract.exe'
|
||||
|
||||
# ------------------图像处理-初始化配置 ---------------------
|
||||
# 人脸检测模型初始化
|
||||
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
|
||||
|
||||
# 图像情感模型系数(基于IAPS数据集校准)
|
||||
VALENCE_WEIGHTS = {
|
||||
'warm_ratio': 0.35,
|
||||
'brightness': 0.15,
|
||||
'symmetry': 0.20,
|
||||
'colorfulness': 0.30
|
||||
}
|
||||
|
||||
AROUSAL_WEIGHTS = {
|
||||
'contrast': 0.40,
|
||||
'edge_density': 0.35,
|
||||
'saturation_std': 0.25
|
||||
}
|
||||
|
||||
# 暖色调定义(HSV色相范围)
|
||||
WARM_HUE_RANGE = (0, 60) # 红色到黄色
|
||||
|
||||
# ------------------处理图像 ---------------------
|
||||
def get_image(url):
|
||||
"""从URL获取图像并预处理"""
|
||||
@ -51,56 +28,72 @@ def get_image(url):
|
||||
return None
|
||||
|
||||
|
||||
def analyze_image(img):
|
||||
"""分析图像特征"""
|
||||
if img is None:
|
||||
return {}
|
||||
def extract_color_palette(img_rgb, color_count=5):
    """Extract the dominant colours of an image and the pixel share of each.

    The pixels are clustered with MiniBatchKMeans (fixed ``random_state`` so
    repeated runs agree) and every cluster centre becomes one palette entry.

    Args:
        img_rgb: Image array whose last axis holds the three colour channels;
            it is flattened with ``reshape(-1, 3)``.
        color_count: Number of clusters, i.e. palette entries, to produce.

    Returns:
        A list of ``(color, ratio)`` tuples sorted by descending ``ratio``.
        ``color`` is the cluster centre truncated to integers and ``ratio``
        is the fraction of pixels assigned to that cluster.
    """
    pixels = img_rgb.reshape(-1, 3)
    kmeans = MiniBatchKMeans(n_clusters=color_count, random_state=0)
    labels = kmeans.fit_predict(pixels)

    # minlength guarantees one counter per cluster. Without it bincount stops
    # at the highest label actually used, so an image with fewer distinct
    # colours than clusters made counts[i] raise IndexError below.
    counts = np.bincount(labels, minlength=color_count)
    total = counts.sum()

    palette = [
        (kmeans.cluster_centers_[i].astype(int), counts[i] / total)
        for i in range(color_count)
    ]

    # Largest share first.
    return sorted(palette, key=lambda entry: -entry[1])
|
||||
|
||||
|
||||
def calculate_affect(features):
|
||||
"""计算情感效价和唤醒度"""
|
||||
poslm = sum(features[k] * VALENCE_WEIGHTS[k] for k in VALENCE_WEIGHTS)
|
||||
actlm = sum(features[k] * AROUSAL_WEIGHTS[k] for k in AROUSAL_WEIGHTS)
|
||||
def classify_hsv_color(rgb_color):
    """Classify an RGB colour as 'warm', 'cool' or 'neutral'.

    Args:
        rgb_color: Sequence of three channel values (R, G, B); values are
            clipped to 0-255 before conversion.

    Returns:
        'neutral' when saturation or value is below 0.2, 'warm' when the hue
        lies in [0, 90) or [270, 360) degrees, otherwise 'cool'. Any error
        during conversion is reported on stdout and yields 'neutral'.
    """
    try:
        # Make sure the input is a valid 8-bit RGB triple.
        rgb_color = np.clip(rgb_color, 0, 255)
        hsv = cv2.cvtColor(np.uint8([[rgb_color]]), cv2.COLOR_RGB2HSV)[0][0]

        # For 8-bit images OpenCV stores hue as degrees / 2 (0-179), so it
        # has to be scaled back before comparing against degree thresholds.
        # Go through int() first: doubling a uint8 scalar can wrap around.
        hue_deg = int(hsv[0]) * 2
        s, v = hsv[1] / 255.0, hsv[2] / 255.0  # normalise to 0-1

        # Neutral test (the original comment attributes these thresholds to
        # the Palmer criterion).
        if s < 0.2 or v < 0.2:
            return 'neutral'

        # Hue classification on the 0-360 degree colour wheel.
        if hue_deg < 90 or hue_deg >= 270:
            return 'warm'
        return 'cool'
    except Exception as e:
        print(f"颜色越界: {str(e)}")
        return 'neutral'  # best-effort: fall back to neutral on any error
|
||||
|
||||
def determine_warm_tone(img_rgb):
    """Return the (warm, cool, neutral) colour shares of an image.

    The palette produced by ``extract_color_palette`` is classified colour by
    colour with ``classify_hsv_color`` and the pixel ratios are summed per
    category.

    Args:
        img_rgb: Image array passed straight to ``extract_color_palette``.

    Returns:
        A tuple ``(warm_ratio, cool_ratio, neutral_ratio)``.
    """
    shares = {'warm': 0.0, 'cool': 0.0, 'neutral': 0.0}

    for color, ratio in extract_color_palette(img_rgb):
        label = classify_hsv_color(color)
        # Anything that is neither warm nor cool counts as neutral.
        bucket = label if label in ('warm', 'cool') else 'neutral'
        shares[bucket] += ratio

    return shares['warm'], shares['cool'], shares['neutral']
|
||||
|
||||
|
||||
def detect_human(img):
    """Report whether the Haar face cascade finds a face in a BGR image.

    Args:
        img: Image array; it is converted with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        True when ``face_cascade`` reports at least one detection.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Tuned parameter set (rationale taken from the original comments).
    detector_options = dict(
        scaleFactor=1.02,  # smaller scale step, finer detection granularity
        minNeighbors=5,  # demand more neighbours to cut false positives
        minSize=(50, 50),  # smallest face size expected on Bilibili covers
        flags=cv2.CASCADE_FIND_BIGGEST_OBJECT,  # prefer the largest face
    )
    detections = face_cascade.detectMultiScale(grayscale, **detector_options)
    return len(detections) != 0
|
||||
|
||||
|
||||
@ -110,24 +103,18 @@ def process_url(url):
|
||||
if img is None:
|
||||
return None
|
||||
|
||||
features = analyze_image(img)
|
||||
affect = calculate_affect(features)
|
||||
|
||||
# 主色分析
|
||||
color_thief = ColorThief(BytesIO(requests.get(url).content))
|
||||
dominant_color = color_thief.get_color(quality=1)
|
||||
hsv_color = cv2.cvtColor(np.uint8([[dominant_color]]), cv2.COLOR_RGB2HSV)[0][0]
|
||||
warm = 1 if WARM_HUE_RANGE[0] <= hsv_color[0] <= WARM_HUE_RANGE[1] else 0
|
||||
|
||||
# 人像检测
|
||||
has_human = detect_human(img)
|
||||
# 颜色分析
|
||||
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
||||
warm_ratio, cool_ratio, neutral_ratio = determine_warm_tone(img_rgb)
|
||||
|
||||
return {
|
||||
'url': url,
|
||||
**affect,
|
||||
'Warm': warm,
|
||||
'Portrait': int(has_human)
|
||||
'Portrait': int(detect_human(img)),
|
||||
'WarmRatio': round(warm_ratio, 3),
|
||||
'CoolRatio': round(cool_ratio, 3),
|
||||
'NeutralRatio': round(neutral_ratio, 3)
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing {url}: {str(e)}")
|
||||
return None
|
||||
@ -135,42 +122,51 @@ def process_url(url):
|
||||
|
||||
# 批量处理
|
||||
def batch_process(urls, workers=4):
    """Analyse cover URLs in a process pool and tabulate the outcome.

    Args:
        urls: Sized collection of image URLs (``len(urls)`` is used for the
            progress bar).
        workers: Number of worker processes in the pool.

    Returns:
        A ``pandas.DataFrame`` with one row per input URL, in input order.
        Rows whose ``process_url`` call returned ``None`` only carry ``url``
        and ``success=False``; the others carry the returned fields plus
        ``success=True``.
    """
    # Start from a "failed" placeholder per URL so every input keeps a row.
    rows = [{'url': link, 'success': False} for link in urls]

    with Pool(workers) as pool:
        progress = tqdm(pool.imap(process_url, urls),
                        total=len(urls),
                        desc="处理进度")
        outcomes = list(progress)

    # Outcomes arrive in input order, so position idx belongs to rows[idx].
    for idx, outcome in enumerate(outcomes):
        if outcome is None:
            continue
        outcome['success'] = True
        rows[idx] = outcome

    return pd.DataFrame(rows)
|
||||
|
||||
|
||||
# 使用示例
|
||||
if __name__ == "__main__":
|
||||
# 读取URL列表
|
||||
input_csv = "data_all.csv"
|
||||
#输出路径
|
||||
os.makedirs('./result', exist_ok=True)
|
||||
output_csv = "result/analysis_results.csv"
|
||||
|
||||
##完整运行
|
||||
# # 读取URL列表
|
||||
# input_csv = "data_all_first_ver.csv"
|
||||
# #输出路径
|
||||
# os.makedirs('./result', exist_ok=True)
|
||||
# output_csv = "result/analysis_results.csv"
|
||||
#
|
||||
# #完整运行
|
||||
# df = pd.read_csv(input_csv)
|
||||
# urls = df['视频封面'].tolist()
|
||||
# urls = df['视频封面URL'].tolist()
|
||||
#
|
||||
# # 执行分析
|
||||
# result_df = batch_process(urls)
|
||||
#
|
||||
# # 合并原始数据
|
||||
# final_df = df.merge(result_df, left_on='视频封面')
|
||||
# final_df.drop('url', axis=1).to_csv(output_csv, index=False)
|
||||
# result_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
|
||||
# print(f"成功处理 {len(result_df)}/{len(urls)} 张图片")
|
||||
# print("分析完成!结果已保存至", output_csv)
|
||||
|
||||
#重新执行失败的url
|
||||
urls_failed=[
|
||||
'http://i1.hdslb.com/bfs/archive/5c42e0fa42ec945106d2e167253889e8a05541c9.jpg',
|
||||
'http://i1.hdslb.com/bfs/archive/d2ca3e3f4c543245715937bf643e98b55badcc21.jpg',
|
||||
'http://i0.hdslb.com/bfs/archive/2b1cf64d70bf2036793e33b2de3067344a7ff77d.jpg',
|
||||
'http://i0.hdslb.com/bfs/archive/123ddc4cdf429968fa416f78f4049a728e8da3ab.jpg',
|
||||
'http://i2.hdslb.com/bfs/archive/b07446d2176cec63d42f204504f4cda7a940b05b.jpg',
|
||||
]
|
||||
result_failed = batch_process(urls_failed)
|
||||
result_failed.to_csv('result/reanalyze.csv', index=False, encoding='utf-8-sig')
|
||||
|
||||
# 示例URL列表
|
||||
#小批量实验
|
||||
urls = [
|
||||
'http://i0.hdslb.com/bfs/archive/393a8e961b704d43256fe7e6c89fee04df966e17.jpg',
|
||||
'http://i0.hdslb.com/bfs/archive/072e16a1237040941f15b1ed67a8d1ebe6f2e041.jpg',
|
||||
'http://i2.hdslb.com/bfs/archive/1c56b5bec767c604175983cc5926f5832baa9bb8.jpg',
|
||||
'http://i0.hdslb.com/bfs/archive/66384e53a15345a539ccbb2989442f1d960b9235.jpg',
|
||||
'http://i2.hdslb.com/bfs/archive/836b762456f0b4d65dd2c40fc4cd120107e46b88.jpg',
|
||||
]
|
||||
result_df = batch_process(urls)
|
||||
result_df.to_csv("result/analysis_results.csv", index=False)
|
||||
print(f"成功处理 {len(result_df)}/{len(urls)} 张图片")
|
||||
|
||||
|
||||
print("分析完成!结果已保存至", output_csv)
|
File diff suppressed because one or more lines are too long
68573
data_text_all/comments_hot.txt
Normal file
68573
data_text_all/comments_hot.txt
Normal file
File diff suppressed because it is too large
Load Diff
923
data_text_all/description.txt
Normal file
923
data_text_all/description.txt
Normal file
File diff suppressed because one or more lines are too long
8311
data_text_all/tags.txt
Normal file
8311
data_text_all/tags.txt
Normal file
File diff suppressed because it is too large
Load Diff
1136
data_text_all/标题.txt
Normal file
1136
data_text_all/标题.txt
Normal file
File diff suppressed because it is too large
Load Diff
49
readme.md
49
readme.md
@ -1,5 +1,5 @@
|
||||
# 数据处理
|
||||
|
||||
原数据文件+量化后文件两个文件
|
||||
## 合并数据文件
|
||||
### 1. 合并热门数据
|
||||
- 数据文件
|
||||
@ -36,7 +36,7 @@
|
||||
- 视频类型:搬运0,自制1
|
||||
- 字幕: 无字幕为0,剩下为1
|
||||
- 视频总时长:按小于60、60到600之间、大于600三档,分别赋值为1,2,3,方便后续描述性分析
|
||||
- 弹幕情感评分=0.8*snowNLP+0.2*RoBERTa
|
||||
- 弹幕情感评分(SentimentScore)=0.8*snowNLP+0.2*RoBERTa
|
||||
### 删除不用指标
|
||||
- 发布时间等上述被处理过的指标(原播放量要保留)
|
||||
- 视频简介、标签
|
||||
@ -44,4 +44,47 @@
|
||||
### 数据清洗
|
||||
- 筛选极端弹幕情感评分,筛选出两种差值>0.3的人工检查
|
||||
- 缺失值处理(按总平均填入)
|
||||
- 去除异常值
|
||||
- 去除异常值
|
||||
## 指标创新
|
||||
新增指标:
|
||||
- 弹幕情感评分snowNLP和RoBERTa及其加权平均作为最终评分(SentimentScore)
|
||||
- 标题情感效价(PosTe)和情感唤醒度(ActTe)
|
||||
- 封面:
|
||||
- 是否有人像(Portrait)
|
||||
- 暖色比例(WarmRatio)
|
||||
- 冷色比例(CoolRatio)
|
||||
- 中性色比例(NeutralRatio)
|
||||
### 弹幕情感评分
|
||||
弹幕情感倾向可以反映用户对视频的喜爱程度。此前我们爬取了每个视频的弹幕,为分析弹幕的情感趋向,我们设计了字典法和模型法两套计算方案,并且最终采用加权平均的方法求得最终值以提高结果的可信度。
|
||||
字典法采用SnowNLP库进行情感分析,得到的情感评分范围在0到1之间,其中0表示负面情感,1表示正面情感。
|
||||
字典法运算较为快速,同时也较为传统, 为适应B站弹幕的语言特点,我们手动增添了部分词汇并给予一定的情感赋值(以原词典中很好0.78,一般0.52,差0.14为标准),
|
||||
如"爷青回"0.9(我的青春回来了),"yyds"0.9(永远的神),"awsl"0.8(啊我死了(感动、可爱)),"2333"0.6(笑),"DNA动了"0.8(触发记忆),
|
||||
得到“弹幕情感评分snowNLP”指标。
|
||||
|
||||
模型法采用前人预训练好的RoBERTa模型进行情感分析,情感评分规则同上。
|
||||
RoBERTa是由Liu等人在2019年提出的改进版BERT模型,适用于文本分类和情感分析,具有一定的鲁棒性(引用论文)。
|
||||
在此基础上,我们采用开源的Erlangshen-Roberta-330M模型,其已在中文领域经过调整,拥有3.3亿个参数,在京东、微博评论等数据集上表现良好(引用论文), 因此较为适合B站的弹幕情感分析。
|
||||
由于弹幕数据量大,计算量很大,我们对单个视频弹幕量超过500的作均匀抽样处理(500条),并且借助学校高性能运算中心提供的平台进行计算~~虽然没什么用~~,得到"弹幕情感评分RoBERTa"指标。
|
||||
最终,我们采用加权平均的方法结合两种方法的结果,得到最终的弹幕情感评分(SentimentScore)。
|
||||
|
||||
### 标题文本的情感效价(PosTe)和情感唤醒度(ActTe)
|
||||
(验证了论文里的封面文本的情感效价和情感唤醒度的计算,发现OCR的识别效果并不好,不过函数编都编了,遂应用到视频标题上去。)
|
||||
|
||||
步骤:
|
||||
1. 定义见论文<<应急科普>>P37表格
|
||||
2. 情感效价PosTe的计算:
|
||||
使用NLPIR分词与情感词标注(Python的“cnsenti”包),统计文本中的积极、消极词汇数,然后依据廖圣清、程俊超等学者的做法以“积极词汇数/(积极词汇数+消极词汇数)”作为该条文本的情感效价。
|
||||
若文本中积极、消极词汇同时为0,将PosTe赋值为0.5,表示为中性.
|
||||
3. 情感唤醒度ActTe的计算:
|
||||
对分词结果对照大连理工情感词典(引用论文:徐琳宏,林鸿飞,潘宇,任惠,陈建美.情感词汇本体的构造.情报学报,2008,27(2):180-185)
|
||||
匹配到情感分类(小类),进一步匹配到情感大类, 即“哀”、"好"、“恶”、“乐”、“怒”、“惧”、“惊”七种情感,
|
||||
并借鉴廖圣清等学者的做法,根据情感唤醒度的强弱,分别赋值为1-7。
|
||||
对文本中反映该七种情感的字词出现的频数进行统计,将七种情感所包含的词汇数目分别乘以对应的情感唤醒程度的赋值,归一化后作为该标题文本的情感唤醒度。
|
||||
对标题文本匹配结果为0的,表明主要为中性词汇,将ActTe赋值为0.
|
||||
|
||||
### 视频封面处理
|
||||
视频的封面对其传播具有重要影响,但由于封面吸引力等因素具有主观性,以往的研究较为有限。
|
||||
在这里,我们提取了是否有人像、暖色比例、冷色比例、中性色比例四个客观指标辅助分析。
|
||||
我们使用了OpenCV库加载了预训练的人脸检测模型,并调用`detectMultiScale`方法进行人像检测。
|
||||
并使用改进的k-means聚类算法(MiniBatchK-means)提取封面主色,结合HSV色彩空间分类标准计算色调比例。
|
||||
|
||||
|
1137
result/analysis_results.csv
Normal file
1137
result/analysis_results.csv
Normal file
File diff suppressed because it is too large
Load Diff
6
result/reanalyze.csv
Normal file
6
result/reanalyze.csv
Normal file
@ -0,0 +1,6 @@
|
||||
url,success,Portrait,WarmRatio,CoolRatio,NeutralRatio
|
||||
http://i1.hdslb.com/bfs/archive/5c42e0fa42ec945106d2e167253889e8a05541c9.jpg,False,,,,
|
||||
http://i1.hdslb.com/bfs/archive/d2ca3e3f4c543245715937bf643e98b55badcc21.jpg,False,,,,
|
||||
http://i0.hdslb.com/bfs/archive/2b1cf64d70bf2036793e33b2de3067344a7ff77d.jpg,True,1.0,0.178,0.414,0.408
|
||||
http://i0.hdslb.com/bfs/archive/123ddc4cdf429968fa416f78f4049a728e8da3ab.jpg,False,,,,
|
||||
http://i2.hdslb.com/bfs/archive/b07446d2176cec63d42f204504f4cda7a940b05b.jpg,False,,,,
|
|
79
temp.py
Normal file
79
temp.py
Normal file
@ -0,0 +1,79 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from snownlp import SnowNLP
|
||||
import os
|
||||
|
||||
def load_data(file_path):
    """Load the danmaku (bullet-comment) texts from a CSV file.

    Only the ``弹幕内容`` column is read. Missing values are dropped and the
    remaining cells are converted to ``str``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of comment strings. An empty list is returned, after printing
        the reason, when the file cannot be opened or parsed or when the
        column is absent.
    """
    try:
        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
    except (OSError, ValueError) as e:
        # OSError covers missing or unreadable files. ValueError covers
        # pandas' parser and empty-data errors, decoding errors and a
        # missing ``usecols`` column. Anything else is a programming error
        # and should surface instead of being swallowed.
        print(f"数据加载失败: {str(e)}")
        return []
    return df['弹幕内容'].dropna().astype(str).tolist()
|
||||
|
||||
# Hand-assigned sentiment scores for Bilibili slang that the generic lexicon
# does not cover. Per the original author's note the values were calibrated
# against the base dictionary, where "很好" scores 0.78, "一般" 0.52 and
# "差" 0.14. Built once at import time instead of on every call.
_SPECIAL_CASES = {
    # Strongly positive expressions
    "爷青回": 0.9,  # nostalgia ("my youth is back")
    "yyds": 0.9,  # "forever the GOAT"
    'YYDS': 0.9,  # same, upper-case spelling
    "awsl": 0.8,  # "ah, I'm dead" (moved / too cute)
    '阿伟死了': 0.8,  # homophone of awsl
    "泪目": 0.8,  # touching scene
    "排面": 0.8,  # impressive showing
    "双厨狂喜": 0.7,  # crossover delight
    "梦幻联动": 0.7,  # dream collaboration
    "注入灵魂": 0.7,  # highlight moment
    "文艺复兴": 0.8,  # classic revived
    # Meme / interaction phrases
    "下次一定": 0.55,  # "next time for sure" (coin-tossing procrastination)
    "你币没了": 0.45,  # threat to withhold coins
    "空降成功": 0.5,  # skipped the intro
    "标准结局": 0.5,  # as expected
    "典中典": 0.4,  # classic repetition (derogatory tint)
    # Famous scenes
    "名场面": 0.85,  # iconic scene
    "神仙打架": 0.9,  # clash of masters
    "前方高能": 0.7,  # climax warning
    # Number slang
    "666": 0.75,  # well played
    "999": 0.75,  # 666 upside down
    "2333": 0.6,  # laughter
    # Abstract culture
    "草": 0.6,  # laughter (neutral)
    "生草": 0.65,  # funny scene
    # Emotional breakdown
    "破防了": 0.4,  # defences broken
    "我裂开了": 0.3,  # falling apart
    # Domain-specific memes
    "奥利给": 0.8,  # cheering on
    "DNA动了": 0.8,  # memory triggered
    "有内味了": 0.7,  # got the right flavour
    # Negative scenes
    "公开处刑": 0.5,  # embarrassing moment
    "阴间": 0.3,  # eerie content
    "阴间滤镜": 0.3,  # eerie visuals
    "血压上来了": 0.3,  # irritating
}


def analyze_sentiment(danmu_texts):
    """Return the mean sentiment score of a collection of danmaku texts.

    A text that exactly equals a key of ``_SPECIAL_CASES`` takes the
    hand-assigned score; every other text is scored by
    ``SnowNLP(text).sentiments``.

    Args:
        danmu_texts: Iterable of comment strings.

    Returns:
        The arithmetic mean of the per-text scores, or NaN when
        ``danmu_texts`` is empty.
    """
    sentiment_scores = []
    for item in danmu_texts:
        override = _SPECIAL_CASES.get(item)
        if override is not None:
            sentiment_scores.append(override)
        else:
            sentiment_scores.append(SnowNLP(item).sentiments)

    # np.mean([]) also yields NaN but emits a RuntimeWarning; an empty batch
    # is a normal outcome (load_data returns [] on failure), so handle it
    # explicitly and quietly.
    if not sentiment_scores:
        return np.nan

    return np.mean(sentiment_scores)
|
||||
|
||||
# file_path='hot_data/亲子/BV1TLXVYREDt/BV1TLXVYREDt_287_danmaku.csv'
|
||||
# df = load_data(file_path)
|
||||
# scores=analyze_sentiment(df)
|
||||
# print(scores)
|
||||
|
||||
# 测试
|
||||
test_words = ['4']
|
||||
s = analyze_sentiment(test_words)
|
||||
print(s)
|
@ -157,5 +157,5 @@ def analyze_text(file_path):
|
||||
pynlpir.close() # 确保释放NLPIR资源
|
||||
|
||||
if __name__ == "__main__":
|
||||
file_path = 'data_all.csv'
|
||||
file_path = 'data_all_second_ver.csv'
|
||||
analyze_text(file_path)
|
Loading…
x
Reference in New Issue
Block a user