Compare commits

...

11 Commits
SyyTmp ... main

Author SHA1 Message Date
aa9563b289 add:likerate 2025-04-06 14:12:50 +08:00
8bb89eff36 finish 2025-04-03 23:24:36 +08:00
991232b1ec data preprocessing finished 2025-04-03 20:33:46 +08:00
Sheyiyuan
4babdcdb54 modify:data_collection_complete 2025-04-02 19:01:51 +08:00
Sheyiyuan
6afdec65df modify:data_collection_complete 2025-04-02 18:57:42 +08:00
Sheyiyuan
097eac7444 fix:data_fix 2025-04-02 18:53:49 +08:00
0a480de300 Merge pull request 'modify:no_hot_data_fix' (#4) from SyyTmp into main
Reviewed-on: #4
2025-04-02 18:52:14 +08:00
206f6d233a update:readme 2025-04-02 10:56:29 +08:00
f399d06390 finish:ImageAnalyse 2025-04-02 09:18:48 +08:00
5887b83698 fix:covers_analyser 2025-04-01 19:56:57 +08:00
29cadea1a4 add:covers_analyser 2025-04-01 19:46:25 +08:00
17 changed files with 81454 additions and 1249 deletions

1
.idea/.name generated Normal file
View File

@ -0,0 +1 @@
readme.md

2
.idea/misc.xml generated
View File

@ -3,5 +3,5 @@
<component name="Black">
<option name="sdkName" value="SAM-bilibil" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="SAM-bilibil" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12" project-jdk-type="Python SDK" />
</project>

View File

@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="SAM-bilibil" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.12" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">

BIN
FSR.xlsx

Binary file not shown.

BIN
Final_Data.xlsx Normal file

Binary file not shown.

Binary file not shown.

View File

@ -5,40 +5,17 @@ import requests
from io import BytesIO
from PIL import Image
import os
from colorthief import ColorThief
import pytesseract
from multiprocessing import Pool
from cnsenti import Sentiment
import pynlpir
from collections import defaultdict
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
#设置OCR路径
pytesseract.pytesseract.tesseract_cmd = r'D:Program files\Tesseract-OCR\tesseract.exe'
# ------------------图像处理-初始化配置 ---------------------
# 人脸检测模型初始化
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# 图像情感模型系数基于IAPS数据集校准
VALENCE_WEIGHTS = {
'warm_ratio': 0.35,
'brightness': 0.15,
'symmetry': 0.20,
'colorfulness': 0.30
}
AROUSAL_WEIGHTS = {
'contrast': 0.40,
'edge_density': 0.35,
'saturation_std': 0.25
}
# 暖色调定义HSV色相范围
WARM_HUE_RANGE = (0, 60) # 红色到黄色
# ------------------处理图像 ---------------------
def get_image(url):
"""从URL获取图像并预处理"""
@ -51,56 +28,72 @@ def get_image(url):
return None
def analyze_image(img):
"""分析图像特征"""
if img is None:
return {}
def extract_color_palette(img_rgb, color_count=5):
"""提取前N种主色及比例"""
# 使用k-means聚类提取主色
pixels = img_rgb.reshape(-1, 3)
kmeans = MiniBatchKMeans(n_clusters=color_count, random_state=0)
labels = kmeans.fit_predict(pixels)
# 转换为HSV颜色空间
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
h, s, v = cv2.split(hsv)
# 计算每种颜色的比例
counts = np.bincount(labels)
total = counts.sum()
palette = []
for i in range(color_count):
ratio = counts[i] / total
color = kmeans.cluster_centers_[i].astype(int)
palette.append((color, ratio))
# 计算基础特征
features = {
'brightness': np.mean(v),
'contrast': np.max(v) - np.min(v),
'saturation_std': np.std(s),
'colorfulness': np.std(h) + np.std(s) + np.std(v)
}
# 暖色比例计算
hue_mask = cv2.inRange(h, WARM_HUE_RANGE[0], WARM_HUE_RANGE[1])
features['warm_ratio'] = np.count_nonzero(hue_mask) / (img.shape[0] * img.shape[1])
# 对称性计算
mid = img.shape[1] // 2
left = img[:, :mid]
right = cv2.flip(img[:, mid:], 1)
features['symmetry'] = cv2.matchTemplate(left, right, cv2.TM_CCOEFF_NORMED)[0][0]
# 边缘密度
edges = cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), 100, 200)
features['edge_density'] = np.mean(edges)
return features
return sorted(palette, key=lambda x: -x[1]) # 按比例降序排列
def calculate_affect(features):
"""计算情感效价和唤醒度"""
poslm = sum(features[k] * VALENCE_WEIGHTS[k] for k in VALENCE_WEIGHTS)
actlm = sum(features[k] * AROUSAL_WEIGHTS[k] for k in AROUSAL_WEIGHTS)
def classify_hsv_color(rgb_color):
"""将RGB颜色分类为暖色/冷色/中性色"""
try:
# 确保输入是有效的RGB颜色值
rgb_color = np.clip(rgb_color, 0, 255)
hsv = cv2.cvtColor(np.uint8([[rgb_color]]), cv2.COLOR_RGB2HSV)[0][0]
h, s, v = hsv[0], hsv[1] / 255.0, hsv[2] / 255.0 # 归一化
# Sigmoid归一化
return {
'Poslm': 2 / (1 + np.exp(-poslm)) - 1,
'Actlm': 2 / (1 + np.exp(-actlm)) - 1
}
# 中性色判断根据Palmer标准
if s < 0.2 or v < 0.2:
return 'neutral'
# 色相分类
if (0 <= h < 90) or (270 <= h <= 360):
return 'warm'
return 'cool'
except Exception as e:
print(f"颜色越界: {str(e)}")
return 'neutral' # 出错时默认返回中性色
def determine_warm_tone(img_rgb):
    """Return the warm, cool and neutral colour shares of an image.

    The dominant colours produced by ``extract_color_palette`` are each
    labelled by ``classify_hsv_color`` and their ratios are summed per label.

    Args:
        img_rgb: Image array that is handed unchanged to
            ``extract_color_palette``.

    Returns:
        A 3-tuple ``(warm_ratio, cool_ratio, neutral_ratio)``.
    """
    totals = {'warm': 0.0, 'cool': 0.0, 'neutral': 0.0}
    for color, ratio in extract_color_palette(img_rgb):
        category = classify_hsv_color(color)
        # Any label other than warm/cool is counted as neutral.
        if category not in ('warm', 'cool'):
            category = 'neutral'
        totals[category] += ratio
    return totals['warm'], totals['cool'], totals['neutral']
def detect_human(img):
"""检测人像"""
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, 1.05, 3)
# 优化参数组合
faces = face_cascade.detectMultiScale(
gray,
scaleFactor=1.02, # 减小缩放步长,增加检测粒度
minNeighbors=5, # 提高邻居数要求,减少误检
minSize=(50, 50), # 适配B站封面最小人脸尺寸
flags=cv2.CASCADE_FIND_BIGGEST_OBJECT # 优先检测最大人脸
)
return len(faces) > 0
@ -110,24 +103,18 @@ def process_url(url):
if img is None:
return None
features = analyze_image(img)
affect = calculate_affect(features)
# 主色分析
color_thief = ColorThief(BytesIO(requests.get(url).content))
dominant_color = color_thief.get_color(quality=1)
hsv_color = cv2.cvtColor(np.uint8([[dominant_color]]), cv2.COLOR_RGB2HSV)[0][0]
warm = 1 if WARM_HUE_RANGE[0] <= hsv_color[0] <= WARM_HUE_RANGE[1] else 0
# 人像检测
has_human = detect_human(img)
# 颜色分析
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
warm_ratio, cool_ratio, neutral_ratio = determine_warm_tone(img_rgb)
return {
'url': url,
**affect,
'Warm': warm,
'Portrait': int(has_human)
'Portrait': int(detect_human(img)),
'WarmRatio': round(warm_ratio, 3),
'CoolRatio': round(cool_ratio, 3),
'NeutralRatio': round(neutral_ratio, 3)
}
except Exception as e:
print(f"Error processing {url}: {str(e)}")
return None
@ -135,42 +122,51 @@ def process_url(url):
# 批量处理
def batch_process(urls, workers=4):
# 创建包含所有URL的初始结果列表
results = [{'url': url, 'success': False} for url in urls]
with Pool(workers) as pool:
results = [res for res in pool.imap(process_url, urls) if res is not None]
processed = list(tqdm(pool.imap(process_url, urls),
total=len(urls),
desc="处理进度"))
# 按原始顺序更新成功处理的结果
for i, res in enumerate(processed):
if res is not None:
results[i] = res
results[i]['success'] = True
return pd.DataFrame(results)
# 使用示例
if __name__ == "__main__":
# 读取URL列表
input_csv = "data_all.csv"
#输出路径
os.makedirs('./result', exist_ok=True)
output_csv = "result/analysis_results.csv"
##完整运行
# # 读取URL列表
# input_csv = "data_all_first_ver.csv"
# #输出路径
# os.makedirs('./result', exist_ok=True)
# output_csv = "result/analysis_results.csv"
#
# #完整运行
# df = pd.read_csv(input_csv)
# urls = df['视频封面'].tolist()
# urls = df['视频封面URL'].tolist()
#
# # 执行分析
# result_df = batch_process(urls)
#
# # 合并原始数据
# final_df = df.merge(result_df, left_on='视频封面')
# final_df.drop('url', axis=1).to_csv(output_csv, index=False)
# result_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
# print(f"成功处理 {len(result_df)}/{len(urls)} 张图片")
# print("分析完成!结果已保存至", output_csv)
#重新执行失败的url
urls_failed=[
'http://i1.hdslb.com/bfs/archive/5c42e0fa42ec945106d2e167253889e8a05541c9.jpg',
'http://i1.hdslb.com/bfs/archive/d2ca3e3f4c543245715937bf643e98b55badcc21.jpg',
'http://i0.hdslb.com/bfs/archive/2b1cf64d70bf2036793e33b2de3067344a7ff77d.jpg',
'http://i0.hdslb.com/bfs/archive/123ddc4cdf429968fa416f78f4049a728e8da3ab.jpg',
'http://i2.hdslb.com/bfs/archive/b07446d2176cec63d42f204504f4cda7a940b05b.jpg',
]
result_failed = batch_process(urls_failed)
result_failed.to_csv('result/reanalyze.csv', index=False, encoding='utf-8-sig')
# 示例URL列表
#小批量实验
urls = [
'http://i0.hdslb.com/bfs/archive/393a8e961b704d43256fe7e6c89fee04df966e17.jpg',
'http://i0.hdslb.com/bfs/archive/072e16a1237040941f15b1ed67a8d1ebe6f2e041.jpg',
'http://i2.hdslb.com/bfs/archive/1c56b5bec767c604175983cc5926f5832baa9bb8.jpg',
'http://i0.hdslb.com/bfs/archive/66384e53a15345a539ccbb2989442f1d960b9235.jpg',
'http://i2.hdslb.com/bfs/archive/836b762456f0b4d65dd2c40fc4cd120107e46b88.jpg',
]
result_df = batch_process(urls)
result_df.to_csv("result/analysis_results.csv", index=False)
print(f"成功处理 {len(result_df)}/{len(urls)} 张图片")
print("分析完成!结果已保存至", output_csv)

File diff suppressed because one or more lines are too long

68573
data_text_all/comments_hot.txt Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

8311
data_text_all/tags.txt Normal file

File diff suppressed because it is too large Load Diff

1136
data_text_all/标题.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
# 数据处理
原数据文件+量化后文件两个文件
## 合并数据文件
### 1. 合并热门数据
- 数据文件
@ -36,7 +36,7 @@
- 视频类型搬运0自制1
- 字幕: 无字幕为0剩下为1
- 视频总时长:输出时将小于60的、介于60与600之间的和大于600的分别赋值为1、2、3,方便后续描述性分析
- 弹幕情感评分=0.8*snowNLP+0.2*RoBERTa
- 弹幕情感评分(SentimentScore)=0.8*snowNLP+0.2*RoBERTa
### 删除不用指标
- 发布时间等上述被处理过的指标(原播放量要保留)
- 视频简介、标签
@ -44,4 +44,47 @@
### 数据清洗
- 筛选极端弹幕情感评分,筛选出两种差值>0.3的人工检查
- 缺失值处理(按总平均填入)
- 去除异常值
- 去除异常值
## 指标创新
新增指标:
- 弹幕情感评分snowNLP和RoBERTa及其加权平均作为最终评分(SentimentScore)
- 标题情感效价(PosTe)和情感唤醒度(ActTe)
- 封面:
- 是否有人像(Portrait)
- 暖色比例(WarmRatio)
- 冷色比例(CoolRatio)
- 中性色比例(NeutralRatio)
### 弹幕情感评分
弹幕情感倾向可以反映用户对视频的喜爱程度。此前我们爬取了每个视频的弹幕,为分析弹幕的情感趋向,我们设计了字典法和模型法两套计算方案,并且最终采用加权平均的方法求得最终值以提高结果的可信度。
字典法采用SnowNLP库进行情感分析,得到的情感评分范围在0到1之间,其中0表示负面情感,1表示正面情感。
字典法运算较为快速,同时也较为传统。为适应B站弹幕的语言特点,我们手动增添了部分词汇并给予一定的情感赋值(以原词典中“很好”0.78、“一般”0.52、“差”0.14为标准),
如"爷青回"0.9(我的青春回来了),"yyds"0.9(永远的神)"awsl"0.8(啊我死了(感动、可爱)),"2333"0.6(笑)"DNA动了"0.8(触发记忆),
得到“弹幕情感评分snowNLP”指标。
模型法采用前人预训练好的RoBERTa模型进行情感分析情感评分规则同上。
RoBERTa是由Liu等人在2019年提出的改进版BERT模型,适用于文本分类和情感分析,具有一定的鲁棒性(引用论文)。
在此基础上,我们采用开源的Erlangshen-Roberta-330M模型,其已在中文领域经过调整,拥有3.3亿个参数,在京东、微博评论等数据集上表现良好(引用论文),因此较为适合B站的弹幕情感分析。
由于弹幕数据量大计算量很大我们对单个视频弹幕量超过500的作均匀抽样处理500条并且借助学校高性能运算中心提供的平台进行计算~~虽然没什么用~~,得到"弹幕情感评分RoBERTa"指标。
最终,我们采用加权平均的方法结合两种方法的结果,得到最终的弹幕情感评分(SentimentScore)。
### 标题文本的情感效价(PosTe)和情感唤醒度(ActTe)
(验证了论文里的封面文本的情感效价和情感唤醒度的计算,发现OCR的识别效果并不好,不过函数编都编了,遂应用到视频标题上去。)
步骤:
1. 定义见论文<<应急科普>>P37表格
2. 情感效价PosTe的计算
使用NLPIR分词与情感词标注(Python的“cnsenti”包),统计文本中的积极、消极词汇数,然后依据廖圣清、程俊超等学者的做法,以“积极词汇数/(积极词汇数+消极词汇数)”作为该条文本的情感效价。
若文本中积极、消极词汇同时为0将PosTe赋值为0.5,表示为中性.
3. 情感唤醒度ActTe的计算
对分词结果对照大连理工情感词典(引用论文:徐琳宏,林鸿飞,潘宇,任惠,陈建美.情感词汇本体的构造.情报学报,2008,27(2):180-185)
匹配到情感分类(小类),进一步匹配到情感大类,即“哀”、“好”、“恶”、“乐”、“怒”、“惧”、“惊”七种情感,
并借鉴廖圣清等学者的做法根据情感唤醒度的强弱分别赋值为1-7。
对文本中反映该七种情感的字词出现的频数进行统计,将七种情感所包含的词汇数目分别乘以对应的情感唤醒程度的赋值,归一化后作为该标题文本的情感唤醒度。
对标题文本匹配结果为0的表明主要为中性词汇将ActTe赋值为0.
### 视频封面处理
视频的封面对其传播具有重要影响,但由于封面吸引力等因素具有主观性,以往的研究较为有限。
在这里,我们提取了是否有人像、暖色比例、冷色比例、中性色比例四个客观指标辅助分析。
我们使用了OpenCV库加载了预训练的人脸检测模型并调用`detectMultiScale`方法进行人像检测。
并使用改进的k-means聚类算法MiniBatchK-means提取封面主色结合HSV色彩空间分类标准计算色调比例。

1137
result/analysis_results.csv Normal file

File diff suppressed because it is too large Load Diff

6
result/reanalyze.csv Normal file
View File

@ -0,0 +1,6 @@
url,success,Portrait,WarmRatio,CoolRatio,NeutralRatio
http://i1.hdslb.com/bfs/archive/5c42e0fa42ec945106d2e167253889e8a05541c9.jpg,False,,,,
http://i1.hdslb.com/bfs/archive/d2ca3e3f4c543245715937bf643e98b55badcc21.jpg,False,,,,
http://i0.hdslb.com/bfs/archive/2b1cf64d70bf2036793e33b2de3067344a7ff77d.jpg,True,1.0,0.178,0.414,0.408
http://i0.hdslb.com/bfs/archive/123ddc4cdf429968fa416f78f4049a728e8da3ab.jpg,False,,,,
http://i2.hdslb.com/bfs/archive/b07446d2176cec63d42f204504f4cda7a940b05b.jpg,False,,,,
1 url success Portrait WarmRatio CoolRatio NeutralRatio
2 http://i1.hdslb.com/bfs/archive/5c42e0fa42ec945106d2e167253889e8a05541c9.jpg False
3 http://i1.hdslb.com/bfs/archive/d2ca3e3f4c543245715937bf643e98b55badcc21.jpg False
4 http://i0.hdslb.com/bfs/archive/2b1cf64d70bf2036793e33b2de3067344a7ff77d.jpg True 1.0 0.178 0.414 0.408
5 http://i0.hdslb.com/bfs/archive/123ddc4cdf429968fa416f78f4049a728e8da3ab.jpg False
6 http://i2.hdslb.com/bfs/archive/b07446d2176cec63d42f204504f4cda7a940b05b.jpg False

79
temp.py Normal file
View File

@ -0,0 +1,79 @@
import pandas as pd
import numpy as np
from snownlp import SnowNLP
import os
def load_data(file_path):
    """Read the danmaku text column of a CSV file into a list of strings.

    Only the '弹幕内容' column is loaded; missing values are dropped and the
    remaining cells are converted to ``str``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The list of danmaku strings, or an empty list when reading fails
        (the error is printed, not raised).
    """
    column = '弹幕内容'
    try:
        frame = pd.read_csv(file_path, usecols=[column], engine='python')
        texts = frame[column].dropna().astype(str).tolist()
    except Exception as e:
        # Best effort: report the problem and fall back to "no data".
        print(f"数据加载失败: {str(e)}")
        texts = []
    return texts
# Hand-assigned sentiment scores for Bilibili slang. The values were added
# manually, using the original dictionary's "很好" = 0.78, "一般" = 0.52 and
# "差" = 0.14 as the reference scale.
_SPECIAL_CASES = {
    # Strongly positive expressions
    "爷青回": 0.9,  # nostalgia
    "yyds": 0.9,  # "forever the GOAT"
    'YYDS': 0.9,  # "forever the GOAT"
    "awsl": 0.8,  # "ah, I'm dead" (moved)
    '阿伟死了': 0.8,  # homophone variant (moved)
    "泪目": 0.8,  # moving scene
    "排面": 0.8,  # impressive showing
    "双厨狂喜": 0.7,  # crossover delight
    "梦幻联动": 0.7,  # cross-work collaboration
    "注入灵魂": 0.7,  # highlight moment
    "文艺复兴": 0.8,  # classic revived
    # Meme / interaction phrases
    "下次一定": 0.55,  # coin-procrastination meme
    "你币没了": 0.45,  # threat to withhold coins
    "空降成功": 0.5,  # skipped the intro
    "标准结局": 0.5,  # as expected
    "典中典": 0.4,  # classic repeat (pejorative)
    # Famous high-energy scenes
    "名场面": 0.85,  # classic clip
    "神仙打架": 0.9,  # clash of masters
    "前方高能": 0.7,  # climax warning
    # Numeric slang
    "666": 0.75,  # well played
    "999": 0.75,  # "666" flipped over
    "2333": 0.6,  # laughter
    # "Abstract" culture
    # NOTE(review): this key is an empty string as written; the intended
    # text may have been lost -- confirm against the original file.
    "": 0.6,  # laughter (neutral)
    "生草": 0.65,  # funny scene
    # Emotional-breakdown scenes
    "破防了": 0.4,  # emotional defences broken
    "我裂开了": 0.3,  # mentally shattered
    # Domain-specific memes
    "奥利给": 0.8,  # cheering on
    "DNA动了": 0.8,  # triggers a memory
    "有内味了": 0.7,  # has the right flavour
    # Negative scenes
    "公开处刑": 0.5,  # embarrassing moment
    "阴间": 0.3,  # creepy content
    "阴间滤镜": 0.3,  # creepy visuals
    "血压上来了": 0.3,  # irritating
}


def analyze_sentiment(danmu_texts):
    """Return the mean sentiment score of a collection of danmaku strings.

    A text that exactly equals one of the hand-scored slang entries takes
    that fixed score; every other text is scored with
    ``SnowNLP(text).sentiments``.

    Args:
        danmu_texts: Iterable of danmaku strings.

    Returns:
        The arithmetic mean of the per-text scores, or NaN when
        ``danmu_texts`` yields no items.
    """
    sentiment_scores = [
        _SPECIAL_CASES[text] if text in _SPECIAL_CASES else SnowNLP(text).sentiments
        for text in danmu_texts
    ]
    if not sentiment_scores:
        # np.mean([]) would also give NaN, but only after emitting a
        # "Mean of empty slice" RuntimeWarning; make the empty case explicit.
        return np.nan
    return np.mean(sentiment_scores)
# Disabled example: score the danmaku exported for a single video.
# file_path='hot_data/亲子/BV1TLXVYREDt/BV1TLXVYREDt_287_danmaku.csv'
# df = load_data(file_path)
# scores=analyze_sentiment(df)
# print(scores)
# Smoke test. '4' is not one of the hand-scored slang entries, so it is
# scored by SnowNLP. These statements are not behind a __main__ guard, so
# they also run whenever this module is imported.
test_words = ['4']
s = analyze_sentiment(test_words)
print(s)

View File

@ -157,5 +157,5 @@ def analyze_text(file_path):
pynlpir.close() # 确保释放NLPIR资源
if __name__ == "__main__":
file_path = 'data_all.csv'
file_path = 'data_all_second_ver.csv'
analyze_text(file_path)