From 0618b31b4ac22dbf912798c562bc80043e8e91b8 Mon Sep 17 00:00:00 2001
From: Sheyiyuan <2125107118@qq.com>
Date: Sat, 29 Mar 2025 15:37:12 +0800
Subject: [PATCH] =?UTF-8?q?add:=E6=9B=B4=E6=96=B0=E6=83=85=E6=84=9F?=
=?UTF-8?q?=E5=88=86=E6=9E=90=E6=A8=A1=E5=9E=8B=EF=BC=9B=E6=B7=BB=E5=8A=A0?=
=?UTF-8?q?requirements.txt=E4=BE=BF=E4=BA=8E=E9=83=A8=E7=BD=B2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.idea/.gitignore | 8 +++++
.idea/MarsCodeWorkspaceAppSettings.xml | 7 ++++
.idea/inspectionProfiles/Project_Default.xml | 21 ++++++++++++
.../inspectionProfiles/profiles_settings.xml | 6 ++++
.idea/misc.xml | 7 ++++
.idea/modules.xml | 8 +++++
.idea/statistics_model2025.iml | 12 +++++++
.idea/vcs.xml | 6 ++++
readme.md | 32 +++++++++++++++++++
requirements.txt | 19 +++++++++++
弹幕情感评分.py | 29 ++++++++++-------
11 files changed, 143 insertions(+), 12 deletions(-)
create mode 100644 .idea/.gitignore
create mode 100644 .idea/MarsCodeWorkspaceAppSettings.xml
create mode 100644 .idea/inspectionProfiles/Project_Default.xml
create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
create mode 100644 .idea/misc.xml
create mode 100644 .idea/modules.xml
create mode 100644 .idea/statistics_model2025.iml
create mode 100644 .idea/vcs.xml
create mode 100644 readme.md
create mode 100644 requirements.txt
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..35410ca
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
+# 基于编辑器的 HTTP 客户端请求
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/MarsCodeWorkspaceAppSettings.xml b/.idea/MarsCodeWorkspaceAppSettings.xml
new file mode 100644
index 0000000..8e12c96
--- /dev/null
+++ b/.idea/MarsCodeWorkspaceAppSettings.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..312e865
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..98a0d17
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..adf2539
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/statistics_model2025.iml b/.idea/statistics_model2025.iml
new file mode 100644
index 0000000..cd2e5af
--- /dev/null
+++ b/.idea/statistics_model2025.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..63a8d62
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,32 @@
+# 数据处理
+
+3月26日 20:07 | 355字
+
+- 合并数据文件
+ - 合并热门数据
+ - 数据文件
+ - 视频: hot_data/分区/info
+ - up: hot_data/分区/up_info
+ - 弹幕/评论: hot_data/分区/BV号/...
+ - 单分区处理
+ - 按bv号匹配视频对应up指标,添加到info.csv
+ - 依序读取弹幕并输出情感评分; 如果顺序没变, 直接在info.csv中添加一列“affective_scores”
+ - 合并: 遍历各分区info文件创建总文件,将“视频荣誉”改名为“是否热门”并赋值为1
+ - 合并非热数据
+ - 同上,并赋值为0
+ - 合并两个文件
+ - 根据URL获取封面
+ - 按发布时间排序
+ - 文本数据合并
+ - 评论文本 (仅热门): 直接合并成列,人工筛选高质量文本
+ - 标签合并 (放一起整一个txt即可,拉个词云了事)
+ - 简介合并 (同上)
+- 数据预处理
+ - 是否为系列 (标题关键词分析)
+- 数据量化
+ - 给出大、小分区名称,并统一映射到大分区
+ - 视频分辨率: 360、720、1080、2k、4k、8k
+ - 封面处理并量化
+- 删除不用指标
+ - up主uid、bv号
+ - 视频简介、标签
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0e5871e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,19 @@
+# 基础数据处理和分析
+pandas>=1.3.0
+numpy>=1.21.0
+
+# 情感分析相关
+torch>=1.10.0
+transformers>=4.26.0
+modelscope>=1.4.0
+snownlp>=0.12.3 # 新增的SnowNLP库
+
+# 词云和可视化
+matplotlib>=3.5.0
+wordcloud>=1.8.0
+jieba>=0.42.1
+Pillow>=9.0.0
+
+# 其他工具
+requests>=2.26.0
+tqdm>=4.45.0
diff --git a/弹幕情感评分.py b/弹幕情感评分.py
index 716effd..593192b 100644
--- a/弹幕情感评分.py
+++ b/弹幕情感评分.py
@@ -1,6 +1,6 @@
import pandas as pd
import torch
-from modelscope import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import AutoModelForSequenceClassification, AutoTokenizer # 修改为从transformers导入
def load_data(file_path):
@@ -15,19 +15,24 @@ def load_data(file_path):
def analyze_sentiment(texts):
"""改进的情感分析函数"""
- # 使用模型配置
- model_name = "damo/nlp_structbert_sentiment-classification_chinese-base"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
+ # 使用新的模型配置
+ model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
+ try:
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
- # 批量处理提升效率
- inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
- with torch.no_grad():
- outputs = model(**inputs)
+ # 批量处理提升效率
+ inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
+ with torch.no_grad():
+ outputs = model(**inputs)
- # 调整概率计算方式
- probs = torch.softmax(outputs.logits, dim=1)
- return probs[:, 1].mean().item()
+ # 调整概率计算方式
+ probs = torch.softmax(outputs.logits, dim=1)
+ return probs[:, 1].mean().item() # 假设正例在位置1
+
+ except Exception as e:
+ print(f"模型加载失败: {str(e)}")
+ return 0.5 # 返回中性评分作为默认值
if __name__ == "__main__":