modify
This commit is contained in:
commit
7153f7a097
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
# 默认忽略的文件
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# 基于编辑器的 HTTP 客户端请求
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
7
.idea/MarsCodeWorkspaceAppSettings.xml
generated
Normal file
7
.idea/MarsCodeWorkspaceAppSettings.xml
generated
Normal file
@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="com.codeverse.userSettings.MarscodeWorkspaceAppSettingsState">
|
||||
<option name="ckgOperationStatus" value="SUCCESS" />
|
||||
<option name="progress" value="0.99839693" />
|
||||
</component>
|
||||
</project>
|
21
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
21
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
@ -0,0 +1,21 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
||||
<option name="ignoredErrors">
|
||||
<list>
|
||||
<option value="N803" />
|
||||
<option value="N802" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredIdentifiers">
|
||||
<list>
|
||||
<option value="OlivOS" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="RMarkdownRequirements" enabled="false" level="WARNING" enabled_by_default="false" />
|
||||
</profile>
|
||||
</component>
|
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
7
.idea/misc.xml
generated
Normal file
7
.idea/misc.xml
generated
Normal file
@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="SAM-bilibil" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="SAM-bilibil" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/statistics_model2025.iml" filepath="$PROJECT_DIR$/.idea/statistics_model2025.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
12
.idea/statistics_model2025.iml
generated
Normal file
12
.idea/statistics_model2025.iml
generated
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="SAM-bilibil" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
<option name="format" value="PLAIN" />
|
||||
<option name="myDocStringFormat" value="Plain" />
|
||||
</component>
|
||||
</module>
|
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
@ -1,5 +1,6 @@
|
||||
import pandas as pd
|
||||
import torch
|
||||
<<<<<<< HEAD
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
from pathlib import Path
|
||||
|
||||
@ -73,6 +74,41 @@ class SentimentAnalyzer:
|
||||
raise
|
||||
|
||||
return all_probs
|
||||
=======
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer # 修改为从transformers导入
|
||||
|
||||
|
||||
def load_data(file_path, column='弹幕内容'):
    """Load one text column from a CSV file as a list of strings.

    Only ``column`` is read from the file. Missing values are dropped and
    the remaining cells are converted to ``str``.

    Args:
        file_path: Path (or file-like object) handed to ``pandas.read_csv``.
        column: Name of the CSV column to extract. Defaults to the danmaku
            text column ``'弹幕内容'`` so existing callers are unaffected.

    Returns:
        A list of non-null cell values as strings, in file order. An empty
        list is returned if the file cannot be read or the column is absent.
    """
    try:
        frame = pd.read_csv(file_path, usecols=[column], engine='python')
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller
        # continue with no data instead of aborting a whole batch run.
        print(f"数据加载失败: {e}")
        return []
    return frame[column].dropna().astype(str).tolist()
|
||||
|
||||
|
||||
def analyze_sentiment(texts, batch_size=64):
    """Return the mean positive-class probability over ``texts``.

    The texts are scored with the
    ``IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment`` sequence classifier.
    Inputs are processed in chunks of ``batch_size`` so that memory use does
    not grow with the size of the whole corpus.

    Args:
        texts: A single string or an iterable of strings to score.
        batch_size: Maximum number of texts per forward pass. Values below
            1 are treated as 1.

    Returns:
        The average of ``softmax(logits)[:, 1]`` across all texts as a
        float. ``0.5`` (neutral) is returned when ``texts`` is empty, when
        the model cannot be loaded, or when inference fails.
    """
    # A bare string is one text, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        # Nothing to score: skip the expensive model load entirely.
        return 0.5

    model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
    except Exception as e:
        print(f"模型加载失败: {e}")
        return 0.5  # neutral score as the default

    step = max(1, int(batch_size))
    try:
        positive_total = 0.0
        with torch.no_grad():
            for start in range(0, len(texts), step):
                batch = texts[start:start + step]
                inputs = tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt",
                )
                probs = torch.softmax(model(**inputs).logits, dim=1)
                # NOTE(review): assumes class index 1 is the positive label;
                # confirm against the model's id2label mapping.
                positive_total += probs[:, 1].sum().item()
        # Sum / count keeps the result a true per-text mean even when the
        # last batch is smaller than the others.
        return positive_total / len(texts)
    except Exception as e:
        print(f"情感分析失败: {e}")
        return 0.5  # neutral score as the default
|
||||
>>>>>>> 0618b31b4ac22dbf912798c562bc80043e8e91b8
|
||||
|
||||
|
||||
# ----------------- 使用示例 -----------------
|
32
readme.md
Normal file
32
readme.md
Normal file
@ -0,0 +1,32 @@
|
||||
# 数据处理
|
||||
|
||||
3月26日 20:07 | 355字
|
||||
|
||||
- 合并数据文件
|
||||
- 合并热门数据
|
||||
- 数据文件
|
||||
- 视频: hot_data/分区/info
|
||||
- up: hot_data/分区/up_info
|
||||
- 弹幕/评论: hot_data/分区/BV号/...
|
||||
- 单分区处理
|
||||
- 按bv号匹配视频对应up指标,添加到info.csv
|
||||
- 依序读取弹幕并输出情感评分;如果顺序没变,直接向 info.csv 添加一列“affective_scores”
|
||||
- 合并: 遍历分区info文件创建总文件,并给“视频荣誉”改名成“是否热门”并赋值为1
|
||||
- 合并非热数据
|
||||
- 同上,并赋值为0
|
||||
- 合并两个文件
|
||||
- 根据URL获取封面
|
||||
- 按发布时间排序
|
||||
- 文本数据合并
|
||||
- 评论文本 (仅热门): 直接合并成列,人工筛选高质量文本
|
||||
- 标签合并 (放一起整一个txt即可,拉个词云了事)
|
||||
- 简介合并 (同上)
|
||||
- 数据预处理
|
||||
- 是否为系列 (标题关键词分析)
|
||||
- 数据量化
|
||||
- 大小分区名给一下,映射到大分区
|
||||
- 视频分辨率: 360、720、1080、2k、4k、8k
|
||||
- 封面处理并量化
|
||||
- 删除不用指标
|
||||
- up主uid、bv号
|
||||
- 视频简介、标签
|
19
requirements.txt
Normal file
19
requirements.txt
Normal file
@ -0,0 +1,19 @@
|
||||
# 基础数据处理和分析
|
||||
pandas>=1.3.0
|
||||
numpy>=1.21.0
|
||||
|
||||
# 情感分析相关
|
||||
torch>=1.10.0
|
||||
transformers>=4.26.0
|
||||
modelscope>=1.4.0
|
||||
snownlp>=0.12.3 # 新增的SnowNLP库
|
||||
|
||||
# 词云和可视化
|
||||
matplotlib>=3.5.0
|
||||
wordcloud>=1.8.0
|
||||
jieba>=0.42.1
|
||||
Pillow>=9.0.0
|
||||
|
||||
# 其他工具
|
||||
requests>=2.26.0
|
||||
tqdm>=4.45.0
|
Loading…
x
Reference in New Issue
Block a user