diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..35410ca --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/MarsCodeWorkspaceAppSettings.xml b/.idea/MarsCodeWorkspaceAppSettings.xml new file mode 100644 index 0000000..8e12c96 --- /dev/null +++ b/.idea/MarsCodeWorkspaceAppSettings.xml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..312e865 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,21 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..98a0d17 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..adf2539 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/statistics_model2025.iml b/.idea/statistics_model2025.iml new file mode 100644 index 0000000..cd2e5af --- /dev/null +++ b/.idea/statistics_model2025.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/弹幕情感评分.py b/BERT_danmu_sentiment_analyzer.py similarity index 75% rename from 弹幕情感评分.py rename to BERT_danmu_sentiment_analyzer.py index 172ee73..5d3e9db 100644 --- a/弹幕情感评分.py +++ b/BERT_danmu_sentiment_analyzer.py @@ -1,5 +1,6 @@ import pandas as pd import torch +<<<<<<< HEAD from transformers import AutoTokenizer, AutoModelForSequenceClassification from pathlib import Path @@ -73,6 +74,41 @@ class SentimentAnalyzer: raise return all_probs +======= +from transformers import AutoModelForSequenceClassification, AutoTokenizer # 修改为从transformers导入 + + +def load_data(file_path): + """优化后的数据加载函数""" + try: + df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python') + return df['弹幕内容'].dropna().astype(str).tolist() + except Exception as e: + print(f"数据加载失败: {str(e)}") + return [] + + +def analyze_sentiment(texts): + """改进的情感分析函数""" + # 使用新的模型配置 + model_name = "IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment" + try: + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForSequenceClassification.from_pretrained(model_name) + + # 批量处理提升效率 + inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt") + with torch.no_grad(): + outputs = model(**inputs) + + # 调整概率计算方式 + probs = torch.softmax(outputs.logits, dim=1) + return probs[:, 1].mean().item() # 假设正例在位置1 + + except Exception as e: + print(f"模型加载失败: {str(e)}") + return 0.5 # 返回中性评分作为默认值 +>>>>>>> 0618b31b4ac22dbf912798c562bc80043e8e91b8 # ----------------- 使用示例 ----------------- diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..63a8d62 --- /dev/null +++ b/readme.md @@ -0,0 +1,32 @@ +# 数据处理 + +3月26日 20:07 | 355字 + +- 合并数据文件 + - 合并热门数据 + - 数据文件 + - 视频: hot_data/分区/info + - up: hot_data/分区/up_info + - 弹幕/评论: hot_data/分区/BV号/... + - 单分区处理 + - 按bv号匹配视频对应up指标,添加到info.csv + - 依序读取弹幕输出情感评分,(如果顺序没变的话) 直接添加一列“affective_scores”到info.csv + - 合并: 遍历分区info文件创建总文件,并给“视频荣誉”改名成“是否热门”并赋值为1 + - 合并非热数据 + - 同上,并赋值为0 + - 合并两个文件 + - 根据URL获取封面 + - 按发布时间排序 + - 文本数据合并 + - 评论文本 (仅热门): 直接合并成列,人工筛选高质量文本 + - 标签合并 (放一起整一个txt即可,拉个词云了事) + - 简介合并 (同上) +- 数据预处理 + - 是否为系列 (标题关键词分析) +- 数据量化 + - 大小分区名给一下,映射到大分区 + - 视频分辨率: 360、720、1080、2k、4k、8k + - 封面处理并量化 +- 删除不用指标 + - up主uid、bv号 + - 视频简介、标签 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0e5871e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +# 基础数据处理和分析 +pandas>=1.3.0 +numpy>=1.21.0 + +# 情感分析相关 +torch>=1.10.0 +transformers>=4.26.0 +modelscope>=1.4.0 +snownlp>=0.12.3 # 新增的SnowNLP库 + +# 词云和可视化 +matplotlib>=3.5.0 +wordcloud>=1.8.0 +jieba>=0.42.1 +Pillow>=9.0.0 + +# 其他工具 +requests>=2.26.0 +tqdm>=4.45.0 diff --git a/弹幕情感评分-字典法.py b/snowNLP_danmu sentiment_analyzer.py similarity index 100% rename from 弹幕情感评分-字典法.py rename to snowNLP_danmu sentiment_analyzer.py diff --git a/词云图.py b/word_cloud.py similarity index 100% rename from 词云图.py rename to word_cloud.py diff --git a/词频.py b/word_frequence.py similarity index 100% rename from 词频.py rename to word_frequence.py