modify
This commit is contained in:
parent c94cfe4ab0
commit 3a996704a1
@@ -1,6 +1,7 @@
 import pandas as pd
+import numpy as np
 from snownlp import SnowNLP
-import matplotlib.pyplot as plt
+import os

 def load_data(file_path):
     try:
@@ -10,21 +11,53 @@ def load_data(file_path):
         print(f"Data loading failed: {str(e)}")
         return []

-# Example file path
-file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
-
-# Run the analysis
-danmu_texts = load_data(file_path)
-
-emotions = {'positive': 0,
-            'negative': 0,
-            'neutral': 0}
-
-for item in danmu_texts:
-    s = SnowNLP(item)
-    if s.sentiments > 0.6:
-        emotions['positive'] += 1
-    elif s.sentiments < 0.4:
-        emotions['negative'] += 1
-    else:
-        emotions['neutral'] += 1
+
+def analyze_sentiment(danmu_texts):
+    emotions = {'positive': 0, 'negative': 0, 'neutral': 0}
+    sentiment_scores = []
+
+    for item in danmu_texts:
+        s = SnowNLP(item)
+        score = s.sentiments
+        sentiment_scores.append(score)
+        if score > 0.6:
+            emotions['positive'] += 1
+        elif score < 0.4:
+            emotions['negative'] += 1
+        else:
+            emotions['neutral'] += 1
+
+    avg_score = np.mean(sentiment_scores)
+    return emotions, avg_score
+
+
+def process_partition(partition_path):
+    info_file = os.path.join(partition_path, 'info.csv')
+    if not os.path.exists(info_file):
+        print(f"info file not found: {info_file}")
+        return
+
+    info_df = pd.read_csv(info_file, encoding='utf-8')
+    scores = []
+
+    for bv in info_df['BV号']:
+        danmu_file = os.path.join(partition_path, bv, f"{bv}_273_danmaku.csv")
+        if not os.path.exists(danmu_file):
+            scores.append(None)
+            continue
+
+        danmu_texts = load_data(danmu_file)
+        if not danmu_texts:
+            scores.append(None)
+            continue
+
+        _, avg_score = analyze_sentiment(danmu_texts)
+        scores.append(avg_score)
+
+    info_df['情感评分'] = scores
+    info_df.to_csv(info_file, index=False)
+
+
+# Usage example - process the GMV partition
+partition_path = "hot_data/GMV"
+process_partition(partition_path)
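For orientation, this is the directory layout that the new process_partition() assumes, pieced together from the paths used in this diff (the BV id is simply the example id that already appears above, and the annotations are inferred rather than part of the commit):

hot_data/GMV/
    info.csv                            # must contain a 'BV号' column; the run writes an '情感评分' column back into it
    BV1ajXMYUE6S/
        BV1ajXMYUE6S_273_danmaku.csv    # per-video danmaku export read by load_data()

Because info_df.to_csv(info_file, index=False) rewrites info.csv in place, rerunning process_partition() simply overwrites the previous scores.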
弹幕情感评分.py (143 changes)
@@ -1,43 +1,116 @@
 import pandas as pd
 import torch
-from modelscope import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from pathlib import Path


-def load_data(file_path):
-    """Optimised data-loading function"""
-    try:
-        df = pd.read_csv(file_path, usecols=['弹幕内容'], engine='python')
-        return df['弹幕内容'].dropna().astype(str).tolist()
-    except Exception as e:
-        print(f"Data loading failed: {str(e)}")
-        return []
-
-
-def analyze_sentiment(texts):
-    """Improved sentiment-analysis function"""
-    # Model configuration
-    model_name = "damo/nlp_structbert_sentiment-classification_chinese-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
-    # Batch processing for efficiency
-    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    # Adjusted probability calculation
-    probs = torch.softmax(outputs.logits, dim=1)
-    return probs[:, 1].mean().item()
+class SentimentAnalyzer:
+    def __init__(self, model_path: str = "uer/roberta-base-finetuned-dianping-chinese"):
+        """
+        Initialise the sentiment-analysis model.
+        :param model_path: local model path or HuggingFace model name
+        """
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        try:
+            # Try to load a local copy of the model first
+            local_path = Path(model_path)
+            if local_path.exists():
+                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
+                self.model = AutoModelForSequenceClassification.from_pretrained(local_path)
+            else:
+                # Load from HuggingFace (via the domestic mirror)
+                self.tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                                               mirror="https://hf-mirror.com")
+                self.model = AutoModelForSequenceClassification.from_pretrained(model_path,
+                                                                                mirror="https://hf-mirror.com")
+
+            self.model = self.model.to(self.device)
+            self.model.eval()
+            print(f"Model loaded successfully: {model_path}")
+
+        except Exception as e:
+            raise RuntimeError(f"Model loading failed: {str(e)}")
+
+    def analyze(self, texts: list, batch_size: int = 32) -> list:
+        """
+        Batch sentiment analysis.
+        :param texts: list of texts to analyse
+        :param batch_size: batch size
+        :return: list of sentiment probabilities (between 0 and 1)
+        """
+        if not texts:
+            return []
+
+        # Automatically adjust the batch size to avoid out-of-memory errors
+        mem = torch.cuda.mem_get_info()[0] if torch.cuda.is_available() else 2e9
+        safe_batch_size = min(batch_size, max(1, int(mem // 1e7)))
+
+        all_probs = []
+        for i in range(0, len(texts), safe_batch_size):
+            batch = texts[i:i + safe_batch_size]
+
+            try:
+                inputs = self.tokenizer(
+                    batch,
+                    padding=True,
+                    truncation=True,
+                    max_length=128,
+                    return_tensors="pt"
+                ).to(self.device)
+
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+
+                probs = torch.softmax(outputs.logits, dim=1)[:, 1]
+                all_probs.extend(probs.cpu().numpy().tolist())
+
+            except RuntimeError as e:
+                if "CUDA out of memory" in str(e):
+                    safe_batch_size = max(1, safe_batch_size // 2)
+                    print(f"GPU memory is low, reducing batch size to: {safe_batch_size}")
+                    continue
+                raise
+
+        return all_probs


+# ----------------- Usage example -----------------
 if __name__ == "__main__":
-    # Example file path
-    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
-
-    # Run the analysis
-    danmu_texts = load_data(file_path)
-    if danmu_texts:
-        final_score = analyze_sentiment(danmu_texts)
-        print(f"Bilibili danmaku sentiment score: {final_score:.4f}")
-    else:
-        print("No valid danmaku data found")
+    # Initialise the analyzer (picks a local or online model automatically)
+    analyzer = SentimentAnalyzer("./local_models/sentiment")  # try the local model first
+
+    # Test data
+    test_texts = [
+        "这个视频真的太棒了!",
+        "完全看不懂在讲什么",
+        "浪费时间,不建议观看",
+        "画面精美,内容有深度"
+    ]
+
+    # Get the sentiment probabilities
+    scores = analyzer.analyze(test_texts)
+
+    # Print the results
+    for text, score in zip(test_texts, scores):
+        print(f"「{text}」=> positive probability: {score:.4f}")
+
+
+    # Read danmaku from a CSV file
+    def load_danmu(file_path: str) -> list:
+        try:
+            df = pd.read_csv(file_path)
+            return df['弹幕内容'].dropna().astype(str).tolist()
+        except Exception as e:
+            print(f"Failed to read file: {str(e)}")
+            return []
+
+
+    # Actual usage
+    # File path
+    file_path = "hot_data/GMV/BV1ajXMYUE6S/BV1ajXMYUE6S_273_danmaku.csv"
+    danmu_list = load_danmu(file_path)
+    if danmu_list:
+        danmu_scores = analyzer.analyze(danmu_list)
+        avg_score = sum(danmu_scores) / len(danmu_scores)
+        print(f"\nAverage danmaku sentiment score: {avg_score:.4f}")
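The example above constructs SentimentAnalyzer("./local_models/sentiment"), so the local-path branch of __init__ only triggers once a model has actually been saved there. A minimal sketch of how that directory could be populated, assuming the transformers package is installed and the uer/roberta-base-finetuned-dianping-chinese checkpoint named in the diff is reachable (the target path is just the one from the example, not something this commit creates):

# One-off download so that later runs take the local-path branch of SentimentAnalyzer.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "uer/roberta-base-finetuned-dianping-chinese"   # model name used in the diff
target_dir = "./local_models/sentiment"                      # path used in the usage example

AutoTokenizer.from_pretrained(checkpoint).save_pretrained(target_dir)
AutoModelForSequenceClassification.from_pretrained(checkpoint).save_pretrained(target_dir)

Once that directory exists, Path(model_path).exists() is true and the mirror= arguments in the online branch are never reached.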