In [14]:
import os
import sys
#导入基础包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.font_manager as fm
# 导入主成分分析相关包
from factor_analyzer import Rotator
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
# 导入SARIMA相关包
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pmdarima import auto_arima
import pmdarima as pm
# 导入XGBOOST相关包
from xgboost import XGBRegressor
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from matplotlib.dates import DateFormatter, HourLocator
# 导入单独写的函数
from calculate import *
from heatmap import *
from sort_matrix import *
In [15]:
# 设置字体
if sys.platform == 'darwin': # macOS
font_path = '/System/Library/Fonts/STHeiti Light.ttc'
elif sys.platform == 'win32': # Windows
plt.rcParams['font.sans-serif'] = ['SimHei'] # Windows系统自带黑体
plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题
else: # Linux/其他系统
font_path = '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc' # 文泉驿字体
# 仅非Windows系统需要加载字体文件
if sys.platform != 'win32':
try:
font_prop = fm.FontProperties(fname=font_path)
plt.rcParams['font.family'] = font_prop.get_name()
except:
print(f"警告:{font_path} 字体加载失败,请检查路径有效性")
try:
os.mkdir('./images')
except FileExistsError:
pass
try:
os.mkdir('./results')
except FileExistsError:
pass
#读取数据
data=pd.read_excel('北京市空气质量指数与气象数据.xlsx')
data.head()
Out[15]:
date | hour | AQI | CO | NO2 | O3 | PM10 | PM2.5 | SO2 | T | ... | P | Pa | U | Ff | Tn | Tx | VV | Td | RRR | tR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2022-11-01 | 2 | 18.371429 | 0.211429 | 23.771429 | 29.057143 | 13.257143 | 3.057143 | 2.628571 | 6.7 | ... | 770.5 | 0.1 | 36.0 | 1.0 | 5.3 | 17.3 | 30.0 | -7.3 | 0.0 | 12 |
1 | 2022-11-01 | 5 | 21.914286 | 0.180000 | 26.571429 | 20.142857 | 18.914286 | 3.771429 | 2.542857 | 2.0 | ... | 770.8 | 0.3 | 62.0 | 0.0 | 1.9 | 17.3 | 7.0 | -4.5 | 0.0 | 12 |
2 | 2022-11-01 | 8 | 28.628571 | 0.311429 | 30.028571 | 14.285714 | 27.942857 | 6.857143 | 2.400000 | 6.6 | ... | 771.7 | 0.9 | 56.0 | 0.0 | 0.9 | 17.3 | 10.0 | -7.1 | 0.0 | 12 |
3 | 2022-11-01 | 11 | 19.000000 | 0.237143 | 17.971429 | 40.529412 | 17.852941 | 5.914286 | 2.176471 | 13.5 | ... | 771.3 | -0.4 | 19.0 | 2.0 | 0.9 | 17.3 | 30.0 | -9.7 | 0.0 | 12 |
4 | 2022-11-01 | 14 | 21.742857 | 0.252941 | 15.588235 | 53.617647 | 20.941176 | 6.742857 | 2.000000 | 15.7 | ... | 768.6 | -2.7 | 19.0 | 2.0 | 0.9 | 17.3 | 30.0 | -7.9 | 0.0 | 12 |
5 rows × 21 columns
题目1¶
研究单日内空气质量指数与各项指标的变化趋势,这种趋势是否具有周期性?
In [16]:
# 数据预处理:将数据按小时分组,计算每个小时各指标的平均值
# 转换Excel日期序列值为实际日期并分组
data['datetime'] = pd.to_datetime(data['date']) + pd.to_timedelta(data['hour'], unit='h')
valid_hours = sorted(data['hour'].unique())
hourly_data = data.groupby('hour').mean().loc[valid_hours]
plt.figure(figsize=(12, 8))
indicators = ['AQI', 'PM2.5', 'PM10', 'CO', 'NO2', 'O3','SO2']
colors = ['#2d87bb', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#1f77b4', '#ffbb78', '#98df8a', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#1f77b4', '#ffbb78', '#98df8a', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#1f77b4', '#ffbb78', '#98df8a', '#d62728',]
normalized = (hourly_data[indicators] - hourly_data[indicators].mean(axis=0)) / hourly_data[indicators].std(axis=0)
<Figure size 1200x800 with 0 Axes>
In [17]:
# 绘制各指标小时均值变化趋势(标准化后)折线图
for i, indicator in enumerate(indicators):
plt.plot(normalized.index, normalized[indicator],
marker='o',label=indicator, color=colors[i], linewidth=2)
plt.title('各指标小时均值变化趋势(标准化后)', fontsize=14)
plt.xlabel('小时', fontsize=12)
plt.ylabel('标准化值', fontsize=12)
plt.xticks(range(0, 24))
plt.grid(alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
# 新增保存代码
plt.savefig('images/hourly_trends_combined.png', dpi=300, bbox_inches='tight') # 保存组合大图
plt.show()
# 新增保存子图代码
for i, indicator in enumerate(indicators):
plt.figure(figsize=(8, 5))
plt.plot(normalized.index, normalized[indicator],
marker='o', color=colors[i], linewidth=2)
plt.title(f'{indicator}小时均值变化趋势(标准化后)')
plt.xlabel('小时')
plt.ylabel('标准化值')
plt.xticks(range(0, 24))
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(f'images/hourly_{indicator}.png', dpi=300) # 保存单个指标子图
plt.close()
In [18]:
# ACF检验周期性
# 创建完整时间序列(每小时一个样本,缺失值用线性插值)
full_idx = pd.date_range(start=data['datetime'].min(),
end=data['datetime'].max(),
freq='h')
full_series = data.set_index('datetime').reindex(full_idx)
interpolated = full_series[indicators].interpolate(method='time')
# 绘制ACF图,检验3天周期(24*3)
plt.figure(figsize=(60, 20)) # 调整整体画布尺寸
for i, indicator in enumerate(indicators):
ax = plt.subplot(2, 4, i+1) # 创建2行4列的子图布局
plot_acf(interpolated[indicator].dropna(),
lags=72,
alpha=0.05,
title=f'{indicator}',
color=colors[i],
ax=ax)
plt.xticks(np.arange(0, 73, 12))
plt.tight_layout()
plt.savefig('./images/all_acf_subplots.png', dpi=200, bbox_inches='tight')
plt.show()
for i, indicator in enumerate(indicators):
plt.figure(figsize=(12, 6))
plot_acf(interpolated[indicator].dropna(),
lags=72,
alpha=0.05,
title=f'{indicator} ACF',
color=colors[i])
plt.xticks(np.arange(0, 73, 12))
plt.savefig(f'./images/acf_{indicator}.png', dpi=200, bbox_inches='tight')
plt.close()
<Figure size 1200x600 with 0 Axes>
<Figure size 1200x600 with 0 Axes>
<Figure size 1200x600 with 0 Axes>
<Figure size 1200x600 with 0 Axes>
<Figure size 1200x600 with 0 Axes>
<Figure size 1200x600 with 0 Axes>
<Figure size 1200x600 with 0 Axes>
题目2¶
简述各项指标间的相互关系。
In [19]:
#计算相关系数矩阵
correlation_matrix = data.iloc[:, 1:].corr()
#绘制热力图
plot_heatmap(correlation_matrix,20,16,title="Correlation Matrix Heatmap",save_path="./images/correlation_heatmap.png")
In [20]:
#主成分分析(PCA)
data=pd.read_excel('北京市空气质量指数与气象数据.xlsx')
PCA_data=data.iloc[:,2:]#去除日期列
# 计算KMO值
kmo_all, kmo_model = calculate_kmo(PCA_data)
print(f"KMO值: {kmo_model.round(3)}")
# 进行巴赫利特检验
chi_square_value, p_value = calculate_bartlett_sphericity(PCA_data)
print(f"巴赫利特检验卡方值: {chi_square_value.round(3)}, p值: {p_value}")
#判断
if kmo_model>0.7 and p_value<0.05:
print("数据适合进行主成分分析",'\n')
else:
print("数据不适合进行主成分分析",'\n')
# 数据标准化
scaled_data = (PCA_data - PCA_data.mean()) / PCA_data.std()
scaled_data = scaled_data.dropna()#去除空值
# 计算协方差矩阵
cov_matrix = np.cov(scaled_data, rowvar=False)
# 计算特征值和特征向量
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]
# 绘制累计方差解释比例图
explained_variance_ratio = sorted_eigenvalues / np.sum(sorted_eigenvalues)
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
print("累计方差解释比例:", [f"{cum * 100:.2f}%" for cum in cumulative_explained_variance])
plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o')
plt.xlabel('主成分数量')
plt.ylabel('累计方差解释比例')
plt.title('PCA 累计方差解释比例')
plt.savefig('./images/PCA_cumulative_explained_variance.png', dpi=200, bbox_inches='tight')
plt.show()
# 选择特征值大于1的作为主成分
mask = sorted_eigenvalues > 1
selected_eigenvectors = sorted_eigenvectors[:, mask]
# 计算因子载荷矩阵
loadings = selected_eigenvectors * np.sqrt(sorted_eigenvalues[mask])
# 使用Varimax旋转载荷矩阵
rotator = Rotator(method='varimax')
rotated_loadings = rotator.fit_transform(loadings)
# 输出旋转后的成分矩阵
rotated_components_df = pd.DataFrame(rotated_loadings,
index=PCA_data.columns,
columns=[f'Factor{i+1}' for i in range(rotated_loadings.shape[1])])
rotated_components_df = rotated_components_df.round(3)
# 输出排序后的载荷矩阵
rotated_components_df=sort_matrix_by_diag(rotated_components_df)
print("旋转后的载荷矩阵(排序后):\n", rotated_components_df)
plot_heatmap(rotated_components_df, 4, 8,save_path="./images/components_heatmap.png")
KMO值: 0.762 巴赫利特检验卡方值: 90424.712, p值: 0.0 数据适合进行主成分分析 累计方差解释比例: ['31.41%', '54.60%', '66.53%', '73.02%', '78.89%', '84.04%', '88.27%', '91.46%', '93.59%', '95.70%', '97.14%', '98.29%', '98.91%', '99.26%', '99.55%', '99.79%', '99.96%', '100.00%', '100.00%']
旋转后的载荷矩阵(排序后): Factor1 Factor3 Factor2 Factor4 Factor5 Tn -0.963 -0.035 -0.071 -0.079 0.035 T -0.958 0.138 -0.033 0.074 -0.028 Tx -0.954 0.014 -0.045 -0.052 -0.063 P 0.924 0.029 -0.071 -0.032 -0.000 Po 0.921 0.029 -0.073 -0.033 -0.000 Td -0.898 -0.366 -0.043 -0.013 0.076 O3 -0.637 0.529 0.030 0.239 -0.084 U -0.322 -0.824 0.008 -0.156 0.229 Ff -0.045 0.772 0.126 0.024 0.172 NO2 0.300 -0.728 0.290 0.110 -0.202 CO -0.101 -0.695 0.449 0.298 -0.007 VV 0.153 0.667 -0.531 -0.093 -0.175 AQI -0.017 -0.038 0.967 0.025 -0.029 PM10 0.037 0.060 0.933 -0.092 0.003 PM2.5 0.049 -0.359 0.879 0.149 -0.007 Pa 0.006 -0.055 0.147 -0.747 -0.130 SO2 -0.035 -0.099 0.208 0.694 -0.065 RRR -0.139 -0.094 -0.103 -0.077 0.819 tR 0.163 0.120 0.087 0.131 0.512
题目3¶
令2022年11月1日至2023年9月30日的空气质量数据为训练集,剩余数据为测试集。基于训练集,尝试使用两种不同的方法构建空气质量指数预测模型,并在测试集上测试。比较所选模型的预测效果。
In [21]:
#重新读取数据
data = pd.read_excel('北京市空气质量指数与气象数据.xlsx')
data['date_hour'] = pd.to_datetime(data['date']) + pd.to_timedelta(data['hour'], unit='h')
data = data[['date_hour', 'date', 'hour', 'AQI']].set_index('date_hour')
data = data.asfreq('3h')
data['AQI'] = data['AQI'].ffill()
(1)SARIMA模型¶
In [22]:
"""
该模型在假设不知道测试集其他指标的情况下,仅使用AQI历史数据预测未来AQI。
整体运行时间约在1min左右,请耐心等待。
"""
# 数据切分
train_end = pd.Timestamp('2023-09-30 23:00:00')
test_start = pd.Timestamp('2023-10-01 02:00:00')
train = data.loc[:train_end, 'AQI']
test = data.loc[test_start:, 'AQI']
# 自动参数搜索
print("开始自动参数搜索(请耐心等待)...")
model = auto_arima(
train,
start_p=0, start_q=0,
max_p=1, max_q=1,
seasonal=True,
m=8,
d=1,
D=1,
trace=False,
error_action='ignore',
suppress_warnings=True,
stepwise=True
)
print(f"最优参数组合:Order{model.order} Seasonal{model.seasonal_order}")
# 初始化模型
current_model = SARIMAX(train, order=model.order, seasonal_order=model.seasonal_order)
current_results = current_model.fit(disp=False)
# 预测
predictions = []
lower_bounds = []
upper_bounds = []
print("开始单步滚动预测(请耐心等待)...")
for t in test.index:
forecast = current_results.get_forecast(steps=1)
pred_mean = forecast.predicted_mean.iloc[0]
pred_ci = forecast.conf_int().iloc[0]
predictions.append(pred_mean)
lower_bounds.append(pred_ci.iloc[0])
upper_bounds.append(pred_ci.iloc[1])
current_results = current_results.append(test.loc[[t]], refit=False)
forecast_df = pd.DataFrame({
'predicted': predictions,
'lower': lower_bounds,
'upper': upper_bounds
}, index=test.index)
# 结果处理
valid_mask = forecast_df['predicted'].notna()
y_actual_valid = test[valid_mask]
y_pred_valid = forecast_df.loc[valid_mask, 'predicted']
# 可视化
plt.figure(figsize=(15, 6))
train_last_3days = train.loc[train.index[-24]:]
train_last_3days.plot(label='训练集(最后3天)', alpha=0.7)
test.plot(label='实际值', color='green', alpha=0.7)
forecast_df['predicted'].plot(style='--', marker='o', markersize=5, label='单步预测值', color='red')
plt.fill_between(forecast_df.index,
forecast_df['lower'],
forecast_df['upper'],
color='pink', alpha=0.3, label='95%置信区间')
plt.axvline(test_start, color='gray', linestyle='--', alpha=0.6)
plt.title('AQI单步滚动预测结果 (SARIMA模型)')
plt.xlabel('时间')
plt.ylabel('AQI')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('./images/AQI-SARIMA.png', dpi=200, bbox_inches='tight')
plt.show()
# 评估指标
metrics = cal_metrics(y_actual_valid, y_pred_valid)
print("\n单步预测评估结果:")
print(f"有效预测点数: {len(y_actual_valid)}/{len(test)}")
for k, v in metrics.items():
print(f"{k}: {v:.3f}")
开始自动参数搜索(请耐心等待)... 最优参数组合:Order(0, 1, 1) Seasonal(2, 1, 0, 8) 开始单步滚动预测(请耐心等待)...
单步预测评估结果: 有效预测点数: 248/248 RMSE: 11.893 R-squared: 0.932 MAE: 7.744
(2)XGBOOST模型¶
In [23]:
"""
该模型在假设不考虑测试集其他指标的情况下,仅使用AQI数据对未来AQI进行<单步预测>,即每次预测都是根据之前时间点的真实AQI值进行的。
整体运行时间约为25s,请耐心等待。
"""
#特征工程
data=data[['AQI']]
data_processed = data.copy()
#时间分解特征
# 基础特征
data_processed['hour'] = data_processed.index.hour
data_processed['day_of_week'] = data_processed.index.dayofweek
data_processed['month'] = data_processed.index.month
# 周期性编码
data_processed['hour_sin'] = np.sin(2 * np.pi * data_processed['hour'] / 24)
data_processed['hour_cos'] = np.cos(2 * np.pi * data_processed['hour'] / 24)
data_processed['week_sin'] = np.sin(2 * np.pi * data_processed['day_of_week'] / 7)
data_processed['week_cos'] = np.cos(2 * np.pi * data_processed['day_of_week'] / 7)
#滞后特征
# 生成3小时粒度的滞后特征(最多7天)
lags = [i for i in range(1, 7 * 8 + 1)] # 7天*每天8个时间点(3小时间隔)
for lag in lags:
data_processed[f'AQI_lag_{lag}'] = data_processed['AQI'].shift(lag)
# 划分数据集
train_data = data_processed.loc['2022-11-01':'2023-09-30']
test_data = data_processed.loc['2023-10-01':]
# 特征选择
features = [col for col in train_data.columns if col != 'AQI']
X_train, y_train = train_data[features], train_data['AQI']
X_test, y_test = test_data[features], test_data['AQI']
In [24]:
#随机搜索法参数调优(这里耗时较长,请耐心等待)
param_dist = {
'n_estimators': [100, 200, 300],
'max_depth': randint(5, 10),
'learning_rate': uniform(0.01, 0.2),
'subsample': uniform(0.7, 0.3),
'colsample_bytree': uniform(0.7, 0.3),
'gamma': uniform(0, 0.3)
}
search = RandomizedSearchCV(
XGBRegressor(n_jobs=-1, random_state=42),
param_distributions=param_dist,
n_iter=10,
cv=3,
scoring='neg_mean_absolute_error',
verbose=1,
random_state=42
)
search.fit(X_train, y_train)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Out[24]:
RandomizedSearchCV(cv=3, estimator=XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=... 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001F4345D2330>, 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001F433D34290>, 'n_estimators': [100, 200, 300], 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001F433D355E0>}, random_state=42, scoring='neg_mean_absolute_error', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=3, estimator=XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=... 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001F4345D2330>, 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001F433D34290>, 'n_estimators': [100, 200, 300], 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001F433D355E0>}, random_state=42, scoring='neg_mean_absolute_error', verbose=1)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.9826605267054558, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=0.16898646535366177, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.08708330050798323, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=6, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=100, n_jobs=-1, num_parallel_tree=None, random_state=42, ...)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.9826605267054558, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=0.16898646535366177, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.08708330050798323, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=6, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=100, n_jobs=-1, num_parallel_tree=None, random_state=42, ...)
In [25]:
#模型预测
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
#评估指标
metrics=cal_metrics(y_pred, y_test)
#输出结果
print("最佳参数组合:", search.best_params_)
print("评估指标:")
for k, v in metrics.items():
print(f"{k}: {v:.3f}")
最佳参数组合: {'colsample_bytree': 0.9826605267054558, 'gamma': 0.16898646535366177, 'learning_rate': 0.08708330050798323, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.7692681476866446} 评估指标: RMSE: 11.815 R-squared: 0.929 MAE: 7.722
In [26]:
#预测结果可视化
def plot_results(y_true, y_pred, timestamps):
plt.figure(figsize=(18, 8))
ax = plt.gca()
# 绘制预测曲线
ax.plot(timestamps, y_true, label='真实值',
marker='o', markersize=4, linewidth=1, alpha=0.8)
ax.plot(timestamps, y_pred, label='预测值',
linestyle='--', marker='x', markersize=5, alpha=0.9)
# 设置时间轴格式
ax.xaxis.set_major_locator(HourLocator(interval=12))
ax.xaxis.set_minor_locator(HourLocator(interval=3))
ax.xaxis.set_major_formatter(DateFormatter("%m-%d %H:%M"))
# 增强可视化元素
plt.title(f'AQI预测效果对比(MAE={metrics["MAE"]:.2f}, R-squared={metrics["R-squared"]:.2f})',
fontsize=14, pad=20)
plt.xlabel('时间', fontsize=12)
plt.ylabel('AQI', fontsize=12)
plt.grid(True, which='both', linestyle='--', alpha=0.5)
plt.legend()
# 自动调整标签
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('./images/xg_by_step.png', dpi=200, bbox_inches='tight')
plt.show()
plot_results(y_test, y_pred, test_data.index)
#特征重要性可视化
def plot_importance(model, features, top_n=20):
importance = pd.Series(model.feature_importances_, index=features)
top_features = importance.sort_values(ascending=False)[:top_n]
plt.figure(figsize=(12, 8))
ax = top_features.sort_values().plot.barh()
# 添加数据标签
for i in ax.patches:
ax.text(i.get_width() + 0.02, i.get_y() + 0.2,
f'{i.get_width():.2f}',
fontsize=10, color='dimgrey')
plt.title('Top {} 重要特征'.format(top_n), fontsize=14)
plt.xlabel('特征重要性', fontsize=12)
plt.tight_layout()
plt.savefig('./images/xg_feature_importance.png', dpi=200, bbox_inches='tight')
plt.show()
np.random.seed(42)
plot_importance(best_model, features)