{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": [ "# 预测建模\n", "北京市空气质量指数预测(推荐难度系数10)\n", "\n", "这个数据集是北京市2022年11月1日至2023年10月31日期间空气质量相关数据。\n", "根据这个数据集,回答以下问题" ], "id": "b610f839dca4877" },
 { "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-03-26T08:40:24.864528Z", "start_time": "2025-03-26T08:40:24.859852Z" } }, "cell_type": "code", "source": [ "import os\n", "import sys\n", "\n", "#导入基础包\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from statsmodels.graphics.tsaplots import plot_acf\n", "import matplotlib.font_manager as fm\n", "\n", "# 导入主成分分析相关包\n", "from factor_analyzer import Rotator\n", "from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo\n", "\n", "# 导入SARIMA相关包\n", "from statsmodels.tsa.statespace.sarimax import SARIMAX\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", "from pmdarima import auto_arima\n", "import pmdarima as pm\n", "\n", "# 导入XGBOOST相关包\n", "from xgboost import XGBRegressor\n", "from scipy.stats import randint, uniform\n", "from sklearn.model_selection import RandomizedSearchCV\n", "from matplotlib.dates import DateFormatter, HourLocator\n", "\n", "# 导入单独写的函数\n", "from calculate import *\n", "from heatmap import *\n", "from sort_matrix import *" ], "id": "initial_id", "outputs": [], "execution_count": 14 },
 { "metadata": { "ExecuteTime": { "end_time": "2025-03-26T08:40:25.424563Z", "start_time": "2025-03-26T08:40:24.935756Z" } }, "cell_type": "code", "source": [
 "# Configure a CJK-capable font so Chinese labels render in figures.\n",
 "if sys.platform == 'win32':  # Windows\n",
 "    # SimHei ships with Windows, so selecting it by name is enough.\n",
 "    plt.rcParams['font.sans-serif'] = ['SimHei']\n",
 "else:\n",
 "    # Other platforms point at a font file on disk.\n",
 "    if sys.platform == 'darwin':  # macOS\n",
 "        font_path = '/System/Library/Fonts/STHeiti Light.ttc'\n",
 "    else:  # Linux / other systems: WenQuanYi Zen Hei\n",
 "        font_path = '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc'\n",
 "    font_prop = fm.FontProperties(fname=font_path)\n",
 "    try:\n",
 "        # Register the file first so the family name set below can be resolved.\n",
 "        fm.fontManager.addfont(font_path)\n",
 "        plt.rcParams['font.family'] = font_prop.get_name()\n",
 "    except (OSError, RuntimeError):\n",
 "        # Best effort: a missing or unreadable font file must not stop the notebook,\n",
 "        # but unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.\n",
 "        print(f\"警告:{font_path} 字体加载失败,请检查路径有效性\")\n",
 "\n",
 "# Draw negative tick labels with the ASCII hyphen on every platform;\n",
 "# CJK fonts commonly lack the Unicode minus glyph.\n",
 "plt.rcParams['axes.unicode_minus'] = False\n",
 "\n",
 "# Create the output folders; exist_ok keeps the cell safe to re-run.\n",
 "for out_dir in ('./images', './results'):\n",
 "    os.makedirs(out_dir, exist_ok=True)\n",
 "\n",
 "# Load the data set and preview the first rows.\n",
 "data = pd.read_excel('北京市空气质量指数与气象数据.xlsx')\n",
 "data.head()" ], "id": "92ea7ba1218799cd", "outputs": [ { "data": { "text/plain": [ " date hour AQI CO NO2 O3 PM10 \\\n", "0 2022-11-01 2 18.371429 0.211429 23.771429 29.057143 13.257143 \n", "1 2022-11-01 5 21.914286 0.180000 26.571429 20.142857 18.914286 \n", "2 2022-11-01 8 28.628571 0.311429 30.028571 14.285714 27.942857 \n", "3 2022-11-01 11 19.000000 0.237143 17.971429 40.529412 17.852941 \n", "4 2022-11-01 14 21.742857 0.252941 15.588235 53.617647 20.941176 \n", "\n", " PM2.5 SO2 T ... P Pa U Ff Tn Tx VV Td \\\n", "0 3.057143 2.628571 6.7 ... 770.5 0.1 36.0 1.0 5.3 17.3 30.0 -7.3 \n", "1 3.771429 2.542857 2.0 ... 770.8 0.3 62.0 0.0 1.9 17.3 7.0 -4.5 \n", "2 6.857143 2.400000 6.6 ... 771.7 0.9 56.0 0.0 0.9 17.3 10.0 -7.1 \n", "3 5.914286 2.176471 13.5 ... 771.3 -0.4 19.0 2.0 0.9 17.3 30.0 -9.7 \n", "4 6.742857 2.000000 15.7 ... 768.6 -2.7 19.0 2.0 0.9 17.3 30.0 -7.9 \n", "\n", " RRR tR \n", "0 0.0 12 \n", "1 0.0 12 \n", "2 0.0 12 \n", "3 0.0 12 \n", "4 0.0 12 \n", "\n", "[5 rows x 21 columns]" ], "text/html": [ "
\n", " | date | \n", "hour | \n", "AQI | \n", "CO | \n", "NO2 | \n", "O3 | \n", "PM10 | \n", "PM2.5 | \n", "SO2 | \n", "T | \n", "... | \n", "P | \n", "Pa | \n", "U | \n", "Ff | \n", "Tn | \n", "Tx | \n", "VV | \n", "Td | \n", "RRR | \n", "tR | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2022-11-01 | \n", "2 | \n", "18.371429 | \n", "0.211429 | \n", "23.771429 | \n", "29.057143 | \n", "13.257143 | \n", "3.057143 | \n", "2.628571 | \n", "6.7 | \n", "... | \n", "770.5 | \n", "0.1 | \n", "36.0 | \n", "1.0 | \n", "5.3 | \n", "17.3 | \n", "30.0 | \n", "-7.3 | \n", "0.0 | \n", "12 | \n", "
1 | \n", "2022-11-01 | \n", "5 | \n", "21.914286 | \n", "0.180000 | \n", "26.571429 | \n", "20.142857 | \n", "18.914286 | \n", "3.771429 | \n", "2.542857 | \n", "2.0 | \n", "... | \n", "770.8 | \n", "0.3 | \n", "62.0 | \n", "0.0 | \n", "1.9 | \n", "17.3 | \n", "7.0 | \n", "-4.5 | \n", "0.0 | \n", "12 | \n", "
2 | \n", "2022-11-01 | \n", "8 | \n", "28.628571 | \n", "0.311429 | \n", "30.028571 | \n", "14.285714 | \n", "27.942857 | \n", "6.857143 | \n", "2.400000 | \n", "6.6 | \n", "... | \n", "771.7 | \n", "0.9 | \n", "56.0 | \n", "0.0 | \n", "0.9 | \n", "17.3 | \n", "10.0 | \n", "-7.1 | \n", "0.0 | \n", "12 | \n", "
3 | \n", "2022-11-01 | \n", "11 | \n", "19.000000 | \n", "0.237143 | \n", "17.971429 | \n", "40.529412 | \n", "17.852941 | \n", "5.914286 | \n", "2.176471 | \n", "13.5 | \n", "... | \n", "771.3 | \n", "-0.4 | \n", "19.0 | \n", "2.0 | \n", "0.9 | \n", "17.3 | \n", "30.0 | \n", "-9.7 | \n", "0.0 | \n", "12 | \n", "
4 | \n", "2022-11-01 | \n", "14 | \n", "21.742857 | \n", "0.252941 | \n", "15.588235 | \n", "53.617647 | \n", "20.941176 | \n", "6.742857 | \n", "2.000000 | \n", "15.7 | \n", "... | \n", "768.6 | \n", "-2.7 | \n", "19.0 | \n", "2.0 | \n", "0.9 | \n", "17.3 | \n", "30.0 | \n", "-7.9 | \n", "0.0 | \n", "12 | \n", "
5 rows × 21 columns
\n", "RandomizedSearchCV(cv=3,\n", " estimator=XGBRegressor(base_score=None, booster=None,\n", " callbacks=None,\n", " colsample_bylevel=None,\n", " colsample_bynode=None,\n", " colsample_bytree=None, device=None,\n", " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric=None, feature_types=None,\n", " gamma=None, grow_policy=None,\n", " importance_type=None,\n", " interaction_constraints=None,\n", " learning_rate=...\n", " 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001F4345D2330>,\n", " 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001F433D34290>,\n", " 'n_estimators': [100, 200, 300],\n", " 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001F433D355E0>},\n", " random_state=42, scoring='neg_mean_absolute_error',\n", " verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=3,\n", " estimator=XGBRegressor(base_score=None, booster=None,\n", " callbacks=None,\n", " colsample_bylevel=None,\n", " colsample_bynode=None,\n", " colsample_bytree=None, device=None,\n", " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric=None, feature_types=None,\n", " gamma=None, grow_policy=None,\n", " importance_type=None,\n", " interaction_constraints=None,\n", " learning_rate=...\n", " 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001F4345D2330>,\n", " 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001F433D34290>,\n", " 'n_estimators': [100, 200, 300],\n", " 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001F433D355E0>},\n", " random_state=42, scoring='neg_mean_absolute_error',\n", " verbose=1)
XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=0.9826605267054558, device=None,\n", " early_stopping_rounds=None, enable_categorical=False,\n", " eval_metric=None, feature_types=None, gamma=0.16898646535366177,\n", " grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.08708330050798323,\n", " max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=6, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " multi_strategy=None, n_estimators=100, n_jobs=-1,\n", " num_parallel_tree=None, random_state=42, ...)
XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=0.9826605267054558, device=None,\n", " early_stopping_rounds=None, enable_categorical=False,\n", " eval_metric=None, feature_types=None, gamma=0.16898646535366177,\n", " grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.08708330050798323,\n", " max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=6, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " multi_strategy=None, n_estimators=100, n_jobs=-1,\n", " num_parallel_tree=None, random_state=42, ...)