diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b0404ea --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.idea/ +.ipynb_checkpoints/ +checkpoint/ +__pycache__/ +.DS_Store +jupyter/ \ No newline at end of file diff --git a/XG_pred_by_step.png b/XG_pred_by_step.png deleted file mode 100644 index 678dc04..0000000 Binary files a/XG_pred_by_step.png and /dev/null differ diff --git a/XG_top_importance.png b/XG_top_importance.png deleted file mode 100644 index db6cbef..0000000 Binary files a/XG_top_importance.png and /dev/null differ diff --git a/__pycache__/calculate.cpython-312.pyc b/__pycache__/calculate.cpython-312.pyc index 56bdc6c..aecdaed 100644 Binary files a/__pycache__/calculate.cpython-312.pyc and b/__pycache__/calculate.cpython-312.pyc differ diff --git a/__pycache__/heatmap.cpython-312.pyc b/__pycache__/heatmap.cpython-312.pyc index 314d919..380e304 100644 Binary files a/__pycache__/heatmap.cpython-312.pyc and b/__pycache__/heatmap.cpython-312.pyc differ diff --git a/__pycache__/sort_matrix.cpython-312.pyc b/__pycache__/sort_matrix.cpython-312.pyc index db38b82..85782f2 100644 Binary files a/__pycache__/sort_matrix.cpython-312.pyc and b/__pycache__/sort_matrix.cpython-312.pyc differ diff --git a/air_quality_prediction.ipynb b/air_quality_prediction.ipynb index 6d3c00d..9571e6f 100644 --- a/air_quality_prediction.ipynb +++ b/air_quality_prediction.ipynb @@ -13,259 +13,92 @@ "id": "b610f839dca4877" }, { - "cell_type": "code", - "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2025-03-24T00:39:20.612791Z", - "start_time": "2025-03-24T00:39:20.608728Z" + "end_time": "2025-03-24T07:01:00.767221Z", + "start_time": "2025-03-24T07:00:54.883547Z" } }, + "cell_type": "code", "source": [ + "import os\n", + "import sys\n", + "\n", "#导入基础包\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", + "from calculate import *\n", + "from heatmap import *\n", + "from statsmodels.graphics.tsaplots import plot_acf\n", + "import matplotlib.font_manager as fm\n", + "font_path = '/System/Library/Fonts/STHeiti Light.ttc' # 替换为实际可用的字体文件路径\n", + "font_prop = fm.FontProperties(fname=font_path)\n", + "plt.rcParams['font.family'] = font_prop.get_name()\n", "\n", - "#导入主成分分析相关包\n", + "# 导入主成分分析相关包\n", "from factor_analyzer import Rotator\n", "from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo\n", - "# from sklearn.decomposition import PCA\n", - "# from sklearn.preprocessing import StandardScaler\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", "\n", - "#导入XGBOOST相关包\n", + "# 导入SARIMA相关包\n", + "from statsmodels.tsa.statespace.sarimax import SARIMAX\n", + "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", + "\n", + "# 导入XGBOOST相关包\n", "from xgboost import XGBRegressor\n", "from scipy.stats import randint, uniform\n", "from sklearn.model_selection import RandomizedSearchCV\n", "from matplotlib.dates import DateFormatter, HourLocator\n", "\n", - "#导入单独写的函数\n", + "# 导入单独写的函数\n", "from calculate import *\n", "from heatmap import *\n", "from sort_matrix import *" ], + "id": "initial_id", "outputs": [], - "execution_count": 18 + "execution_count": 2 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-03-24T00:39:21.076798Z", - "start_time": "2025-03-24T00:39:20.619798Z" + "end_time": "2025-03-24T03:36:16.367026Z", + "start_time": "2025-03-24T03:36:15.877757Z" } }, "cell_type": "code", "source": [ - "#设置字体\n", - "plt.rcParams['font.family'] = 'SimHei'\n", - "#读取数据\n", + "# 设置字体\n", + "if sys.platform == 'darwin': # macOS\n", + " font_path = '/System/Library/Fonts/STHeiti Light.ttc'\n", + "elif sys.platform == 'win32': # Windows\n", + " plt.rcParams['font.sans-serif'] = ['SimHei'] # Windows系统自带黑体\n", + " plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题\n", + "else: # Linux/其他系统\n", + " font_path = '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc' # 文泉驿字体\n", + "\n", + "# 仅非Windows系统需要加载字体文件\n", + "if sys.platform != 'win32':\n", + " try:\n", + " font_prop = fm.FontProperties(fname=font_path)\n", + " plt.rcParams['font.family'] = font_prop.get_name()\n", + " except:\n", + " print(f\"警告:{font_path} 字体加载失败,请检查路径有效性\")\n", + "# 读取数据\n", "data=pd.read_excel('北京市空气质量指数与气象数据.xlsx')\n", - "data.head()" + "data.head()\n", + "\n", + "try:\n", + " os.mkdir('./images')\n", + "except FileExistsError:\n", + " pass" ], "id": "92ea7ba1218799cd", - "outputs": [ - { - "data": { - "text/plain": [ - " date hour AQI CO NO2 O3 PM10 \\\n", - "0 2022-11-01 2 18.371429 0.211429 23.771429 29.057143 13.257143 \n", - "1 2022-11-01 5 21.914286 0.180000 26.571429 20.142857 18.914286 \n", - "2 2022-11-01 8 28.628571 0.311429 30.028571 14.285714 27.942857 \n", - "3 2022-11-01 11 19.000000 0.237143 17.971429 40.529412 17.852941 \n", - "4 2022-11-01 14 21.742857 0.252941 15.588235 53.617647 20.941176 \n", - "\n", - " PM2.5 SO2 T ... P Pa U Ff Tn Tx VV Td \\\n", - "0 3.057143 2.628571 6.7 ... 770.5 0.1 36.0 1.0 5.3 17.3 30.0 -7.3 \n", - "1 3.771429 2.542857 2.0 ... 770.8 0.3 62.0 0.0 1.9 17.3 7.0 -4.5 \n", - "2 6.857143 2.400000 6.6 ... 771.7 0.9 56.0 0.0 0.9 17.3 10.0 -7.1 \n", - "3 5.914286 2.176471 13.5 ... 771.3 -0.4 19.0 2.0 0.9 17.3 30.0 -9.7 \n", - "4 6.742857 2.000000 15.7 ... 768.6 -2.7 19.0 2.0 0.9 17.3 30.0 -7.9 \n", - "\n", - " RRR tR \n", - "0 0.0 12 \n", - "1 0.0 12 \n", - "2 0.0 12 \n", - "3 0.0 12 \n", - "4 0.0 12 \n", - "\n", - "[5 rows x 21 columns]" - ], - "text/html": [ - "
\n", - " | date | \n", - "hour | \n", - "AQI | \n", - "CO | \n", - "NO2 | \n", - "O3 | \n", - "PM10 | \n", - "PM2.5 | \n", - "SO2 | \n", - "T | \n", - "... | \n", - "P | \n", - "Pa | \n", - "U | \n", - "Ff | \n", - "Tn | \n", - "Tx | \n", - "VV | \n", - "Td | \n", - "RRR | \n", - "tR | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "2022-11-01 | \n", - "2 | \n", - "18.371429 | \n", - "0.211429 | \n", - "23.771429 | \n", - "29.057143 | \n", - "13.257143 | \n", - "3.057143 | \n", - "2.628571 | \n", - "6.7 | \n", - "... | \n", - "770.5 | \n", - "0.1 | \n", - "36.0 | \n", - "1.0 | \n", - "5.3 | \n", - "17.3 | \n", - "30.0 | \n", - "-7.3 | \n", - "0.0 | \n", - "12 | \n", - "
1 | \n", - "2022-11-01 | \n", - "5 | \n", - "21.914286 | \n", - "0.180000 | \n", - "26.571429 | \n", - "20.142857 | \n", - "18.914286 | \n", - "3.771429 | \n", - "2.542857 | \n", - "2.0 | \n", - "... | \n", - "770.8 | \n", - "0.3 | \n", - "62.0 | \n", - "0.0 | \n", - "1.9 | \n", - "17.3 | \n", - "7.0 | \n", - "-4.5 | \n", - "0.0 | \n", - "12 | \n", - "
2 | \n", - "2022-11-01 | \n", - "8 | \n", - "28.628571 | \n", - "0.311429 | \n", - "30.028571 | \n", - "14.285714 | \n", - "27.942857 | \n", - "6.857143 | \n", - "2.400000 | \n", - "6.6 | \n", - "... | \n", - "771.7 | \n", - "0.9 | \n", - "56.0 | \n", - "0.0 | \n", - "0.9 | \n", - "17.3 | \n", - "10.0 | \n", - "-7.1 | \n", - "0.0 | \n", - "12 | \n", - "
3 | \n", - "2022-11-01 | \n", - "11 | \n", - "19.000000 | \n", - "0.237143 | \n", - "17.971429 | \n", - "40.529412 | \n", - "17.852941 | \n", - "5.914286 | \n", - "2.176471 | \n", - "13.5 | \n", - "... | \n", - "771.3 | \n", - "-0.4 | \n", - "19.0 | \n", - "2.0 | \n", - "0.9 | \n", - "17.3 | \n", - "30.0 | \n", - "-9.7 | \n", - "0.0 | \n", - "12 | \n", - "
4 | \n", - "2022-11-01 | \n", - "14 | \n", - "21.742857 | \n", - "0.252941 | \n", - "15.588235 | \n", - "53.617647 | \n", - "20.941176 | \n", - "6.742857 | \n", - "2.000000 | \n", - "15.7 | \n", - "... | \n", - "768.6 | \n", - "-2.7 | \n", - "19.0 | \n", - "2.0 | \n", - "0.9 | \n", - "17.3 | \n", - "30.0 | \n", - "-7.9 | \n", - "0.0 | \n", - "12 | \n", - "
5 rows × 21 columns
\n", - "RandomizedSearchCV(cv=3,\n", + "RandomizedSearchCV(cv=3,\n", " estimator=XGBRegressor(base_score=None, booster=None,\n", " callbacks=None,\n", " colsample_bylevel=None,\n", @@ -1073,15 +1152,16 @@ " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric=None, feature_types=None,\n", - " gamma=None, grow_policy=None,\n", + " feature_weights=None, gamma=None,\n", + " grow_policy=None,\n", " importance_type=None,\n", - " interaction_constraints=None,\n", - " learning_rate=...\n", - " 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001EB86B7EC90>,\n", - " 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001EBFF5165D0>,\n", + " interaction_constraint...\n", + " 'gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x142527890>,\n", + " 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x144aa2de0>,\n", + " 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x144a6e690>,\n", " 'n_estimators': [100, 200, 300],\n", - " 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001EB86A1C5F0>},\n", - " scoring='neg_mean_absolute_error', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.RandomizedSearchCV(cv=3,\n", + " 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x144aa1d90>},\n", + " scoring='neg_mean_absolute_error', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.RandomizedSearchCV(cv=3,\n", " estimator=XGBRegressor(base_score=None, booster=None,\n", " callbacks=None,\n", " colsample_bylevel=None,\n", @@ -1090,51 +1170,50 @@ " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric=None, feature_types=None,\n", - " gamma=None, grow_policy=None,\n", + " feature_weights=None, gamma=None,\n", + " grow_policy=None,\n", " importance_type=None,\n", - " interaction_constraints=None,\n", - " learning_rate=...\n", - " 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001EB86B7EC90>,\n", - " 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001EBFF5165D0>,\n", + " interaction_constraint...\n", + " 'gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x142527890>,\n", + " 'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x144aa2de0>,\n", + " 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x144a6e690>,\n", " 'n_estimators': [100, 200, 300],\n", - " 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001EB86A1C5F0>},\n", - " scoring='neg_mean_absolute_error', verbose=1)" + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " feature_weights=None, gamma=None, grow_policy=None,\n", + " importance_type=None, interaction_constraints=None,\n", + " learning_rate=None, max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n", + " max_leaves=None, min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=None,\n", + " n_jobs=-1, num_parallel_tree=None, ...)XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x144aa1d90>},\n", + " scoring='neg_mean_absolute_error', verbose=1)XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", - " colsample_bytree=0.7508184716426058, device=None,\n", - " early_stopping_rounds=None, enable_categorical=False,\n", - " eval_metric=None, feature_types=None, gamma=0.020833743645897518,\n", - " grow_policy=None, importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.05075327204554973,\n", - " max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=5, max_leaves=None,\n", - " min_child_weight=None, missing=nan, monotone_constraints=None,\n", - " multi_strategy=None, n_estimators=300, n_jobs=-1,\n", - " num_parallel_tree=None, random_state=42, ...)XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " feature_weights=None, gamma=None, grow_policy=None,\n", + " importance_type=None, interaction_constraints=None,\n", + " learning_rate=None, max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n", + " max_leaves=None, min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=None,\n", + " n_jobs=-1, num_parallel_tree=None, ...)XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", - " colsample_bytree=0.7508184716426058, device=None,\n", - " early_stopping_rounds=None, enable_categorical=False,\n", - " eval_metric=None, feature_types=None, gamma=0.020833743645897518,\n", - " grow_policy=None, importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.05075327204554973,\n", - " max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=5, max_leaves=None,\n", - " min_child_weight=None, missing=nan, monotone_constraints=None,\n", - " multi_strategy=None, n_estimators=300, n_jobs=-1,\n", - " num_parallel_tree=None, random_state=42, ...)