468 lines
13 KiB
Plaintext
468 lines
13 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"# 预测建模\n",
|
||
"北京市空气质量指数预测(推荐难度系数10)\n",
|
||
"\n",
|
||
"这个数据集是北京市2022年11月1日至2023年10月31日期间空气质量相关数据。\n",
|
||
"根据这个数据集,回答以下问题"
|
||
],
|
||
"id": "b610f839dca4877"
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "initial_id",
|
||
"metadata": {
|
||
"collapsed": true,
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:04.926730Z",
|
||
"start_time": "2025-03-22T07:55:03.071940Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"from calculate import *\n",
|
||
"from heatmap import *"
|
||
],
|
||
"outputs": [],
|
||
"execution_count": 1
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:05.632142Z",
|
||
"start_time": "2025-03-22T07:55:04.941177Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"#读取数据\n",
|
||
"data=pd.read_excel('北京市空气质量指数与气象数据.xlsx')\n",
|
||
"data.head()"
|
||
],
|
||
"id": "92ea7ba1218799cd",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" date hour AQI CO NO2 O3 PM10 \\\n",
|
||
"0 2022-11-01 2 18.371429 0.211429 23.771429 29.057143 13.257143 \n",
|
||
"1 2022-11-01 5 21.914286 0.180000 26.571429 20.142857 18.914286 \n",
|
||
"2 2022-11-01 8 28.628571 0.311429 30.028571 14.285714 27.942857 \n",
|
||
"3 2022-11-01 11 19.000000 0.237143 17.971429 40.529412 17.852941 \n",
|
||
"4 2022-11-01 14 21.742857 0.252941 15.588235 53.617647 20.941176 \n",
|
||
"\n",
|
||
" PM2.5 SO2 T ... P Pa U Ff Tn Tx VV Td \\\n",
|
||
"0 3.057143 2.628571 6.7 ... 770.5 0.1 36.0 1.0 5.3 17.3 30.0 -7.3 \n",
|
||
"1 3.771429 2.542857 2.0 ... 770.8 0.3 62.0 0.0 1.9 17.3 7.0 -4.5 \n",
|
||
"2 6.857143 2.400000 6.6 ... 771.7 0.9 56.0 0.0 0.9 17.3 10.0 -7.1 \n",
|
||
"3 5.914286 2.176471 13.5 ... 771.3 -0.4 19.0 2.0 0.9 17.3 30.0 -9.7 \n",
|
||
"4 6.742857 2.000000 15.7 ... 768.6 -2.7 19.0 2.0 0.9 17.3 30.0 -7.9 \n",
|
||
"\n",
|
||
" RRR tR \n",
|
||
"0 0.0 12 \n",
|
||
"1 0.0 12 \n",
|
||
"2 0.0 12 \n",
|
||
"3 0.0 12 \n",
|
||
"4 0.0 12 \n",
|
||
"\n",
|
||
"[5 rows x 21 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>hour</th>\n",
|
||
" <th>AQI</th>\n",
|
||
" <th>CO</th>\n",
|
||
" <th>NO2</th>\n",
|
||
" <th>O3</th>\n",
|
||
" <th>PM10</th>\n",
|
||
" <th>PM2.5</th>\n",
|
||
" <th>SO2</th>\n",
|
||
" <th>T</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>P</th>\n",
|
||
" <th>Pa</th>\n",
|
||
" <th>U</th>\n",
|
||
" <th>Ff</th>\n",
|
||
" <th>Tn</th>\n",
|
||
" <th>Tx</th>\n",
|
||
" <th>VV</th>\n",
|
||
" <th>Td</th>\n",
|
||
" <th>RRR</th>\n",
|
||
" <th>tR</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2022-11-01</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>18.371429</td>\n",
|
||
" <td>0.211429</td>\n",
|
||
" <td>23.771429</td>\n",
|
||
" <td>29.057143</td>\n",
|
||
" <td>13.257143</td>\n",
|
||
" <td>3.057143</td>\n",
|
||
" <td>2.628571</td>\n",
|
||
" <td>6.7</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>770.5</td>\n",
|
||
" <td>0.1</td>\n",
|
||
" <td>36.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>5.3</td>\n",
|
||
" <td>17.3</td>\n",
|
||
" <td>30.0</td>\n",
|
||
" <td>-7.3</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2022-11-01</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>21.914286</td>\n",
|
||
" <td>0.180000</td>\n",
|
||
" <td>26.571429</td>\n",
|
||
" <td>20.142857</td>\n",
|
||
" <td>18.914286</td>\n",
|
||
" <td>3.771429</td>\n",
|
||
" <td>2.542857</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>770.8</td>\n",
|
||
" <td>0.3</td>\n",
|
||
" <td>62.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.9</td>\n",
|
||
" <td>17.3</td>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>-4.5</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2022-11-01</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>28.628571</td>\n",
|
||
" <td>0.311429</td>\n",
|
||
" <td>30.028571</td>\n",
|
||
" <td>14.285714</td>\n",
|
||
" <td>27.942857</td>\n",
|
||
" <td>6.857143</td>\n",
|
||
" <td>2.400000</td>\n",
|
||
" <td>6.6</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>771.7</td>\n",
|
||
" <td>0.9</td>\n",
|
||
" <td>56.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.9</td>\n",
|
||
" <td>17.3</td>\n",
|
||
" <td>10.0</td>\n",
|
||
" <td>-7.1</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2022-11-01</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>19.000000</td>\n",
|
||
" <td>0.237143</td>\n",
|
||
" <td>17.971429</td>\n",
|
||
" <td>40.529412</td>\n",
|
||
" <td>17.852941</td>\n",
|
||
" <td>5.914286</td>\n",
|
||
" <td>2.176471</td>\n",
|
||
" <td>13.5</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>771.3</td>\n",
|
||
" <td>-0.4</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.9</td>\n",
|
||
" <td>17.3</td>\n",
|
||
" <td>30.0</td>\n",
|
||
" <td>-9.7</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2022-11-01</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>21.742857</td>\n",
|
||
" <td>0.252941</td>\n",
|
||
" <td>15.588235</td>\n",
|
||
" <td>53.617647</td>\n",
|
||
" <td>20.941176</td>\n",
|
||
" <td>6.742857</td>\n",
|
||
" <td>2.000000</td>\n",
|
||
" <td>15.7</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>768.6</td>\n",
|
||
" <td>-2.7</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.9</td>\n",
|
||
" <td>17.3</td>\n",
|
||
" <td>30.0</td>\n",
|
||
" <td>-7.9</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>12</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 21 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 2
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"## 题目1\n",
|
||
"研究单日内空气质量指数与各项指标的变化趋势,这种趋势是否具有周期性?"
|
||
],
|
||
"id": "bca65e544d8bef55"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:05.749697Z",
|
||
"start_time": "2025-03-22T07:55:05.746320Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"#数据预处理:将数据按小时分组,计算每个小时各指标的平均值\n",
|
||
"\n",
|
||
"#可视化:绘制各指标小时均值的折线图,观察是否存在规律性波动\n"
|
||
],
|
||
"id": "5f8e89a8d1561e4f",
|
||
"outputs": [],
|
||
"execution_count": 3
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:05.777089Z",
|
||
"start_time": "2025-03-22T07:55:05.774038Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": "#ACF检验周期性\n",
|
||
"id": "4521bfa63d480997",
|
||
"outputs": [],
|
||
"execution_count": 4
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"## 题目2\n",
|
||
"简述各项指标间的相互关系。"
|
||
],
|
||
"id": "59e20f3463e819a6"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:05.992326Z",
|
||
"start_time": "2025-03-22T07:55:05.988969Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"#计算相关系数矩阵\n",
|
||
"\n",
|
||
"#绘制热力图\n"
|
||
],
|
||
"id": "c917d14115569bcd",
|
||
"outputs": [],
|
||
"execution_count": 5
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:06.153442Z",
|
||
"start_time": "2025-03-22T07:55:06.150747Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": "#因子分析(PCA)\n",
|
||
"id": "509d783a82bbdcb2",
|
||
"outputs": [],
|
||
"execution_count": 6
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:06.261340Z",
|
||
"start_time": "2025-03-22T07:55:06.258833Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": "#多元线性回归(我试试玩的)\n",
|
||
"id": "bb2d87337f46df",
|
||
"outputs": [],
|
||
"execution_count": 7
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"## 题目3\n",
|
||
"令2022年11月1日至2023年9月30日的空气质量数据为训练集,剩余数据为测试集。基于训练集,尝试使用两种不同的方法构建空气质量指数预测模型,并在测试集上测试。比较所选模型的预测效果。"
|
||
],
|
||
"id": "3f89fa62a897a3e3"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:06.414915Z",
|
||
"start_time": "2025-03-22T07:55:06.410784Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": "#数据划分:训练集:2022-11-01至2023-09-30,测试集:2023-10-1至2023-10-31。\n",
|
||
"id": "d1bdac1e4e1562f2",
|
||
"outputs": [],
|
||
"execution_count": 8
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "### (1)SARIMA模型",
|
||
"id": "75bc1cfcc85f60a7"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:06.452015Z",
|
||
"start_time": "2025-03-22T07:55:06.446830Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"\"\"\"\n",
|
||
"该模型在假设不知道测试集其他指标的情况下,仅使用AQI历史数据预测未来AQI\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"#训练模型\n",
|
||
"\n",
|
||
"#输出预测与实际AQI的对比图\n",
|
||
"\n",
|
||
"#计算拟合度\n"
|
||
],
|
||
"id": "24996a0c06820cdc",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'\\n该模型在假设不知道测试集其他指标的情况下,仅使用AQI历史数据预测未来AQI\\n'"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 9
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "### (2)XGBOOST模型",
|
||
"id": "ebe88094b6c13e0c"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-22T07:55:06.482520Z",
|
||
"start_time": "2025-03-22T07:55:06.477496Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"\"\"\"\n",
|
||
"该模型在同样未知测试集其他指标的情况下,考虑到训练集的多种参数预测未来AQI\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"#训练模型\n",
|
||
"\n",
|
||
"#输出预测与实际AQI的对比图\n",
|
||
"\n",
|
||
"#计算拟合度\n"
|
||
],
|
||
"id": "66f104e110aba36",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'\\n该模型在同样未知测试集其他指标的情况下,考虑到训练集的多种参数预测未来AQI\\n'"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 10
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 2
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython2",
|
||
"version": "2.7.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|