1573 lines
37 KiB
Plaintext
1573 lines
37 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "196a647a-6faa-4aee-a0bf-a345852251dd",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 深入浅出pandas\n",
|
||
"\n",
|
||
"pandas是一个支持数据分析全流程的Python开源库,它的作者Wes McKinney于2008年开始开发这个库,其主要目标是提供一个大数据分析和处理的工具。pandas封装了从数据加载、数据重塑、数据清洗到数据透视、数据呈现等一系列操作,提供了三种核心的数据类型:\n",
|
||
"1. `Series`:数据系列,表示一维的数据。跟一维数组的区别在于每条数据都有对应的索引,处理数据的方法比`ndarray`更为丰富。\n",
|
||
"2. `DataFrame`:数据框、数据窗、数据表,表示二维的数据。跟二维数组相比,`DataFrame`有行索引和列索引,而且提供了100+方法来处理数据。\n",
|
||
"3. `Index`:为`Series`和`DataFrame`提供索引服务。"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "eb84f909-921a-47da-87b1-61578c871422",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"\n",
|
||
"plt.rcParams['font.sans-serif'].insert(0, 'SimHei')\n",
|
||
"plt.rcParams['axes.unicode_minus'] = False\n",
|
||
"get_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'svg'\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "2102e83e-2a6d-47aa-b449-c058bea1a601",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 创建DataFrame对象"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "87dbde08-dcab-4ede-a791-b56e11dd9115",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"np.random.seed(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "4c5b2767-2074-4cdf-b1ba-beff6f425942",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"stu_names = ['狄仁杰', '白起', '李元芳', '苏妲己', '孙尚香']\n",
|
||
"cou_names = ['语文', '数学', '英语']\n",
|
||
"scores_arr = np.random.randint(60, 101, (5, 3))\n",
|
||
"scores_arr"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f8c2a6bf-ca5e-479d-ab63-f5c3620186e3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 方法一:通过二维数组构造DataFrame对象\n",
|
||
"df1 = pd.DataFrame(data=scores_arr, columns=cou_names, index=stu_names)\n",
|
||
"df1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "baad5381-fb7d-4cc9-9288-a05d750144af",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 行索引\n",
|
||
"df1.index"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d7f06b76-b60b-49cb-be72-adafb0978fca",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 列索引\n",
|
||
"df1.columns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "13b1275d-77e5-4d5d-b227-19db3f4196fd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 值 - 二维数组\n",
|
||
"df1.values"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "dbf5bb11-1600-4ae4-bc95-369bc8189c20",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"scores_dict = {\n",
|
||
" '语文': [95, 91, 69, 82, 92],\n",
|
||
" '数学': [86, 88, 80, 67, 100],\n",
|
||
" '英语': [75, 86, 71, 94, 81]\n",
|
||
"}"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c300bbbd-329a-4852-bf76-78ce1de02b8f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 方法二:通过数据字典构造DataFrame对象\n",
|
||
"df2 = pd.DataFrame(data=scores_dict, index=stu_names)\n",
|
||
"df2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "705c0de6-43ff-46c6-85d5-301743d18d43",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 查看DataFrame信息\n",
|
||
"df2.info(memory_usage='deep')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "71417ac2-8f4b-4950-9336-de6fbc1f5da4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 方法三:从CSV文件加载数据创建DataFrame对象\n",
|
||
"df3 = pd.read_csv(\n",
|
||
" 'res/2023年北京积分落户数据.csv',\n",
|
||
" # encoding='utf-8', # 指定字符编码\n",
|
||
" # sep='', # 指定字段的分隔符(默认逗号)\n",
|
||
" # delimiter='#',\n",
|
||
" # header=0, # 表头所在的行\n",
|
||
" # quotechar='\"', # 包裹字符串的字符(默认双引号)\n",
|
||
" # index_col='公示编号', # 索引列\n",
|
||
" # usecols=['公示编号', '姓名', '积分分值'], # 指定加载的列\n",
|
||
" # nrows=10, # 加载的行数\n",
|
||
" # skiprows=np.arange(1, 101), # 跳过哪些行\n",
|
||
" # true_values=['是', 'Yes', 'YES'], # 哪些值会被视为布尔值True\n",
|
||
" # false_values=['否', 'No', 'NO'], # 哪些值会被视为布尔值False\n",
|
||
" # na_values=['---', 'N/A'], # 哪些值会被视为空值\n",
|
||
" # iterator=True, # 开启迭代器模式\n",
|
||
" # chunksize=1000, # 每次加载的数据体量\n",
|
||
")\n",
|
||
"df3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "67b86b13-566b-4f97-86fd-3723ef21a87f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df4 = pd.read_csv('res/big_data_file.csv.gz', low_memory=False)\n",
|
||
"df4"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e52ff38d-8e40-4532-8df9-4d2807a3e2ec",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df4.info(memory_usage='deep')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "65e871ff-e87c-4e6b-86cc-624af7ccbdc1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# %pip install pyarrow"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "48fab84a-8b86-4405-966a-6bfb99582de5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df5 = pd.read_csv('res/big_data_file.csv.gz', engine='pyarrow')\n",
|
||
"df5"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ea575de9-4398-46fe-b2f8-8fb37b93179b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df5.info(memory_usage='deep')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5723cf03-b78f-4fc9-943c-f9b10036affa",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# %pip install xlrd xlwt"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "cb3387b9-3402-4b25-a5d5-ff9690a1ac06",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 方法四:从Excel文件加载数据创建DataFrame对象\n",
|
||
"df6 = pd.read_excel(\n",
|
||
" 'res/2020年销售数据.xlsx',\n",
|
||
" sheet_name='data',\n",
|
||
")\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d06abbd8-9a34-4ab3-a75c-76e3ed8eb36c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# %pip install -U pymysql cryptography sqlalchemy"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5aa0e35f-2a13-4c8e-a9fd-87b0bf72307e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 方法五:从数据服务器加载数据创建DataFrame对象\n",
|
||
"from sqlalchemy import create_engine\n",
|
||
"\n",
|
||
"# URL \n",
|
||
"engine = create_engine('mysql+pymysql://guest:Guest.618@47.109.26.237:3306/hrs')\n",
|
||
"engine"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "4b344f17-f5a1-4d7d-ad3c-ede4b122609c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"dept_df = pd.read_sql('tb_dept', engine, index_col='dno')\n",
|
||
"dept_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c5d1ffa3-6962-4c26-ae92-a8d7bc7da0cb",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"emp_df1 = pd.read_sql('tb_emp', engine, index_col='eno')\n",
|
||
"emp_df1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f84b6886-09d8-4f13-89cc-487574991dba",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"emp_df2 = pd.read_sql('tb_emp2', engine, index_col='eno')\n",
|
||
"emp_df2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c60e96d2-9a0d-4901-b39c-c31760de47a0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 注意:engine.connect().close() 只是从连接池取出一个连接并立即归还,并不会释放引擎持有的资源;要关闭连接池中的全部连接应调用 engine.dispose()\n",
|
||
"engine.connect().close()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "12086a7a-c161-4753-9a8e-180f9e8b2edf",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 查看信息"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "785e58f9-b3f7-49a6-affc-8caaa66cebf1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "fd8a9156-3939-430d-9738-60b3d8a95563",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 获取前N行\n",
|
||
"df6.head(3)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b75ace23-9b92-4425-b58f-bcd81e8d72e7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 获取后N行\n",
|
||
"df6.tail(5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c2b2a909-0b40-473c-bb3f-85aca1925a19",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 操作行、列、单元格"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "fe964b3b-7f51-4202-b528-f5102d9be9f0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 访问列\n",
|
||
"df6['销售日期']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b2e5ccb3-4b97-4a02-8316-b1321390f286",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.销售渠道"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "80ad78dc-4f47-4421-8478-ba7797350db4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6['销售渠道']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "7b970671-6f16-4e07-8666-715495de2832",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"type(df6['销售日期'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "2c9cb56b-6a2b-479e-8c57-c61683858387",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6[['销售渠道']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "75730cd3-0459-4a62-97ee-e037256cc98a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"type(df6[['销售渠道']])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9e097e49-b762-4c9f-9d93-98abb1701d97",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 访问多个列 - 花式索引\n",
|
||
"df6[['销售日期', '销售区域', '直接成本']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "cf31a169-549e-4182-8206-789f97316115",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.columns[3:7]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "792713c0-13bc-4810-86cc-5f6f6ce78719",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6[df6.columns[3:7]]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "02d43b17-15e3-44d5-844b-a50d365bf863",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 访问行 - loc属性\n",
|
||
"df6.loc[1944]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "79da6932-f985-44dc-9f4b-e051e4749c65",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.iloc[-1]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6246b39b-7229-4e0f-af7b-0915e707492a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 访问多行 - 花式索引\n",
|
||
"df6.loc[[0, 100, 58, 1000, 1000, 1000, 1099]]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "77321324-0ca9-4c2e-a792-3c717189cb27",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 访问多行 - 切片索引\n",
|
||
"df6.loc[101:200]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5eb250eb-18e0-4181-a37a-dec55c633116",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# df6[101:200]\n",
|
||
"df6.iloc[101:200]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f2daddd7-3635-40b1-9416-c1137315948c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.iloc[-1:-101:-1]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9321811f-e62b-4db5-a478-cdc0934f097b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 访问单元格\n",
|
||
"df6.at[2, '售价']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "bd1670bc-0a13-457f-95f1-352a4d61b3a7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.at[2, '售价'] = 999\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "7460ef03-3f45-4cc0-99a3-85039c2606b0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.iat[2, -3] = 888\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "34c81da6-f58f-4c36-8596-004266e9374b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 添加列\n",
|
||
"df6['销售额'] = df6['售价'] * df6['销售数量']\n",
|
||
"df6['季度'] = df6['销售日期'].dt.quarter\n",
|
||
"df6['月份'] = df6['销售日期'].dt.month\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c3c60210-202d-4bd8-8804-1d657746b29c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 添加行 - 实际工作中基本没有意义"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6bf78f3d-05a2-4c7a-a0f0-fb6659f1bd6f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 删除列\n",
|
||
"# inplace=False - 默认设定 - 不修改原对象返回修改后的新对象\n",
|
||
"# inplace=True - 直接修改DataFrame对象不返回新对象 - 方法没有返回值\n",
|
||
"df6.drop(columns=['季度'], inplace=True)\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "cdf8cf10-5193-4c38-8fef-bc3d38a8a0a8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 删除行\n",
|
||
"# df6.drop(index=[0, 1, 2, 100, 1944, 1943])\n",
|
||
"df6.drop(index=[0, 1, 2, 100, 1944, 1943], inplace=True)\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1ddfe77d-aa92-4d6a-b2db-8469b1222ed3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.drop(index=df6.index[100:200], inplace=True)\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "8020bbb0-740e-496a-9224-fe3495a19c92",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 重命名\n",
|
||
"df6.rename(columns={'销售区域': '区域', '销售渠道': '渠道', '销售订单': '订单号'}, inplace=True)\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d028d2be-0944-4b70-a3ea-f7d06cdd458f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 重置索引\n",
|
||
"# drop=False - 默认值 - 原来的索引变成一个普通列\n",
|
||
"# drop=True - 原来的索引直接丢弃\n",
|
||
"df6.reset_index(drop=True, inplace=True)\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "cb55a518-f4bd-4fac-8554-4353c0798bc6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 设置索引\n",
|
||
"df6.set_index('订单号', inplace=True)\n",
|
||
"df6"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "101bd804-5a90-4cd3-a545-613df6d9b8e5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 筛选数据 - 布尔索引\n",
|
||
"df6[df6['销售额'] > 100000]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "64c83a43-fcb0-4ba1-9400-ae4a5b21715c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6[(df6['销售额'] > 100000) & (df6['月份'] == 6)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "22c01e56-b188-40f7-9e53-3a3d2f0bcb29",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6[(df6['销售额'] > 100000) | (df6['月份'] == 6)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5adb86b9-8b31-49cb-9292-94189f3714c5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.query('销售额 > 100000')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b768afa0-7066-4a1d-8f10-b88386587388",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.query('月份 == 6 and 渠道 == \"实体\"')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "2e57b21c-0565-4352-8924-de169497bce0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.query('销售额 > 100000 and 月份 == 6')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "7ef8ba56-5293-41b0-8208-85a0eed735e8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 随机抽样\n",
|
||
"df6.sample(n=100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "bfcd52d7-eac4-4776-b0e3-a37e67e349f3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df6.sample(frac=0.05)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1c654ca8-3179-4fa2-9213-7d7029357342",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# replace=False - 无放回抽样\n",
|
||
"ignore_rows = np.random.choice(np.arange(1, 1946), size=int(1945 * 0.9), replace=False)\n",
|
||
"pd.read_excel(\n",
|
||
" 'res/2020年销售数据.xlsx',\n",
|
||
" sheet_name='data',\n",
|
||
" skiprows=ignore_rows\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "2037ed6a-d616-4c67-9f5d-ea517d6e1c6b",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 数据重塑\n",
|
||
"\n",
|
||
"1. 拼接(合并结构一致的数据)\n",
|
||
"2. 合并(事实表连接维度表)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d2184fd4-bd44-459f-bda4-6dc11c09c219",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 拼接两个DataFrame - union\n",
|
||
"all_emp_df = pd.concat([emp_df1, emp_df2])\n",
|
||
"all_emp_df.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "05bc65a1-42ac-463c-a089-08fb8dc60855",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 连表 - 连接事实表和维度表 - 用维度把数据分组然后再做聚合\n",
|
||
"# 连接两个DataFrame(内连接、左外连接、右外连接、全外连接)- join\n",
|
||
"# how - 连表方式 - inner、left、right、outer\n",
|
||
"# on - 基于哪个字段连表 - left_on、right_on\n",
|
||
"all_emp_df = pd.merge(all_emp_df, dept_df, how='inner', on='dno')\n",
|
||
"all_emp_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c6a3d52d-a04c-494d-9ee9-2dad9805b1c1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 作业:在jobs目录下有若干个CSV文件,它们的数据结构是一样的,现在需要把所有CSV文件的数据拼接到一个DataFrame中\n",
|
||
"import os\n",
|
||
"\n",
|
||
"dfs = [pd.read_csv(os.path.join('res/jobs', filename))\n",
|
||
" for filename in os.listdir('res/jobs') \n",
|
||
" if filename.endswith('.csv')]\n",
|
||
"pd.concat(dfs, ignore_index=True).to_csv('res/all_jobs.csv', index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "6b9ad1e1-fe5d-45a0-8755-ac6720a32ba0",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 数据清洗\n",
|
||
"\n",
|
||
"1. 缺失值\n",
|
||
"2. 重复值\n",
|
||
"3. 异常值\n",
|
||
"4. 预处理"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "45c835c4-559f-45f1-a501-70a8c12bbbb1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 甄别缺失值\n",
|
||
"all_emp_df.isna()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "fd7fbdf8-ebf2-463b-ac3b-cdb24560873a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# all_emp_df['comm'].isna()\n",
|
||
"all_emp_df['comm'].isnull()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "a4f16d30-83e9-4761-92a1-780e85e721e1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# all_emp_df['comm'].notna()\n",
|
||
"all_emp_df['comm'].notnull()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9f2a153d-ab4a-475e-9ee3-0d623a289f7f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"all_emp_df['comm'].notna().value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5d388d57-fa1a-405b-880e-9316354a6f05",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 删除空值 - 删除带有空值的行\n",
|
||
"all_emp_df.dropna()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b40fa037-3fab-454e-a300-2e9dcf4b2b60",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"all_emp_df.dropna(axis=1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "67ae21a1-7dc1-496b-85b5-013d79d25a63",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"all_emp_df.mgr.dropna()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "66745379-9db7-42b0-ab6b-a55e870a515b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 填充空值\n",
|
||
"all_emp_df.fillna(0)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e1664115-30e0-4946-ae4b-c919bb319ddc",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"all_emp_df.comm.fillna(0).astype('i8')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e1743531-66a2-42a4-8c28-ad268efc848c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 将空值下方的非空值向上填充 - backward fill\n",
|
||
"all_emp_df.comm.bfill()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5fcef0a0-ff29-42bd-9955-5a97595390fd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 将空值上方的非空值向下填充 - forward fill\n",
|
||
"all_emp_df.comm.ffill()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "eeeb9be3-802c-44e3-80a0-465aba1a485a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 通过插值算法填充空值 - interpolate\n",
|
||
"all_emp_df['comm'] = all_emp_df.comm.interpolate(method='linear')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f1f094c3-1cc2-4826-a04a-24150ea9cef8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"all_emp_df['comm'] = all_emp_df.comm.astype('i8')\n",
|
||
"all_emp_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "a739242d-ebd2-42d2-9ec7-9a5939cbf74a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"all_emp_df['mgr'] = all_emp_df.mgr.fillna(-1).astype('i8')\n",
|
||
"all_emp_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "cd376d13-2245-48b8-ba14-3315d4c48f9c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 甄别重复值\n",
|
||
"all_emp_df.ename.duplicated()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6e107a38-c5e8-4e5e-9e42-71481c54e0d1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"all_emp_df.duplicated(['ename', 'job'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "097eaaf2-1112-4e0f-b361-786bf91d6c1f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 统计每个元素出现的频次\n",
|
||
"all_emp_df.ename.value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6494bb56-7ac7-47df-a9f1-960b02586e31",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"all_emp_df.job.value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "172e4d9a-63bd-44ca-98ea-e4614c8823ab",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 统计不重复的元素的个数\n",
|
||
"all_emp_df.ename.nunique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d6fa062c-d338-407f-8647-e84878a5642e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 删除重复值\n",
|
||
"# keep='first' - 默认值,重复元素保留第一项 - 'last' / False\n",
|
||
"all_emp_df.drop_duplicates(['ename', 'job'], keep='last', inplace=True)\n",
|
||
"all_emp_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "832a2ea2-6941-4364-b143-af7db9ff9701",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 异常值的甄别\n",
|
||
"# 数值判定法(data < Q1 - 1.5 * IQR 或者 data > Q3 + 1.5 * IQR)\n",
|
||
"\n",
|
||
"\n",
|
||
"def find_outliers_by_iqr(data, whis=1.5):\n",
|
||
" q1, q3 = np.quantile(data, [0.25, 0.75])\n",
|
||
" iqr = q3 - q1\n",
|
||
" return data[(data < q1 - whis * iqr) | (data > q3 + whis * iqr)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1cd5d6aa-c60e-483e-995c-a627a0dfec15",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"temp = np.random.normal(80, 8, 50).round(0)\n",
|
||
"temp = np.append(temp, [120, 160, 200, 40, 20, -50])\n",
|
||
"temp"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "2121dab4-0efc-4fcd-a5fe-67585552cb53",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"find_outliers_by_iqr(temp)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "da048825-3f88-4009-9db5-159e8e883b10",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"find_outliers_by_iqr(temp, whis=3)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "0da7034b-2350-43ff-a6eb-9e7f4361bdee",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# zscore判定法(三西格玛法则 ---> 68-95-99.7法则)\n",
|
||
"\n",
|
||
"\n",
|
||
"def find_outliers_by_zscore(data, mul=3):\n",
|
||
" mu, sigma = np.mean(data), np.std(data)\n",
|
||
" zscore = (data - mu) / sigma\n",
|
||
" return data[np.abs(zscore) > mul]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e88616c0-a4d8-4fd8-9ec2-e761cb5ba056",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"find_outliers_by_zscore(temp)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c902031c-2f78-4721-9734-5c5b0ca81650",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"find_outliers_by_zscore(temp, mul=2)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1e295014-d582-4e78-b5b9-6d9f0463ff8d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"find_outliers_by_zscore(df6.直接成本)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "97b98c82-fd09-42a9-8a75-a3e71ae10fbc",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 根据离群点的行索引删除行\n",
|
||
"df6.drop(index=find_outliers_by_zscore(df6.直接成本).index)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "0053ed12-c09f-4331-a6dd-487ff990c680",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"med_value = np.median(temp)\n",
|
||
"med_value"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f02c2985-1b07-4b1c-b248-aa1de9e98451",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"find_outliers_by_zscore(temp, mul=2)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "485adc15-f39d-419b-9869-2b366f5d88ec",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"np.in1d(temp, find_outliers_by_zscore(temp, mul=2))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ce92f242-1f0f-476e-ae85-91e1615783ef",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 替换离群点\n",
|
||
"np.place(temp, np.in1d(temp, find_outliers_by_zscore(temp, mul=2)), med_value)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "10b0b0bc-f98c-40fe-890f-976df9d9c52b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"temp"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "d970e838-42f2-44d0-8f2d-07ebbf6de2b0",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### 案例1:招聘数据清洗和预处理\n",
|
||
"\n",
|
||
"1. 数据加载\n",
|
||
"2. 去重\n",
|
||
"3. 数据抽取\n",
|
||
"4. 拆分列\n",
|
||
"5. 替换值\n",
|
||
"6. 数据筛选"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1ec417a9-457f-434e-96a6-f4fd35d75987",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"jobs_df = pd.read_csv('res/all_jobs.csv')\n",
|
||
"jobs_df.head(10)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "74e0e4a5-3c03-4617-9661-8cfa03b88fd7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 根据URI列去重\n",
|
||
"jobs_df.drop_duplicates('uri', inplace=True)\n",
|
||
"jobs_df.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6cca7b8b-25f1-46b8-9946-34ba90f42116",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 通过正则表达式从列中提取信息\n",
|
||
"jobs_df[['salary_lower', 'salary_upper']] = jobs_df.salary.str.extract(r'(\\d+)-(\\d+)').astype('i8')\n",
|
||
"jobs_df['salary'] = (jobs_df.salary_lower + jobs_df.salary_upper) / 2\n",
|
||
"jobs_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ffaea2af-09f6-4577-9c0d-024966d6854f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"jobs_df.drop(columns=['uri', 'city'], inplace=True)\n",
|
||
"jobs_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d9ba5998-ca1d-44c8-87ca-363356074dd5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 拆分列\n",
|
||
"jobs_df['city'] = jobs_df.site.str.split(expand=True)[0]\n",
|
||
"jobs_df.drop(columns='site', inplace=True)\n",
|
||
"jobs_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "933e9006-4f5e-4238-b6d9-940dfeb6caf1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 字符串正则表达式替换\n",
|
||
"jobs_df['year'] = jobs_df.year.replace(r'5-10年|10年以上', '5年以上', regex=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d10a9c1c-a9d5-49e1-8fdf-a68b5bb3d59a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"jobs_df.year.unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d248e233-bac5-48d5-8a69-a1f04350867a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"jobs_df['edu'] = jobs_df.edu.replace(r'中专|高中', '学历不限', regex=True)\n",
|
||
"jobs_df['edu'] = jobs_df.edu.replace(r'硕士|博士', '研究生', regex=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "eec6fbd5-2355-4674-9e5d-7f47a5a808a2",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"jobs_df.edu.unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "352b1921-aa2b-4016-af3e-02032b2a3935",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"jobs_df['job_name'] = jobs_df.job_name.str.lower()\n",
|
||
"jobs_df = jobs_df[jobs_df.job_name.str.contains('python|数据|产品|运营|data', regex=True)]\n",
|
||
"jobs_df.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "df370013-1278-48d2-9891-8647df3c5e15",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"jobs_df.to_csv('res/cleand_jobs.csv', index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8ee07676-737c-420e-b11a-235ff7f2c4c8",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### 案例2:北京积分落户数据预处理\n",
|
||
"\n",
|
||
"1. 加载数据\n",
|
||
"2. 日期时间处理\n",
|
||
"3. 年龄段分箱\n",
|
||
"4. 落户积分归一化与标准化"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1232d023-7591-47b3-b67b-4920642dd28d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"settle_df = pd.read_csv('res/2023年北京积分落户数据.csv', index_col='公示编号')\n",
|
||
"settle_df.head(5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "734eb268-3ad7-4e67-9661-08328075992b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"settle_df.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "63698465-ddcd-430c-bd96-e78abaaebda3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 将字符串处理成日期\n",
|
||
"settle_df['出生年月'] = pd.to_datetime(settle_df['出生年月'])\n",
|
||
"settle_df.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "989c56c7-85fa-4180-9b86-5247a41cdbab",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 将生日换算成年龄(以2023-01-01为基准,按每年365天近似,未考虑闰年)\n",
|
||
"settle_df['年龄'] = (pd.to_datetime('2023-01-01') - settle_df.出生年月).dt.days // 365\n",
|
||
"settle_df.head(5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "4191c7a2-19fd-4347-ac79-2371c8e59c10",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 将年龄划分到年龄段 - 分箱 - 数据桶(区间左闭右开,35~59岁之外的年龄会得到NaN)\n",
|
||
"settle_df['年龄段'] = pd.cut(\n",
|
||
" settle_df.年龄,\n",
|
||
" bins=np.arange(35, 61, 5),\n",
|
||
" labels=['35~39岁', '40~44岁', '45~49岁', '50~54岁', '55~59岁'],\n",
|
||
" right=False\n",
|
||
")\n",
|
||
"settle_df.head(5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ea2e0c9b-0aa0-41d3-a52a-6926b797465c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 统计每个元素出现的频次\n",
|
||
"temp = settle_df.年龄段.value_counts()\n",
|
||
"temp"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "30843274-b940-4527-92ed-97db86bb4ec7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"plt.cm.Greens(np.linspace(0.9, 0.1, 5))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "375dd407-9d0a-4788-a38e-3a37efbb6d3b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 绘制柱状图\n",
|
||
"temp.plot(\n",
|
||
" kind='bar', # 图表类型\n",
|
||
" figsize=(8, 4), # 图表尺寸\n",
|
||
" xlabel='', # 横轴标签\n",
|
||
" ylabel='Count', # 纵轴标签\n",
|
||
" width=0.5, # 柱子宽度\n",
|
||
" hatch='//', # 柱子条纹\n",
|
||
" color=plt.cm.Greens(np.linspace(0.9, 0.3, temp.size)) # 颜色值\n",
|
||
")\n",
|
||
"\n",
|
||
"for i in range(temp.size):\n",
|
||
" # plt.text(横坐标, 纵坐标, 标签内容)\n",
|
||
" plt.text(i, temp.iloc[i] + 30, temp.iloc[i], ha='center')\n",
|
||
"\n",
|
||
"# 定制横轴的刻度\n",
|
||
"plt.xticks(rotation=0)\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e020ba6c-d16d-482f-ad3b-a9e855257b91",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 绘制饼图\n",
|
||
"temp.plot(\n",
|
||
" kind='pie',\n",
|
||
" ylabel='',\n",
|
||
" autopct='%.1f%%', # 自动计算并显示百分比\n",
|
||
" wedgeprops={'width': 0.3}, # 环状结构部分的宽度\n",
|
||
" pctdistance=0.85, # 百分比到圆心的距离\n",
|
||
" labeldistance=1.1, # 标签到圆心的距离\n",
|
||
" # shadow=True, # 阴影效果\n",
|
||
" # startangle=0, # 起始角度\n",
|
||
" counterclock=True, # 是否反时针方向绘制\n",
|
||
")\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e846eec2-6c95-409c-8b15-2b14cab3f57c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# agg - aggregate - 聚合\n",
|
||
"settle_df.积分分值.agg(['mean', 'max', 'min', 'std', 'skew', 'kurt'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "b1669102-1c03-4751-813c-b241a05718e3",
|
||
"metadata": {},
|
||
"source": [
|
||
"线性归一化:\n",
|
||
"$$\n",
|
||
"x^{\\prime} = \\frac{x - x_{min}}{x_{max} - x_{min}}\n",
|
||
"$$"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e8d9dca7-b976-43ab-96b8-abefca66cc53",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 将积分分值处理成0~1范围的值\n",
|
||
"max_score, min_score = settle_df.积分分值.agg(['max', 'min'])\n",
|
||
"max_score, min_score"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "10acd550-8422-4934-b38f-03554f86d305",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# map - 映射 - 将指定的函数(或字典、Series给出的对应关系)作用到数据系列的每个元素上\n",
|
||
"# apply - 应用 - 将指定的函数应用到数据系列的每个元素上,可通过args和关键字参数向函数传递额外参数\n",
|
||
"settle_df['线性归一化积分'] = settle_df.积分分值.map(lambda x: (x - min_score) / (max_score - min_score)).round(2)\n",
|
||
"settle_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "55e57b00-cb9e-4c9e-bc59-e99b738e2f5d",
|
||
"metadata": {},
|
||
"source": [
|
||
"zscore标准化(其中$\\mu$为均值,$\\sigma$为标准差):\n",
|
||
"$$\n",
|
||
"x^{\\prime} = \\frac{x - \\mu}{\\sigma}\n",
|
||
"$$"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b5fc6260-5337-4161-99f0-d7be43d59361",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"mu, sigma = settle_df.积分分值.agg(['mean', 'std'])\n",
|
||
"settle_df['zscore评分'] = settle_df.积分分值.apply(lambda x: (x - mu) / sigma)\n",
|
||
"settle_df"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|