Python-100-Days/Day66-80/code/day05.ipynb

674 lines
18 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "97982f4b-b3d0-47fe-b6e8-a1a327d03d93",
"metadata": {},
"source": [
"## 深入浅出pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "834d7d19-4015-4048-a1a6-2b8c756f0115",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"plt.rcParams['font.sans-serif'].insert(0, 'SimHei')\n",
"plt.rcParams['axes.unicode_minus'] = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56becc00-f1c7-4f81-86d1-7f7b23fd65d4",
"metadata": {},
"outputs": [],
"source": [
"%config InlineBackend.figure_format = 'svg'"
]
},
{
"cell_type": "markdown",
"id": "c81bfe94-906c-4e13-b321-0e6c397aea46",
"metadata": {},
"source": [
"### 数据透视\n",
"\n",
"1. 数据聚合(指标统计)\n",
"2. 排序和头部值\n",
"3. 透视表和交叉表"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7836ce8b-1e1d-40e5-b01c-c228ebf60928",
"metadata": {},
"outputs": [],
"source": [
"sales_df = pd.read_excel('res/2020年销售数据.xlsx', sheet_name='data')\n",
"sales_df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca3b2020-9470-4e49-a3ca-066dc9a437b3",
"metadata": {},
"outputs": [],
"source": [
"sales_df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "224ae45b-172a-48bc-8df0-7d6c7099529a",
"metadata": {},
"outputs": [],
"source": [
"# 添加销售额、毛利润、月份列\n",
"sales_df['销售额'] = sales_df.售价 * sales_df.销售数量\n",
"sales_df['毛利润'] = sales_df.销售额 - sales_df.直接成本\n",
"sales_df['月份'] = sales_df.销售日期.dt.month\n",
"sales_df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8878bfd-64de-4a6a-9ae3-36e9af40d45e",
"metadata": {},
"outputs": [],
"source": [
"def make_tag(price):\n",
" if price < 300:\n",
" return '低端'\n",
" elif price < 800:\n",
" return '中端'\n",
" return '高端'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4cba4262-b952-4fce-9b07-fed9c5f2a2e3",
"metadata": {},
"outputs": [],
"source": [
"# 根据商品的价格添加价位标签\n",
"sales_df['价位'] = sales_df.售价.apply(make_tag)\n",
"sales_df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2a2b472-55fc-4d35-8530-445e8ffbf800",
"metadata": {},
"outputs": [],
"source": [
"# 统计北极星指标\n",
"GMV, profit, quantity = sales_df[['销售额', '毛利润', '销售数量']].sum()\n",
"print(f'销售额: {GMV}元')\n",
"print(f'毛利润: {profit}元')\n",
"print(f'销售数量: {quantity}件')\n",
"print(f'毛利率: {profit / GMV:.2%}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b6ac225-6834-462f-a5b7-8be8297b66d0",
"metadata": {},
"outputs": [],
"source": [
"# 统计每个月的销售额和毛利润\n",
"temp1 = sales_df.groupby('月份')[['销售额', '毛利润']].agg('sum')\n",
"temp1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8f179a1-c270-458d-be84-b862ca2c143d",
"metadata": {},
"outputs": [],
"source": [
"# 使用透视表统计每个月的销售额和毛利润\n",
"pd.pivot_table(\n",
" sales_df,\n",
" index='月份',\n",
" values=['销售额', '毛利润'],\n",
" aggfunc='sum'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a3cb912-c47f-4d47-8bd4-ffc9eb171cba",
"metadata": {},
"outputs": [],
"source": [
"# 绘制折线图\n",
"temp1.plot(\n",
" kind='line',\n",
" figsize=(10, 5),\n",
" y=['销售额', '毛利润'], # 放到纵轴上的数据\n",
" xlabel='', # 横轴的标签\n",
" ylabel='销售额和毛利润', # 纵轴的标签\n",
" marker='^', # 标记点符号\n",
")\n",
"# plt.fill_between(np.arange(1, 13), temp1.销售额, where=temp1.销售额 >= 3e6, facecolor='red', alpha=0.25)\n",
"# plt.fill_between(np.arange(1, 13), temp1.销售额, where=temp1.销售额 < 3e6, facecolor='green', alpha=0.25)\n",
"# 定制纵轴的取值范围\n",
"plt.ylim(0, 6e6)\n",
"# 定制横轴的刻度\n",
"plt.xticks(np.arange(1, 13), labels=[f'{x}月' for x in range(1, 13)])\n",
"# 定制标题\n",
"plt.title('2020年月度销售额和毛利润', fontdict={'fontsize': 22, 'color': 'navy'})\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "389db27d-2b11-47ea-a8c6-660fcc39b863",
"metadata": {},
"outputs": [],
"source": [
"plt.cm.RdYlBu_r"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "621a2487-2d15-450d-9f69-df8089616bd9",
"metadata": {},
"outputs": [],
"source": [
"# 计算月环比\n",
"temp1['销售额月环比'] = temp1.销售额.pct_change()\n",
"temp1['毛利润月环比'] = temp1.毛利润.pct_change()\n",
"# 索引重排序\n",
"temp1 = temp1.reindex(columns=['销售额', '销售额月环比', '毛利润', '毛利润月环比'])\n",
"# 渲染输出\n",
"temp1.style.format(\n",
" formatter={\n",
" '销售额月环比': '{:.2%}',\n",
" '毛利润月环比': '{:.2%}'\n",
" },\n",
" na_rep='-------'\n",
").background_gradient(\n",
" 'RdYlBu_r',\n",
" subset=['销售额月环比', '毛利润月环比']\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b158aea-fe6e-4938-a4f8-4b880c6f23d0",
"metadata": {},
"outputs": [],
"source": [
"# 绘制横线图\n",
"mu = temp1.销售额.mean()\n",
"temp1['diff'] = temp1.销售额 - mu\n",
"temp1['colors'] = temp1.销售额.map(lambda x: 'green' if x > mu else 'red')\n",
"\n",
"plt.figure(figsize=(8, 6), dpi=200)\n",
"plt.hlines(y=temp1.index, xmin=0, xmax=temp1['diff'], color=temp1.colors, alpha=0.6, linewidth=6)\n",
"plt.yticks(np.arange(1, 13), labels=[f'{x}月' for x in np.arange(1, 13)])\n",
"# 定制网格线\n",
"plt.grid(linestyle='--', linewidth=0.4, alpha=0.5)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "810936a1-f6c5-43e9-9eda-b9fe8014ba9f",
"metadata": {},
"outputs": [],
"source": [
"# 各品牌对销售额贡献占比\n",
"temp2 = sales_df.groupby('品牌')['销售额'].sum()\n",
"temp2.plot(\n",
" kind='pie',\n",
" ylabel='',\n",
" autopct='%.2f%%', # 自动计算并显示百分比\n",
" pctdistance=0.82, # 百分比标签到圆心的距离\n",
" wedgeprops=dict(width=0.35, edgecolor='w'), # 定制环状饼图\n",
" explode=[0.1, 0, 0, 0, 0], # 分离饼图\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1465d545-14e8-435d-9b12-651df3bab9ea",
"metadata": {},
"outputs": [],
"source": [
"# 各销售区域每个月的销售额\n",
"temp3 = sales_df.groupby(['销售区域', '月份'], as_index=False)[['销售额']].sum()\n",
"# pivot - 将行旋转到列上(窄表 ----> 宽表)\n",
"# melt - 将列旋转到行上(宽表 ----> 窄表)\n",
"temp3.pivot(index='销售区域', columns='月份', values='销售额').fillna(0).astype('i8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1adf23bb-7120-416c-846e-1098aac2e1f4",
"metadata": {},
"outputs": [],
"source": [
"# 创建透视表\n",
"pd.pivot_table(\n",
" sales_df,\n",
" index='销售区域',\n",
" columns='月份',\n",
" values='销售额',\n",
" aggfunc='sum',\n",
" fill_value=0,\n",
" margins=True,\n",
" margins_name='总计'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fdf54c9a-64c3-453d-bc5b-533fce7b25ed",
"metadata": {},
"outputs": [],
"source": [
"# 将价位字段处理成category类型并指定排序的顺序\n",
"sales_df['价位'] = sales_df.价位.astype('category').cat.reorder_categories(['高端', '中端', '低端'])\n",
"sales_df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e706575-3260-4b37-99fe-696db4f6fe7b",
"metadata": {},
"outputs": [],
"source": [
"# 统计每个月各种价位产品的销量\n",
"temp4 = sales_df.pivot_table(\n",
" index='价位',\n",
" columns='月份',\n",
" values='销售数量',\n",
" observed=False,\n",
" fill_value=0,\n",
" aggfunc='sum'\n",
")\n",
"temp4"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12527025-64f5-4a0a-9bf5-6bdbc4b6f93b",
"metadata": {},
"outputs": [],
"source": [
"# 交叉表\n",
"pd.crosstab(\n",
" index=sales_df.价位,\n",
" columns=sales_df.月份,\n",
" values=sales_df.销售数量,\n",
" aggfunc='sum'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d99ecd49-2669-493c-ae04-974afa71dc4d",
"metadata": {},
"outputs": [],
"source": [
"blood_types = np.array(['B', 'A', 'O', 'O', 'AB', 'B', 'O', 'B', 'AB', 'A', 'A', 'O', 'B', 'O', 'O', 'O', 'O', 'A', 'B', 'B'])\n",
"personality_types = np.array(['𝛃', '𝛂', '𝛂', '𝛂', '𝛃', '𝛂', '𝛄', '𝛄', '𝛂', '𝛄', '𝛃', '𝛂', '𝛂', '𝛂', '𝛄', '𝛄', '𝛂', '𝛂', '𝛂', '𝛂'])\n",
"\n",
"# 创建交叉表\n",
"pd.crosstab(\n",
" index=blood_types,\n",
" columns=personality_types,\n",
" rownames=['血型'],\n",
" colnames=['人格'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3cba2e9e-8335-4081-a253-5ab6d08d8559",
"metadata": {},
"outputs": [],
"source": [
"# 绘制堆叠柱状图\n",
"temp4.T.plot(\n",
" figsize=(10, 4),\n",
" kind='bar',\n",
" width=0.6,\n",
" xlabel='',\n",
" ylabel='销售数量',\n",
" stacked=True\n",
")\n",
"plt.xticks(rotation=0)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9eba1698-d8a5-4b66-b617-fc68072a670e",
"metadata": {},
"outputs": [],
"source": [
"# 让每一项数据除以对应月份的销售数量之和\n",
"temp5 = temp4.T.divide(temp4.sum(), axis=0)\n",
"temp5"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ce2e80e-0af9-4162-a46d-b5d357a9362d",
"metadata": {},
"outputs": [],
"source": [
"# 绘制百分比堆叠柱状图\n",
"temp5.plot(\n",
" figsize=(10, 4),\n",
" kind='bar',\n",
" width=0.6,\n",
" xlabel='',\n",
" ylabel='销量占比',\n",
" stacked=True\n",
")\n",
"plt.xticks(rotation=0)\n",
"plt.yticks(np.linspace(0, 1, 6), labels=[f'{x:.0%}' for x in np.linspace(0, 1, 6)])\n",
"plt.legend(loc='lower center')\n",
"\n",
"for i in temp5.index:\n",
" y1, y2, y3 = temp5.loc[i]\n",
" plt.text(i - 1, y2 / 2 + y1, f'{y2:.2%}', ha='center', va='center', fontdict={'size': 8})\n",
" plt.text(i - 1, y3 / 2 + y2 + y1, f'{y3:.2%}', ha='center', va='center', fontdict={'size': 8})\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "82ac6b37-302e-4913-a973-4a7f4e6daf1d",
"metadata": {},
"source": [
"### 作业:招聘岗位数据分析\n",
"\n",
"1. 统计出城市、招聘信息、招聘岗位的数量和平均月薪。\n",
"2. 统计每个城市的岗位数量从高到低排序。\n",
"3. 统计每个城市的平均薪资从高到低排序。\n",
"4. 统计招聘岗位对学历要求的占比。\n",
"5. 统计招聘岗位对工作年限的要求占比。\n",
"6. 分析薪资跟学历和工作年限的关系。"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91e8b171-7568-4da1-8170-5d13fc8945bc",
"metadata": {},
"outputs": [],
"source": [
"jobs_df = pd.read_csv('res/cleaned_jobs.csv')\n",
"jobs_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cadf547-f7aa-494b-beeb-678f5a09e967",
"metadata": {},
"outputs": [],
"source": [
"# 统计北极星指标\n",
"city_count = jobs_df['city'].nunique()\n",
"info_count = jobs_df['company_name'].count()\n",
"post_count = jobs_df['pos_count'].sum()\n",
"salary_avg = jobs_df['salary'].mean().round(1)\n",
"print(f'城市数量: {city_count}')\n",
"print(f'信息数量: {info_count}')\n",
"print(f'岗位数量: {post_count}')\n",
"print(f'平均薪资: {salary_avg}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb9fc057-df56-4f01-a08e-ab3427dca467",
"metadata": {},
"outputs": [],
"source": [
"# 统计每个城市的岗位数量从高到低排序\n",
"jobs_df.groupby('city')[['pos_count']].sum().sort_values(by='pos_count', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "430af08e-2ce2-4f4f-9d64-df2d09c7c9f3",
"metadata": {},
"outputs": [],
"source": [
"pd.pivot_table(\n",
" jobs_df,\n",
" index='city',\n",
" values='pos_count',\n",
" aggfunc='sum'\n",
").sort_values(by='pos_count', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bcd4aeca-bb90-42e6-b45c-6cdea2c76271",
"metadata": {},
"outputs": [],
"source": [
"jobs_df.groupby('city')[['salary']].mean().round(1).sort_values(by='salary', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e93f5c8-da3e-4064-ac77-748eb3167504",
"metadata": {},
"outputs": [],
"source": [
"# 统计每个城市的平均薪资从高到低排序\n",
"pd.pivot_table(\n",
" jobs_df,\n",
" index='city',\n",
" values='salary',\n",
" aggfunc='mean'\n",
").round(1).sort_values(by='salary', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ae067bf-2981-49f2-a224-28a5223dc21f",
"metadata": {},
"outputs": [],
"source": [
"jobs_df['edu'] = jobs_df.edu.astype('category').cat.reorder_categories(['学历不限', '大专', '本科', '研究生'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a87fab8f-18af-4c0b-9546-63d79778b4d8",
"metadata": {},
"outputs": [],
"source": [
"# 统计招聘岗位对学历要求占比\n",
"pd.pivot_table(\n",
" jobs_df,\n",
" index='edu',\n",
" values='pos_count',\n",
" aggfunc='sum',\n",
" observed=True\n",
").plot(\n",
" kind='pie',\n",
" ylabel='',\n",
" subplots=True,\n",
" legend=False,\n",
" autopct='%.2f%%',\n",
" pctdistance=0.85,\n",
" wedgeprops={'width': 0.35}\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15202818-48a8-40bb-8701-47bc1b481f80",
"metadata": {},
"outputs": [],
"source": [
"jobs_df['year'] = jobs_df.year.astype('category').cat.reorder_categories(['应届生', '1年以内', '经验不限', '1-3年', '3-5年', '5年以上'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d965229-36d5-4c38-850c-3300726f208a",
"metadata": {},
"outputs": [],
"source": [
"# 统计招聘岗位对工作年限要求绘制饼图\n",
"pd.pivot_table(\n",
" jobs_df,\n",
" index='year',\n",
" values='pos_count',\n",
" aggfunc='sum',\n",
" observed=True\n",
").plot(\n",
" kind='pie',\n",
" y='pos_count',\n",
" ylabel='',\n",
" legend=False,\n",
" autopct='%.2f%%',\n",
" pctdistance=0.85,\n",
" wedgeprops={'width': 0.35}\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c7ab419-0bc4-428f-be9b-6d4d617b6663",
"metadata": {},
"outputs": [],
"source": [
"# 统计不同学历和工作年限平均薪资\n",
"temp6 = pd.pivot_table(\n",
" jobs_df,\n",
" index='edu',\n",
" columns='year',\n",
" values='salary',\n",
" observed=False,\n",
" fill_value=0\n",
").round(1)\n",
"temp6"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8492b17-5ae8-47f0-a058-ab303a6087a9",
"metadata": {},
"outputs": [],
"source": [
"# 绘制热力图\n",
"plt.imshow(temp6, cmap='Reds')\n",
"plt.xticks(np.arange(6), labels=temp6.columns)\n",
"plt.yticks(np.arange(4), labels=temp6.index)\n",
"\n",
"for i in range(temp6.index.size):\n",
" for j in range(temp6.columns.size):\n",
" value = temp6.iat[i, j]\n",
" color = 'w' if value > salary_avg else 'k'\n",
" plt.text(j, i, value, ha='center', va='center', color=color)\n",
"\n",
"# 定制颜色条\n",
"plt.colorbar()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3eb8cd0-fe07-43a4-9514-8a38c08ca081",
"metadata": {},
"outputs": [],
"source": [
"# %pip install seaborn"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6287994-0865-432a-9932-92b05b5bf7e8",
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"\n",
"sns.heatmap(temp6, cmap='Reds', annot=True)\n",
"plt.xlabel('')\n",
"plt.ylabel('')\n",
"plt.yticks(rotation=0)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}