{
"cells": [
{
"cell_type": "markdown",
"id": "6fc07f67-318b-4d79-8d4e-4eb8a2c61be2",
"metadata": {},
"source": [
"## NumPy进阶"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a9d74703-47d5-44f4-8566-eb7d5476c792",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"plt.rcParams['font.sans-serif'].insert(0, 'SimHei')\n",
"plt.rcParams['axes.unicode_minus'] = False"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d139c565-6bf2-4bf6-9d66-d2755b29d1db",
"metadata": {},
"outputs": [],
"source": [
"%config InlineBackend.figure_format = 'svg'\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"id": "d41a57e5-6009-455b-aff9-4f96682423fc",
"metadata": {},
"source": [
"### NumPy中的函数\n",
"\n",
"#### 通用一元函数"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5f881886-8aca-40cb-a9f3-4514e28b8fe3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1., 2., 3., inf, nan, -inf, nan, 5.])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# inf - infinity\n",
"# nan - not a number\n",
"array1 = np.array([1, 2, 3, np.inf, np.nan, -np.inf, np.nan, 5])\n",
"array1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b6e891cc-035c-4e98-9406-1e78e3623e76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dtype('float64')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"array1.dtype"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "674995a2-e50a-45a6-b1ad-7a9f88331dd0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, False, False, False, True, False, True, False])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.isnan(array1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "358641bc-510c-4f1b-9df7-5dd54de47978",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1., 2., 3., inf, -inf, 5.])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"array1[~np.isnan(array1)]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "20030d7d-822e-45c7-b962-3aaf706e133c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ True, True, True, False, False, False, False, True])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.isfinite(array1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "357cc22a-7acf-46ee-9523-631134dc8eae",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1., 2., 3., 5.])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"array1[np.isfinite(array1)]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c38f23a4-7d72-4ce4-9bc4-9f8cd9433fa8",
"metadata": {},
"outputs": [],
"source": [
"x = np.linspace(0.5, 10, 72)\n",
"y1 = np.sin(x)\n",
"y2 = np.log2(x)\n",
"y3 = np.sqrt(x)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "08a272bc-8765-455b-b0d0-700872071cf4",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Create the figure (8 x 4 inches)\n",
"plt.figure(figsize=(8, 4))\n",
"# Draw the three curves as line plots\n",
"plt.plot(x, y1, marker='.', label='$y=sin(x)$')\n",
"plt.plot(x, y2, label='$y=log_{2}x$', linewidth=3, color='#9c9c9c')\n",
"# Raw string: '\\s' is not a valid Python escape sequence, so the backslash of the\n",
"# mathtext command has to reach matplotlib unprocessed\n",
"plt.plot(x, y3, label=r'$y=\\sqrt{x}$', linestyle='-.', linewidth=0.5)\n",
"# Show the legend\n",
"plt.legend(loc='center right')\n",
"# Render the chart\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "0b292f56-dab7-469e-89ed-0fb2114902aa",
"metadata": {},
"source": [
"#### 通用二元函数"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8b67932a-481e-4e2d-9d83-00994a01d959",
"metadata": {},
"outputs": [],
"source": [
"array2 = np.array([0.1 + 0.2, 0.1 + 0.2 + 0.3])\n",
"array3 = np.array([0.3, 0.6])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "23581a64-7b02-4f3f-8a5f-ea49ec20e48a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, False])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"array2 == array3"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ddcb612c-c7aa-44c6-b8d3-1ee123fba534",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.all(array2 == array3)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "40d454ad-8c60-4132-8dde-10726180e552",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 比较两个数组元素是否(几乎)完全相等 - 有误差容忍度\n",
"np.allclose(array2, array3)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f1aab287-9d50-4b32-94fa-26d7f183fde3",
"metadata": {},
"outputs": [],
"source": [
"array4 = np.array([1, 2, 3, 4, 5, 6])\n",
"array5 = np.array([2, 4, 6, 8, 10])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ea25f2ad-e007-486b-a402-eb3436c9346c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([2, 4, 6])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 交集\n",
"np.intersect1d(array4, array5)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e4d2116c-c895-4597-95dc-fda67a1c99a8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 2, 3, 4, 5, 6, 8, 10])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 并集\n",
"np.union1d(array4, array5)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "5348c4f2-4222-4904-bd07-a3c85e62e4c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 3, 5])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 差集\n",
"np.setdiff1d(array4, array5)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "9cd3f3e5-a986-469f-97ba-c739aa4b8577",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 3, 5, 8, 10])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 对称差\n",
"np.setxor1d(array4, array5)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "78eb3a98-5992-452e-ab13-eaf578cab7a0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, True, False, True, False, True])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 成员运算\n",
"# np.in1d(array4, array5)\n",
"np.isin(array4, array5)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "c7bbc624-dfdc-41d9-bfa2-7a4f3a387bce",
"metadata": {},
"outputs": [],
"source": [
"# 杰卡德相似度\n",
"user_a = np.array(['平板电脑', '尿不湿', '手机', '键盘', '手机支架', '奶瓶', '婴儿辅食', '基围虾', '巴沙鱼', '生抽', '沙拉酱'])\n",
"user_b = np.array(['平板电脑', '键盘', '充电宝', '补光灯', '生抽', '散热器', '笔记本电脑', '双肩包', '登山杖', '露营帐篷', '睡袋'])\n",
"user_c = np.array(['沐浴露', '维C泡腾片', '牛奶', '尿不湿', '平板电脑', '奶瓶', '婴儿辅食', '手机', '磨牙棒', '生抽', '基围虾'])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "c4132979-1ef5-4e2b-93a4-2d6bfc66f38b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.15789473684210525"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.intersect1d(user_a, user_b).size / np.union1d(user_a, user_b).size"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "46dda506-908b-405a-8e9b-c14090da05b7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.4666666666666667"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.intersect1d(user_a, user_c).size / np.union1d(user_a, user_c).size"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "fb5435f8-b8d4-4b37-88fb-13149a62660e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['巴沙鱼', '手机支架', '沙拉酱', '键盘'], dtype=' 50], [array11 * 10, array11 // 10], default=100)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "4d7b8700-a431-4da4-99c6-54eda9f065dd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 60, 260, 440, 10, 7, 260, 150, 430, 9, 7])"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 给出一个条件和两个表达式,满足条件执行表达式1,不满足条件执行表达式2\n",
"np.where(array11 < 50, array11 * 10, array11 // 10)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "bc672588-f36b-4951-902c-6b70ef83d1af",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 6, 26, 44, 1, 77, 26, 15, 43, 93, 72])"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"array11"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "aa96b613-cef4-475e-b88c-f6ee8194ef4c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([93, 72, 6, 26, 44, 1, 77, 26, 15, 43])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 滚动数组元素\n",
"np.roll(array11, 2)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "bfeed7b4-d835-4a1a-9858-d27d4bc363c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([44, 1, 77, 26, 15, 43, 93, 72, 6, 26])"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.roll(array11, -2)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "52b9da9d-dd2b-4fea-984c-6d4b94efedab",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[6, 6, 1],\n",
" [1, 1, 2],\n",
" [2, 2, 3],\n",
" [3, 3, 4],\n",
" [4, 4, 5],\n",
" [5, 5, 6]])"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.roll(array10, 2)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "53aa8382-51e2-4634-bb9f-4ff84ddaef60",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[5, 5, 5],\n",
" [6, 6, 6],\n",
" [1, 1, 1],\n",
" [2, 2, 2],\n",
" [3, 3, 3],\n",
" [4, 4, 4]])"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.roll(array10, 2, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "5f2de13b-2773-4a87-a854-ee9486dfcc0d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 2, 3],\n",
" [4, 5, 6],\n",
" [7, 8, 9]])"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"array12 = np.arange(1, 10).reshape((3, 3))\n",
"array12"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "3808483f-2811-45fd-adbb-a10bcc9d7dc6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[4, 5, 6],\n",
" [7, 8, 9],\n",
" [1, 2, 3]])"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.roll(array12, 2, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "7f6d8d16-2e35-4926-ba13-d5e9cf77660d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[3, 1, 2],\n",
" [6, 4, 5],\n",
" [9, 7, 8]])"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.roll(array12, 1, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "29b76e3a-8414-4658-b0c6-7795f2186fbe",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 6, 33, 44, 88, 77, 33, 15, 88, 93, 72])"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 替换数组元素\n",
"np.put(array11, [1, 3, 5, 7], [33, 88])\n",
"array11"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "eb4653ef-5c5a-4d3c-adb2-0f10d79ca6c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 6, 33, 44, 44, 99, 33, 15, 44, 99, 44])"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.place(array11, array11 > 50, [44, 99])\n",
"array11"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "03bcc628-1471-44ef-ad56-88468c08548d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(750, 500, 3)"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"guido_image = plt.imread('res/guido.jpg')\n",
"guido_image.shape"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "16fa2543-195b-47d6-8e29-2682800f91aa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.imshow(np.flip(guido_image, axis=0))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "259830fa-6cd3-43be-bffe-560050f795b9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.imshow(np.flip(guido_image, axis=1))"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "6059087d-85ba-4151-9af0-8870876a6b1a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.imshow(np.flip(guido_image, axis=2))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "70ebbf72-87c9-41d8-8c46-f1eccb531206",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.imshow(guido_image)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "015548b9-819c-49ba-a13d-7777662a7414",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.imshow(guido_image.swapaxes(0, 1))"
]
},
{
"cell_type": "markdown",
"id": "7c4c00de-f16f-4aac-ae37-e0b9df0eb6c2",
"metadata": {},
"source": [
"#### 普通函数矢量化"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "29d231a2-57cd-4786-85cd-1366f5378185",
"metadata": {},
"outputs": [],
"source": [
"# Use the np.vectorize decorator to turn a scalar function into an element-wise one\n",
"@np.vectorize\n",
"def fac(n):\n",
"    \"\"\"Return the factorial of a non-negative integer ``n``.\n",
"\n",
"    After decoration the function accepts array-likes and is applied to\n",
"    every element.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``n`` is negative.\n",
"    \"\"\"\n",
"    # Work on a plain Python int; an iterative product never re-enters the\n",
"    # vectorized wrapper the way a recursive call to ``fac`` would\n",
"    n = int(n)\n",
"    if n < 0:\n",
"        raise ValueError('fac() is not defined for negative values')\n",
"    result = 1\n",
"    for factor in range(2, n + 1):\n",
"        result *= factor\n",
"    return result"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "7c04cc06-ba86-4b1b-a4e1-75e4527f6dde",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 2, 3, 4, 5, 6, 7, 8])"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp = np.arange(1, 9)\n",
"temp"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "38e8026a-6e75-4e1b-a646-98c86736797f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 2, 6, 24, 120, 720, 5040, 40320])"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fac(temp)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "4ac8bb35-87f0-44d0-930e-3fc0c5fb63c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([24, 70, 79, 26, 41, 53, 56, 59, 72, 21]),\n",
" array([63, 56, 32, 59, 51, 60, 62, 58, 67, 59]))"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1 = np.random.randint(20, 80, 10)\n",
"x2 = np.random.randint(30, 70, 10)\n",
"x1, x2"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "4baced9b-fee2-4c2b-b7ca-dc5914b120b9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 3, 14, 1, 1, 1, 1, 2, 1, 1, 1])"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from math import gcd, lcm\n",
"\n",
"gcd = np.vectorize(gcd)\n",
"gcd(x1, x2)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "08c6a075-8b61-4716-92c5-126cd78108c1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 504, 280, 2528, 1534, 2091, 3180, 1736, 3422, 4824, 1239])"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import math\n",
"\n",
"# Wrap math.lcm directly instead of rebinding the imported name to itself, so\n",
"# re-running this cell never wraps an already-vectorized function\n",
"lcm = np.vectorize(math.lcm)\n",
"lcm(x1, x2)"
]
},
{
"cell_type": "markdown",
"id": "859bba4b-a0cf-4140-a8de-b2f3fffcd355",
"metadata": {},
"source": [
"### 广播机制\n",
"\n",
"两个形状(shape属性)不一样的数组如果要做运算,要先通过广播机制使其形状一样才能运算。\n",
"\n",
"如果要执行广播机制使得两个数组形状一样,需要满足以下两个条件其中一个:\n",
"\n",
"1. 两个数组后缘维度(shape属性从后往前看对应的部分)相同。\n",
"2. 两个数组后缘维度不同,但是其中一方为1。"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "3339c56a-b68c-401e-a27c-134be60ccf14",
"metadata": {},
"outputs": [],
"source": [
"temp1 = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3]])\n",
"temp2 = np.array([1, 2, 3])"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "4ecc9498-792c-4de1-a120-585003b19087",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 2, 3],\n",
" [2, 3, 4],\n",
" [3, 4, 5],\n",
" [4, 5, 6]])"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp1 + temp2"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "2cb20c46-3f87-4346-80ab-7e2786ca1475",
"metadata": {},
"outputs": [],
"source": [
"temp3 = np.array([[1], [2], [3], [4]])"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "58932222-bb83-43b7-8cef-b9574898dbb5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 1, 1],\n",
" [3, 3, 3],\n",
" [5, 5, 5],\n",
" [7, 7, 7]])"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp1 + temp3"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "74b6c376-05ac-4049-a4b9-5f0614de780e",
"metadata": {},
"outputs": [],
"source": [
"temp4 = np.array([1 ,2, 3])\n",
"temp5 = np.array([[3], [2], [1]])"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "eefe6354-4614-4dfb-aa94-3d146b770b3b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3,)"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp4.shape"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "04dd44a5-6bea-41eb-aad0-b0a18d64a512",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3, 1)"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp5.shape"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "764eb0c3-991d-4a5f-a4eb-7da2c0b445bd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[4, 5, 6],\n",
" [3, 4, 5],\n",
" [2, 3, 4]])"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp4 + temp5"
]
},
{
"cell_type": "markdown",
"id": "8f1022cb-c07a-4149-aafd-9f53d235da4f",
"metadata": {},
"source": [
"### 矩阵"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "8a16c29a-088a-473b-b856-3dbd6660f9cd",
"metadata": {},
"outputs": [],
"source": [
"m1 = np.array([[1, 0, 2], [-1, 3, 1]])\n",
"m2 = np.array([[3, 1], [2, 1], [1, 0]])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "74a54196-ca95-4425-9212-62869795aed7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2, 3)"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m1.shape"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "1afbceea-782d-49c9-b6d4-d0848f3fdd99",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3, 2)"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m2.shape"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "bc65a37a-84eb-4aac-b63f-5625a0c15a3a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[5, 1],\n",
" [4, 2]])"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m1 @ m2"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "67be1c96-a20c-4862-87cb-8579d3a303f5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[5, 1],\n",
" [4, 2]])"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.matmul(m1, m2)"
]
},
{
"cell_type": "markdown",
"id": "5fd602a8-61fc-4c5c-94a6-a930bcf6fb2f",
"metadata": {},
"source": [
"$$\n",
"\\begin{cases}\n",
"x_1 + 2x_2 + x_3 = 8 \\\\\n",
"3x_1 + 7x_2 + 2x_3 = 23 \\\\\n",
"2x_1 + 2x_2 + x_3 = 9\n",
"\\end{cases}\n",
"$$"
]
},
{
"cell_type": "markdown",
"id": "d41b4856-79c3-4f48-9d8e-58c8d6045884",
"metadata": {},
"source": [
"$$\n",
"\\boldsymbol{A} = \\begin{bmatrix}\n",
"1 & 2 & 1\\\\\n",
"3 & 7 & 2\\\\\n",
"2 & 2 & 1\n",
"\\end{bmatrix}, \\quad\n",
"\\boldsymbol{x} = \\begin{bmatrix}\n",
"x_1 \\\\\n",
"x_2\\\\\n",
"x_3\n",
"\\end{bmatrix}, \\quad\n",
"\\boldsymbol{b} = \\begin{bmatrix}\n",
"8 \\\\\n",
"23\\\\\n",
"9\n",
"\\end{bmatrix}\n",
"$$"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "edb85b44-50b3-4115-8539-5023a19bb2a1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1., 2., 3.],\n",
" [4., 5., 6.],\n",
" [7., 8., 8.]])"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m3 = np.arange(1, 10, dtype='f8').reshape(3, 3)\n",
"m3[-1, -1] = 8\n",
"m3"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "0b90dbf4-05f7-43ad-a37d-20d48d03dd3a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 计算矩阵的秩\n",
"np.linalg.matrix_rank(m3)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "a442ab40-97a9-4cf5-8d61-bfcaa3561655",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-2.66666667, 2.66666667, -1. ],\n",
" [ 3.33333333, -4.33333333, 2. ],\n",
" [-1. , 2. , -1. ]])"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 逆矩阵 - 奇异矩阵不能求逆矩阵\n",
"# LinAlgError: Singular matrix\n",
"np.linalg.inv(m3)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "19be7ff8-6a71-40ad-8e37-20008c56be7a",
"metadata": {},
"outputs": [],
"source": [
"# 有唯一解的条件:系数矩阵的秩等于增广矩阵的秩,同时跟未知数的个数相同。\n",
"# 秩(rank):线性无关的行或者列的数量。\n",
"# 线性相关:一个向量可以通过其他向量做线性变换(数乘和加法)得到,那么它们就是线性相关的。"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "87043db3-e2bd-4a70-950a-d74163afc4d1",
"metadata": {},
"outputs": [],
"source": [
"A = np.array([[1, 2, 1], [3, 7, 2], [2, 2, 1]])\n",
"b = np.array([8, 23, 9]).reshape(-1, 1)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "479d272b-2b46-4374-93f3-54e13af52d59",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 系数矩阵的秩\n",
"np.linalg.matrix_rank(A)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "faa35591-09da-4232-9b38-72ee2fb824cc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 增广矩阵的秩\n",
"np.linalg.matrix_rank(np.hstack((A, b)))"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "e8d02dee-7d86-4f9d-8e33-77b5a76784a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1.],\n",
" [2.],\n",
" [3.]])"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 解线性方程组\n",
"np.linalg.solve(A, b)"
]
},
{
"cell_type": "markdown",
"id": "336ee288-5be1-41e5-89cb-e22f465efdd2",
"metadata": {},
"source": [
"$$\n",
"A \\cdot x = b\n",
"$$\n",
"$$\n",
"A^{-1} \\cdot A \\cdot x = A^{-1} \\cdot b\n",
"$$\n",
"$$\n",
"I \\cdot x = A^{-1} \\cdot b\n",
"$$\n",
"$$\n",
"x = A^{-1} \\cdot b\n",
"$$"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "7c2bd2fd-8867-4dad-8bbc-b5b175f690b7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1.],\n",
" [2.],\n",
" [3.]])"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 通过逆矩阵解线性方程组\n",
"np.linalg.inv(A) @ b"
]
},
{
"cell_type": "markdown",
"id": "cd91252a-31d4-40d5-82ef-71f22f0bd39c",
"metadata": {},
"source": [
"#### 补充 - 用scipy处理图像"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "9052864d-a32d-4cd3-9e2a-8df1654b8a67",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from scipy.ndimage import gaussian_filter, sobel\n",
"\n",
"# 获取灰度图\n",
"guido_image = plt.imread('res/guido.jpg')\n",
"gray_image = np.mean(guido_image, axis=2)\n",
"\n",
"plt.figure(figsize=(12, 4))\n",
"\n",
"# 灰度图\n",
"plt.subplot(1, 4, 1)\n",
"plt.imshow(gray_image, cmap=plt.cm.gray)\n",
"\n",
"# 模糊和锐化\n",
"plt.subplot(1, 4, 2)\n",
"blurred_image = gaussian_filter(gray_image, 3)\n",
"plt.imshow(blurred_image, cmap=plt.cm.gray)\n",
"\n",
"plt.subplot(1, 4, 3)\n",
"filtered_image = gaussian_filter(blurred_image, 1)\n",
"sharpen_image = blurred_image + 32 * (blurred_image - filtered_image)\n",
"plt.imshow(sharpen_image, cmap=plt.cm.gray)\n",
"\n",
"# 边缘图\n",
"plt.subplot(1, 4, 4)\n",
"# 使用索贝尔算子(邻点灰度加权差)进行边缘检测\n",
"edge_image = sobel(gray_image)\n",
"plt.imshow(edge_image, cmap=plt.cm.gray)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "dbbc6edf-c643-4cb4-a98b-05e806413478",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from scipy.ndimage import rotate, zoom\n",
"\n",
"plt.figure(figsize=(12, 4))\n",
"\n",
"# 旋转\n",
"plt.subplot(1, 3, 1)\n",
"rotated_image = rotate(guido_image, -16, reshape=True)\n",
"plt.imshow(rotated_image)\n",
"\n",
"# 旋转\n",
"plt.subplot(1, 3, 2)\n",
"rotated_image = rotate(guido_image, -16, reshape=False)\n",
"plt.imshow(rotated_image)\n",
"\n",
"# 缩放\n",
"plt.subplot(1, 3, 3)\n",
"scaled_image = zoom(guido_image, zoom=(0.8, 1.25, 1))\n",
"plt.imshow(scaled_image)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "b6989fb7-a3ab-4889-80be-f4ecce033e5c",
"metadata": {},
"source": [
"### 多项式"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "0c719ca9-de74-4c2e-aaf2-d0a9c833f512",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 3\n",
"3 x + 2 x + 1\n",
" 2\n",
"1 x + 2 x + 3\n"
]
}
],
"source": [
"# NumPy老版本用poly1d表示多项式\n",
"p1 = np.poly1d([3, 0, 2, 1])\n",
"p2 = np.poly1d([1, 2, 3])\n",
"print(p1)\n",
"print(p2)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "e6aab279-fa9f-4d4e-b597-b3c51b478777",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 3 2\n",
"3 x + 1 x + 4 x + 4\n"
]
}
],
"source": [
"# 多项式加法\n",
"print(p1 + p2)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "375a1989-7231-432c-bf33-06fce490cacf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 5 4 3 2\n",
"3 x + 6 x + 11 x + 5 x + 8 x + 3\n"
]
}
],
"source": [
"# 多项式乘法\n",
"print(p1 * p2)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "4df5662a-c0bc-434d-b53f-623e09b51a76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 令x=2,计算多项式的值\n",
"p2(2)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "df150df6-9904-45b7-8704-dbb033552dcb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2\n",
"9 x + 2\n"
]
}
],
"source": [
"# 求导\n",
"print(p1.deriv())"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "d2d3002c-59d6-4e00-b00f-69f3f6ac6609",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 4 2\n",
"0.75 x + 1 x + 1 x\n"
]
}
],
"source": [
"# 求不定积分\n",
"print(p1.integ())"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "0965b9e9-42fc-425c-90cb-d265e3f6e42f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2\n",
"1 x + 3 x + 2\n",
"[-2. -1.]\n"
]
}
],
"source": [
"p3 = np.poly1d([1, 3, 2])\n",
"print(p3)\n",
"# 令多项式等于0,求解x\n",
"print(p3.roots)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "c5bda2d2-0d61-46db-ac48-0f7ec3438660",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"numpy.poly1d"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(p3)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "a71c4799-183e-4096-9903-7edf9464feb7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0 + 2.0·x + 0.0·x² + 3.0·x³\n"
]
}
],
"source": [
"from numpy.polynomial import Polynomial\n",
"\n",
"# NumPy新版本用Polynomial表示多项式\n",
"p1 = Polynomial([1, 2, 0, 3])\n",
"print(p1)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "0002c6e3-697b-425c-a17c-802e10f07981",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.0 + 0.0·x + 9.0·x²\n"
]
}
],
"source": [
"print(p1.deriv())"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "1f9dde21-0854-4ae6-9703-9599a4204003",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0 + 1.0·x + 1.0·x² + 0.0·x³ + 0.75·x⁴\n"
]
}
],
"source": [
"print(p1.integ())"
]
},
{
"cell_type": "code",
"execution_count": 102,
"id": "eb191b28-fb2c-460b-9c87-0041b128ba16",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 多项式的次数(最高次项的次数)\n",
"p1.degree()"
]
},
{
"cell_type": "markdown",
"id": "2d6c3103-c5b0-413f-b2de-324b0394e3ae",
"metadata": {},
"source": [
"### 最小二乘解"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "7163cb46-44bb-487c-8dd2-7c530574ecc0",
"metadata": {},
"outputs": [],
"source": [
"# 每月收入\n",
"x = np.array([3200, 4811, 5386, 5564, 6120, 6691, 6906, 7483, 7587, 7890,\n",
" 8090, 8300, 8650, 8835, 8975, 9070, 9100, 9184, 9247, 9313, \n",
" 9465, 9558, 9853, 9938, 10020, 10242, 10343, 10731, 10885, 10990, \n",
" 11100, 11227, 11313, 11414, 11630, 11806, 11999, 12038, 12400, 12547, \n",
" 12890, 13050, 13360, 13850, 14890, 14990, 15500, 16899, 17010, 19880])\n",
"# 每月网购支出\n",
"y = np.array([1761, 882, 1106, 182, 1532, 1978, 2174, 2117, 2134, 1924, \n",
" 2207, 2876, 2617, 2683, 3054, 3277, 3345, 3462, 3401, 3591,\n",
" 3596, 3671, 3829, 3907, 3852, 4288, 4359, 4099, 4300, 4367,\n",
" 5019, 4873, 4674, 5174, 4666, 5797, 5782, 5451, 5487, 5448,\n",
" 6002, 6439, 6309, 6045, 5935, 6928, 7356, 6682, 6672, 6582])"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "a68a1d0d-0dc9-4437-bed9-6e9aff589df1",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 定性分析 - 散点图\n",
"plt.scatter(x, y)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"id": "89030441-eefd-4a02-ad41-ba154dbf87b2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(ShapiroResult(statistic=0.9830237255502384, pvalue=0.6844559821829901),\n",
" ShapiroResult(statistic=0.9789829625124067, pvalue=0.5099101868610301))"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy import stats\n",
"\n",
"# 夏皮洛检验(正态性判定)\n",
"stats.shapiro(x), stats.shapiro(y)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"id": "93573426-e1c0-4025-8b8f-2d83d69e103c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1. , 0.93422273],\n",
" [0.93422273, 1. ]])"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 定量分析 - 相关系数 - correlation coefficient\n",
"# 皮尔逊相关系数(标准化的协方差 - [-1, 1])\n",
"# 1. 连续值且成对出现\n",
"# 2. 没有异常值\n",
"# 3. 来自于正态总体\n",
"np.corrcoef(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "4b6e47de-8e3b-4be4-a5e4-f1d95649e6f9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PearsonRResult(statistic=0.9342227278473364, pvalue=3.9566343708624996e-23)"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 计算皮尔逊相关系数\n",
"stats.pearsonr(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"id": "f8e53b1e-b2b1-4222-a69a-dcbca0041d0e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Map each monthly income to its online-shopping spend; incomes act as keys,\n",
"# so duplicate incomes would collapse into a single entry\n",
"history_data = dict(zip(x, y))\n",
"len(history_data)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "1530478e-ec70-4f87-828f-1d6798c53c5e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[67, 88, 28, 95, 96, 10, 70, 80, 84, 8, 19, 68, 1, 90, 39]"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = np.random.randint(1, 100, 15).tolist()\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "7d246563-86fd-418f-b058-4aae08d9b9c1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1, 8, 10]\n",
"[96, 95, 90, 88, 84]\n"
]
}
],
"source": [
"import heapq\n",
"\n",
"# 通过堆(heap)结构快速的找到TopN元素\n",
"print(heapq.nsmallest(3, data))\n",
"print(heapq.nlargest(5, data))"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "866f9fd9-58aa-4c03-9415-99aa236958d4",
"metadata": {},
"outputs": [],
"source": [
"# 目标:因为月收入和网购支出之间有强相关关系,所以我们可以通过月收入预测网购支出\n",
"# 方法1:输入一个月收入,找到跟这个收入最接近的N条数据,用它们的平均值预测对应的网购支出\n",
"# KNN - k最近邻算法(找到k个最近的邻居,用这k个邻居的数据来做出预测)\n",
"import heapq\n",
"\n",
"\n",
"def predicate_by_knn(income, k=5):\n",
"    \"\"\"Predict online-shopping spend for a monthly income with k-nearest neighbours.\n",
"\n",
"    Selects the ``k`` incomes in ``history_data`` closest to ``income``\n",
"    (ranked by squared difference) and returns the mean of their recorded\n",
"    spends, rounded to two decimals.\n",
"    \"\"\"\n",
"    def squared_gap(known_income):\n",
"        return (known_income - income) ** 2\n",
"\n",
"    nearest_incomes = heapq.nsmallest(k, history_data, key=squared_gap)\n",
"    neighbour_spends = [history_data[known_income] for known_income in nearest_incomes]\n",
"    return np.mean(neighbour_spends).round(2)"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "399b9b6a-6625-4dc2-a09c-c0699c538a46",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5937.0"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predicate_by_knn(12800)"
]
},
{
"cell_type": "code",
"execution_count": 113,
"id": "db0ee4bf-83f8-489f-a543-80854571da60",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1987.0"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predicate_by_knn(6800)"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "6e514003-c143-4823-91eb-57f7608f6137",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6645.33"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predicate_by_knn(20000, k=3)"
]
},
{
"cell_type": "markdown",
"id": "aa44d3ad-aa3f-44db-889a-8a69b9a9be65",
"metadata": {},
"source": [
"回归模型:\n",
"$$ Y = aX + b $$\n",
"\n",
"损失函数:\n",
"$$ MSE = \\frac{1} {N} \\sum (\\hat{y_i} - y_i)^2 $$"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "032eb740-ac9a-4526-9eb4-b6770907b07a",
"metadata": {},
"outputs": [],
"source": [
"# MSE - Mean Squared Error\n",
"def get_loss(a, b):\n",
"    \"\"\"Return the mean squared error of the line ``a * x + b`` against ``y``.\n",
"\n",
"    ``x`` and ``y`` are the notebook-level sample arrays.\n",
"    \"\"\"\n",
"    residuals = (a * x + b) - y\n",
"    return np.mean(np.square(residuals))"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "0d56ccc4-5a0d-4b3a-89ab-01cf690c4fba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1120507.2896234244\n",
"808664.5200191085\n",
"443441.57239664154\n",
"408322.6807976254\n",
"394598.0903141962\n",
"394480.4982102699\n",
"394457.8332210129\n",
"393928.3764661801\n",
"393882.835806886\n",
"393829.0258808886\n",
"393817.59355663764\n",
"0.5068829877448287 -1209.315185532824\n"
]
}
],
"source": [
"# Monte Carlo simulation (random guessing): sample random (a, b) pairs\n",
"# and keep the pair that yields the lowest MSE\n",
"import random\n",
"\n",
"# Dedicated, seeded generator so Restart & Run All reproduces the same fit\n",
"rng = random.Random(42)\n",
"\n",
"min_loss = np.inf\n",
"ba, bb = None, None\n",
"\n",
"for _ in range(10000):\n",
"    a = rng.uniform(0.5, 1.0)  # candidate slope\n",
"    b = rng.uniform(-2000, -1000)  # candidate intercept\n",
"    curr_loss = get_loss(a, b)\n",
"    if curr_loss < min_loss:\n",
"        min_loss = curr_loss\n",
"        ba, bb = a, b\n",
"        # Log every improvement so the convergence is visible\n",
"        print(min_loss)\n",
"print(ba, bb)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "af5f7d02-f921-4712-82e3-1ce43c86c708",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(x, y)\n",
"plt.plot(x, ba * x + bb, color='r', linewidth=4)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 118,
"id": "207c2da0-371f-40ea-b19f-f56cca8a7d30",
"metadata": {},
"outputs": [],
"source": [
"def predicate_by_regression(income):\n",
"    \"\"\"Predict online-shopping spend from ``income`` using the fitted line.\n",
"\n",
"    Evaluates ``ba * income + bb`` with the notebook-level slope ``ba`` and\n",
"    intercept ``bb`` and rounds the result to two decimals.\n",
"    \"\"\"\n",
"    predicted_spend = ba * income + bb\n",
"    return round(predicted_spend, 2)"
]
},
{
"cell_type": "code",
"execution_count": 119,
"id": "a9e3af32-654c-4704-9f32-cfe41f067d06",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2237.49"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predicate_by_regression(6800)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"id": "5133079a-59f8-4435-af1e-fb5bc13c457d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5278.79"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predicate_by_regression(12800)"
]
},
{
"cell_type": "markdown",
"id": "200b9821-b8c8-41df-ae8e-d96bf5049415",
"metadata": {},
"source": [
"将回归模型代入损失函数:\n",
"$$ f(a, b) = \\frac {1} {N} \\sum_{i=1}^{N}(y_i - (ax_i + b))^2 $$\n",
"\n",
"如何让$f(a, b)$取到最小值???\n",
"\n",
"求偏导数,并令其等于0。\n",
"$$ \\frac {\\partial {f(a, b)}} {\\partial {a}} = \\frac {2} {N} \\sum_{i=1}^{N}(-x_iy_i + x_i^2a + x_ib) = 0 $$ \n",
"$$ \\frac {\\partial {f(a, b)}} {\\partial {b}} = \\frac {2} {N} \\sum_{i=1}^{N}(-y_i + x_ia + b) = 0 $$\n",
"\n",
"求解得到:\n",
"$$a = \\frac{\\sum(x_{i} - \\bar{x})(y_{i} - \\bar{y})}{\\sum(x_{i} - \\bar{x})^{2}}$$\n",
"$$b = \\bar{y} - a\\bar{x}$$"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "9ab79db5-85c7-44ad-8ffa-2684fb499f7a",
"metadata": {},
"outputs": [],
"source": [
"x_bar, y_bar = np.mean(x), np.mean(y)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "332777b5-bc3f-44d2-bd54-f8e9a19497aa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.5079223873753402, -1227.104582703003)"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ba = np.dot((x - x_bar), (y - y_bar)) / np.sum((x - x_bar) ** 2)\n",
"bb = y_bar - ba * x_bar\n",
"ba, bb"
]
},
{
"cell_type": "code",
"execution_count": 123,
"id": "795f6a40-136a-4772-9e2b-5798022e408d",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(x, y)\n",
"plt.plot(x, ba * x + bb, color='r', linewidth=4)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 124,
"id": "2bfac239-7bf0-4784-a124-bbfdb4cdafe8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 5.07922387e-01, -1.22710458e+03])"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 拟合出一个线性回归模型\n",
"np.polyfit(x, y, deg=1)"
]
},
{
"cell_type": "code",
"execution_count": 125,
"id": "9d5adc92-73e2-40df-843b-02efb28c6ae3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(-1.783556141649917e-05, 0.905493221908901, -3247.1511980120213)"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 拟合出一个多项式回归模型\n",
"a, b, c = np.polyfit(x, y, deg=2)\n",
"a, b, c"
]
},
{
"cell_type": "code",
"execution_count": 126,
"id": "c42ee939-e4b9-479e-ad73-66e806e29c6c",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(x, y)\n",
"plt.plot(x, a * x ** 2 + b * x + c, color='r', linewidth=4)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 127,
"id": "3363f359-3615-4ffa-a040-41e39c83b38f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-1.22710458e+03, 5.07922387e-01])"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same linear fit through the Polynomial class API; the coefficients come back\n",
"# lowest degree first (intercept, slope), the reverse of np.polyfit.\n",
"# The class is referenced through np so the cell does not rely on a separate import.\n",
"np.polynomial.Polynomial.fit(x, y, deg=1).convert().coef"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}