{ "cells": [ { "cell_type": "markdown", "id": "6fc07f67-318b-4d79-8d4e-4eb8a2c61be2", "metadata": {}, "source": [ "## NumPy进阶" ] }, { "cell_type": "code", "execution_count": 1, "id": "a9d74703-47d5-44f4-8566-eb7d5476c792", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "plt.rcParams['font.sans-serif'].insert(0, 'SimHei')\n", "plt.rcParams['axes.unicode_minus'] = False" ] }, { "cell_type": "code", "execution_count": 2, "id": "d139c565-6bf2-4bf6-9d66-d2755b29d1db", "metadata": {}, "outputs": [], "source": [ "%config InlineBackend.figure_format = 'svg'\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "id": "d41a57e5-6009-455b-aff9-4f96682423fc", "metadata": {}, "source": [ "### NumPy中的函数\n", "\n", "#### 通用一元函数" ] }, { "cell_type": "code", "execution_count": 3, "id": "5f881886-8aca-40cb-a9f3-4514e28b8fe3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1., 2., 3., inf, nan, -inf, nan, 5.])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# inf - infinity\n", "# nan - not a number\n", "array1 = np.array([1, 2, 3, np.inf, np.nan, -np.inf, np.nan, 5])\n", "array1" ] }, { "cell_type": "code", "execution_count": 4, "id": "b6e891cc-035c-4e98-9406-1e78e3623e76", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype('float64')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "array1.dtype" ] }, { "cell_type": "code", "execution_count": 5, "id": "674995a2-e50a-45a6-b1ad-7a9f88331dd0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, False, False, False, True, False, True, False])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.isnan(array1)" ] }, { "cell_type": "code", "execution_count": 6, "id": "358641bc-510c-4f1b-9df7-5dd54de47978", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"array([ 1., 2., 3., inf, -inf, 5.])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "array1[~np.isnan(array1)]" ] }, { "cell_type": "code", "execution_count": 7, "id": "20030d7d-822e-45c7-b962-3aaf706e133c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ True, True, True, False, False, False, False, True])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.isfinite(array1)" ] }, { "cell_type": "code", "execution_count": 8, "id": "357cc22a-7acf-46ee-9523-631134dc8eae", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1., 2., 3., 5.])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "array1[np.isfinite(array1)]" ] }, { "cell_type": "code", "execution_count": 9, "id": "c38f23a4-7d72-4ce4-9bc4-9f8cd9433fa8", "metadata": {}, "outputs": [], "source": [ "x = np.linspace(0.5, 10, 72)\n", "y1 = np.sin(x)\n", "y2 = np.log2(x)\n", "y3 = np.sqrt(x)" ] }, { "cell_type": "code", "execution_count": 10, "id": "08a272bc-8765-455b-b0d0-700872071cf4", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 定制画布\n", "plt.figure(figsize=(8, 4))\n", "# 绘制折线图\n", "plt.plot(x, y1, marker='.', label='$y=sin(x)$')\n", "plt.plot(x, y2, label='$y=log_{2}x$', linewidth=3, color='#9c9c9c')\n", "plt.plot(x, y3, label='$y=\\sqrt{x}$', linestyle='-.', linewidth=0.5)\n", "# 显示图例\n", "plt.legend(loc='center right')\n", "# 显示图表\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "0b292f56-dab7-469e-89ed-0fb2114902aa", "metadata": {}, "source": [ "#### 通用二元函数" ] }, { "cell_type": "code", "execution_count": 11, "id": "8b67932a-481e-4e2d-9d83-00994a01d959", "metadata": {}, "outputs": [], "source": [ "array2 = np.array([0.1 + 0.2, 0.1 + 0.2 + 0.3])\n", "array3 = np.array([0.3, 0.6])" ] }, { "cell_type": "code", "execution_count": 12, "id": "23581a64-7b02-4f3f-8a5f-ea49ec20e48a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, False])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "array2 == array3" ] }, { "cell_type": "code", "execution_count": 13, "id": "ddcb612c-c7aa-44c6-b8d3-1ee123fba534", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.False_" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.all(array2 == array3)" ] }, { "cell_type": "code", "execution_count": 14, "id": "40d454ad-8c60-4132-8dde-10726180e552", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 比较两个数组元素是否（几乎）完全相等 - 有误差容忍度\n", "np.allclose(array2, array3)" ] }, { "cell_type": "code", "execution_count": 15, "id": "f1aab287-9d50-4b32-94fa-26d7f183fde3", "metadata": {}, "outputs": [], "source": [ "array4 = np.array([1, 2, 3, 4, 5, 6])\n", "array5 = np.array([2, 4, 6, 8, 10])" ] }, { "cell_type": "code", "execution_count": 16, "id": "ea25f2ad-e007-486b-a402-eb3436c9346c", "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "array([2, 4, 6])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 交集\n", "np.intersect1d(array4, array5)" ] }, { "cell_type": "code", "execution_count": 17, "id": "e4d2116c-c895-4597-95dc-fda67a1c99a8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 2, 3, 4, 5, 6, 8, 10])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 并集\n", "np.union1d(array4, array5)" ] }, { "cell_type": "code", "execution_count": 18, "id": "5348c4f2-4222-4904-bd07-a3c85e62e4c7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 5])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 差集\n", "np.setdiff1d(array4, array5)" ] }, { "cell_type": "code", "execution_count": 19, "id": "9cd3f3e5-a986-469f-97ba-c739aa4b8577", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 3, 5, 8, 10])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 对称差\n", "np.setxor1d(array4, array5)" ] }, { "cell_type": "code", "execution_count": 22, "id": "78eb3a98-5992-452e-ab13-eaf578cab7a0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, True, False, True, False, True])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 成员运算\n", "# np.in1d(array4, array5)\n", "np.isin(array4, array5)" ] }, { "cell_type": "code", "execution_count": 23, "id": "c7bbc624-dfdc-41d9-bfa2-7a4f3a387bce", "metadata": {}, "outputs": [], "source": [ "# 杰卡德相似度\n", "user_a = np.array(['平板电脑', '尿不湿', '手机', '键盘', '手机支架', '奶瓶', '婴儿辅食', '基围虾', '巴沙鱼', '生抽', '沙拉酱'])\n", "user_b = np.array(['平板电脑', '键盘', '充电宝', '补光灯', '生抽', '散热器', '笔记本电脑', '双肩包', '登山杖', '露营帐篷', '睡袋'])\n", "user_c = np.array(['沐浴露', '维C泡腾片', '牛奶', '尿不湿', '平板电脑', '奶瓶', '婴儿辅食', '手机', '磨牙棒', '生抽', '基围虾'])" ] }, { "cell_type": "code", "execution_count": 24, "id": 
"c4132979-1ef5-4e2b-93a4-2d6bfc66f38b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.15789473684210525" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.intersect1d(user_a, user_b).size / np.union1d(user_a, user_b).size" ] }, { "cell_type": "code", "execution_count": 25, "id": "46dda506-908b-405a-8e9b-c14090da05b7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.4666666666666667" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.intersect1d(user_a, user_c).size / np.union1d(user_a, user_c).size" ] }, { "cell_type": "code", "execution_count": null, "id": "fb5435f8-b8d4-4b37-88fb-13149a62660e", "metadata": {}, "outputs": [], "source": [ "np.setdiff1d(user_a, user_c)" ] }, { "cell_type": "code", "execution_count": null, "id": "f8366b08-779d-4369-9f27-a9b4b9125782", "metadata": {}, "outputs": [], "source": [ "np.setdiff1d(user_c, user_a)" ] }, { "cell_type": "code", "execution_count": null, "id": "b1ce07b5-ec70-4512-814a-e210148ed205", "metadata": {}, "outputs": [], "source": [ "# 余弦相似度\n", "user = np.array([5, 1, 3])\n", "mov1 = np.array([4, 5, 1])\n", "mov2 = np.array([5, 1, 5])" ] }, { "cell_type": "code", "execution_count": null, "id": "5ab70c0f-cfe7-4e10-b162-0feefb36f884", "metadata": {}, "outputs": [], "source": [ "# linear algebra\n", "# np.dot - 点积\n", "# np.linalg.norm - 模长\n", "np.dot(user, mov1) / (np.linalg.norm(user) * np.linalg.norm(mov1))" ] }, { "cell_type": "code", "execution_count": null, "id": "21a1caaf-2cf4-4cf8-836c-d244f4133098", "metadata": {}, "outputs": [], "source": [ "# np.arccos - 反余弦函数 - 弧度\n", "# np.degrees - 弧度换算角度\n", "np.degrees(np.arccos(np.dot(user, mov1) / (np.linalg.norm(user) * np.linalg.norm(mov1))))" ] }, { "cell_type": "code", "execution_count": null, "id": "9b90ecb1-67d0-48be-84c5-54bf212f1292", "metadata": {}, "outputs": [], "source": [ "np.degrees(np.arccos(np.dot(user, mov2) / (np.linalg.norm(user) *
np.linalg.norm(mov2))))" ] }, { "cell_type": "markdown", "id": "c81c6238-f28c-44e8-ac54-94a69f5a6c4a", "metadata": {}, "source": [ "#### 其他常用函数" ] }, { "cell_type": "code", "execution_count": null, "id": "1df2d086-58cd-4324-b2cf-98d8d971a4d7", "metadata": {}, "outputs": [], "source": [ "array6 = np.array([1, 2, 3, 1, 1, 2, 2, 4, 5, 7, 3, 6, 6])\n", "array6" ] }, { "cell_type": "code", "execution_count": null, "id": "70df0735-6680-4996-8cc3-10ac5d9102ab", "metadata": {}, "outputs": [], "source": [ "# 去重\n", "array7 = np.unique(array6)\n", "array7" ] }, { "cell_type": "code", "execution_count": null, "id": "79ad92e0-da87-401d-aa70-0514a5b61d0e", "metadata": {}, "outputs": [], "source": [ "array8 = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])\n", "array9 = np.array([[4, 4, 4], [5, 5, 5], [6, 6, 6]])" ] }, { "cell_type": "code", "execution_count": null, "id": "b099b5ee-d7df-4864-89f3-bb1c7a748a59", "metadata": {}, "outputs": [], "source": [ "# 在0轴方向（垂直）堆叠 - vertical\n", "array10 = np.vstack((array8, array9))\n", "array10" ] }, { "cell_type": "code", "execution_count": null, "id": "204b8e34-43e3-4a4f-9e8c-8734e72a041f", "metadata": {}, "outputs": [], "source": [ "# 在1轴的方向堆叠 - horizontal\n", "np.hstack((array8, array9))" ] }, { "cell_type": "code", "execution_count": null, "id": "3301cc02-3ecf-4516-b2ff-e0fa6a9041c7", "metadata": {}, "outputs": [], "source": [ "# 数组的拼接\n", "np.concatenate((array8, array9), axis=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "c956190c-1382-4416-9ab2-09a19f5567f6", "metadata": {}, "outputs": [], "source": [ "# 堆叠出更高维的数组\n", "np.stack((array8, array9), axis=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "a956be87-65e9-4391-b2be-bfbaaab7e7dc", "metadata": {}, "outputs": [], "source": [ "np.stack((array8, array9), axis=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "df88bd16-64c3-40af-adba-f7faed516159", "metadata": {}, "outputs": [], "source": [ "# 将一个数组拆分成多个数组\n", "np.vsplit(array10, 3)" ] }, { 
"cell_type": "code", "execution_count": null, "id": "592ffc62-2a47-48bc-8789-3cad653d2893", "metadata": {}, "outputs": [], "source": [ "# 追加元素\n", "np.append(array6, [10, 11, 12])" ] }, { "cell_type": "code", "execution_count": null, "id": "7a3355ca-1ee7-4d8f-8567-c10fed5055f6", "metadata": {}, "outputs": [], "source": [ "# 插入元素\n", "np.insert(array6, 1, [10, 20])" ] }, { "cell_type": "code", "execution_count": null, "id": "4b824b66-7471-4669-aa99-8f62a6b8cb2b", "metadata": {}, "outputs": [], "source": [ "array11 = np.random.randint(1, 100, 10)\n", "array11" ] }, { "cell_type": "code", "execution_count": null, "id": "2ff82b6f-9a96-40c2-a4d9-4aaec7d42e9a", "metadata": {}, "outputs": [], "source": [ "# 抽取元素 - 相当于布尔索引的作用\n", "np.extract(array11 < 50, array11)" ] }, { "cell_type": "code", "execution_count": null, "id": "cc00dac0-0b97-435c-960d-06c141de0b78", "metadata": {}, "outputs": [], "source": [ "# 给出一组条件和对应的处理数据的表达式，满足条件就执行对应的表达式，不满足条件取默认值\n", "np.select([array11 < 30, array11 > 50], [array11 * 10, array11 // 10], default=100)" ] }, { "cell_type": "code", "execution_count": null, "id": "4d7b8700-a431-4da4-99c6-54eda9f065dd", "metadata": {}, "outputs": [], "source": [ "# 给出一个条件和两个表达式，满足条件执行表达式1，不满足条件执行表达式2\n", "np.where(array11 < 50, array11 * 10, array11 // 10)" ] }, { "cell_type": "code", "execution_count": null, "id": "bc672588-f36b-4951-902c-6b70ef83d1af", "metadata": {}, "outputs": [], "source": [ "array11" ] }, { "cell_type": "code", "execution_count": null, "id": "aa96b613-cef4-475e-b88c-f6ee8194ef4c", "metadata": {}, "outputs": [], "source": [ "# 滚动数组元素\n", "np.roll(array11, 2)" ] }, { "cell_type": "code", "execution_count": null, "id": "bfeed7b4-d835-4a1a-9858-d27d4bc363c3", "metadata": {}, "outputs": [], "source": [ "np.roll(array11, -2)" ] }, { "cell_type": "code", "execution_count": null, "id": "52b9da9d-dd2b-4fea-984c-6d4b94efedab", "metadata": {}, "outputs": [], "source": [ "np.roll(array10, 2)" ] }, { "cell_type": "code", "execution_count": null, 
"id": "53aa8382-51e2-4634-bb9f-4ff84ddaef60", "metadata": {}, "outputs": [], "source": [ "np.roll(array10, 2, axis=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "5f2de13b-2773-4a87-a854-ee9486dfcc0d", "metadata": {}, "outputs": [], "source": [ "array12 = np.arange(1, 10).reshape((3, 3))\n", "array12" ] }, { "cell_type": "code", "execution_count": null, "id": "3808483f-2811-45fd-adbb-a10bcc9d7dc6", "metadata": {}, "outputs": [], "source": [ "np.roll(array12, 2, axis=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "7f6d8d16-2e35-4926-ba13-d5e9cf77660d", "metadata": {}, "outputs": [], "source": [ "np.roll(array12, 1, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "29b76e3a-8414-4658-b0c6-7795f2186fbe", "metadata": {}, "outputs": [], "source": [ "# 替换数组元素\n", "np.put(array11, [1, 3, 5, 7], [33, 88])\n", "array11" ] }, { "cell_type": "code", "execution_count": null, "id": "eb4653ef-5c5a-4d3c-adb2-0f10d79ca6c6", "metadata": {}, "outputs": [], "source": [ "np.place(array11, array11 > 50, [44, 99])\n", "array11" ] }, { "cell_type": "code", "execution_count": null, "id": "03bcc628-1471-44ef-ad56-88468c08548d", "metadata": {}, "outputs": [], "source": [ "guido_image = plt.imread('res/guido.jpg')\n", "guido_image.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "16fa2543-195b-47d6-8e29-2682800f91aa", "metadata": {}, "outputs": [], "source": [ "plt.imshow(np.flip(guido_image, axis=0))" ] }, { "cell_type": "code", "execution_count": null, "id": "259830fa-6cd3-43be-bffe-560050f795b9", "metadata": {}, "outputs": [], "source": [ "plt.imshow(np.flip(guido_image, axis=1))" ] }, { "cell_type": "code", "execution_count": null, "id": "6059087d-85ba-4151-9af0-8870876a6b1a", "metadata": {}, "outputs": [], "source": [ "plt.imshow(np.flip(guido_image, axis=2))" ] }, { "cell_type": "code", "execution_count": null, "id": "70ebbf72-87c9-41d8-8c46-f1eccb531206", "metadata": {}, "outputs": [], "source": [ 
"plt.imshow(guido_image)" ] }, { "cell_type": "code", "execution_count": null, "id": "015548b9-819c-49ba-a13d-7777662a7414", "metadata": {}, "outputs": [], "source": [ "plt.imshow(guido_image.swapaxes(0, 1))" ] }, { "cell_type": "markdown", "id": "7c4c00de-f16f-4aac-ae37-e0b9df0eb6c2", "metadata": {}, "source": [ "#### 普通函数矢量化" ] }, { "cell_type": "code", "execution_count": null, "id": "fa07c8bc-9558-4d2c-b12c-b0e3814cbb48", "metadata": {}, "outputs": [], "source": [ "# 面试官：讲一讲Python语言中的装饰器\n", "# 用一个函数去装饰另一个函数或者一个类并为其提供额外的能力（横切关注功能）" ] }, { "cell_type": "code", "execution_count": null, "id": "02e4d3e6-9bc7-462e-8be4-800a1dcdc632", "metadata": {}, "outputs": [], "source": [ "# 面试题：写一个装饰器，如果原函数返回字符串，那么将字符串每个单词首字母大写\n", "from functools import wraps\n", "\n", "\n", "def titlize_str(func):\n", "\n", " @wraps(func)\n", " def wrapper(*args, **kwargs):\n", " result = func(*args, **kwargs)\n", " if isinstance(result, str):\n", " result = result.title()\n", " return result\n", "\n", " return wrapper" ] }, { "cell_type": "code", "execution_count": null, "id": "e798956c-d543-4a50-a12b-f7314b98bf40", "metadata": {}, "outputs": [], "source": [ "@titlize_str\n", "def say_hello(name):\n", " return 'hello, ' + name" ] }, { "cell_type": "code", "execution_count": null, "id": "82145cd0-bcb2-44b8-90ef-ac461e2120bd", "metadata": {}, "outputs": [], "source": [ "# 如果不使用@语法糖（便捷语法），也可以通过下面的方式应用装饰器\n", "# say_hello = titlize_str(say_hello)\n", "# say_hello('tom')" ] }, { "cell_type": "code", "execution_count": null, "id": "108ca8a5-ec63-458b-b95b-df759b68cb51", "metadata": {}, "outputs": [], "source": [ "say_hello('tom')" ] }, { "cell_type": "code", "execution_count": null, "id": "3d846bc7-718f-4344-b90e-c7803a104887", "metadata": {}, "outputs": [], "source": [ "# 获取原函数\n", "say_hello = say_hello.__wrapped__\n", "say_hello('tom')" ] }, { "cell_type": "code", "execution_count": null, "id": "ea620684-3375-4876-b097-49e81bf225c9", "metadata": {}, "outputs": [], "source": [ "# 
优化代码的执行性能：空间换时间\n", "from functools import lru_cache\n", "\n", "\n", "@lru_cache(maxsize=128)\n", "def fib(n):\n", " \"\"\"获取第n个斐波那契数\"\"\"\n", " if n in (1, 2):\n", " return 1\n", " return fib(n - 1) + fib(n - 2)" ] }, { "cell_type": "code", "execution_count": null, "id": "9f72372e-9cc9-4751-8ee7-e8d9f40727bd", "metadata": {}, "outputs": [], "source": [ "for i in range(1, 121):\n", " print(i, fib(i))" ] }, { "cell_type": "code", "execution_count": 26, "id": "29d231a2-57cd-4786-85cd-1366f5378185", "metadata": {}, "outputs": [], "source": [ "# 通过vectorize装饰器将普通函数做矢量化处理\n", "@np.vectorize\n", "def fac(n):\n", " if n == 0:\n", " return 1\n", " return n * fac(n - 1)" ] }, { "cell_type": "code", "execution_count": 27, "id": "7c04cc06-ba86-4b1b-a4e1-75e4527f6dde", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2, 3, 4, 5, 6, 7, 8])" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp = np.arange(1, 9)\n", "temp" ] }, { "cell_type": "code", "execution_count": 28, "id": "38e8026a-6e75-4e1b-a646-98c86736797f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 2, 6, 24, 120, 720, 5040, 40320])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fac(temp)" ] }, { "cell_type": "code", "execution_count": 29, "id": "4ac8bb35-87f0-44d0-930e-3fc0c5fb63c3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([26, 68, 73, 33, 64, 54, 26, 40, 60, 36]),\n", " array([37, 56, 65, 30, 57, 36, 61, 54, 34, 52]))" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x1 = np.random.randint(20, 80, 10)\n", "x2 = np.random.randint(30, 70, 10)\n", "x1, x2" ] }, { "cell_type": "code", "execution_count": 30, "id": "4baced9b-fee2-4c2b-b7ca-dc5914b120b9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 4, 1, 3, 1, 18, 1, 2, 2, 4])" ] }, "execution_count": 30, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "from math import gcd, lcm\n", "\n", "gcd = np.vectorize(gcd)\n", "gcd(x1, x2)" ] }, { "cell_type": "code", "execution_count": null, "id": "08c6a075-8b61-4716-92c5-126cd78108c1", "metadata": {}, "outputs": [], "source": [ "lcm = np.vectorize(lcm)\n", "lcm(x1, x2)" ] }, { "cell_type": "markdown", "id": "859bba4b-a0cf-4140-a8de-b2f3fffcd355", "metadata": {}, "source": [ "### 广播机制\n", "\n", "两个形状（shape属性）不一样的数组如果要做运算，要先通过广播机制使其形状一样才能运算。
\n", "如果要执行广播机制使得两个数组形状一样，需要满足以下两个条件其中一个：\n", "\n", "1. 两个数组后缘维度（shape属性从后往前看对应的部分）相同。\n", "2. 两个数组后缘维度不同，但是其中一方的长度为1。" ] }, { "cell_type": "code", "execution_count": null, "id": "3339c56a-b68c-401e-a27c-134be60ccf14", "metadata": {}, "outputs": [], "source": [ "temp1 = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3]])\n", "temp2 = np.array([1, 2, 3])" ] }, { "cell_type": "code", "execution_count": null, "id": "4ecc9498-792c-4de1-a120-585003b19087", "metadata": {}, "outputs": [], "source": [ "temp1 + temp2" ] }, { "cell_type": "code", "execution_count": null, "id": "2cb20c46-3f87-4346-80ab-7e2786ca1475", "metadata": {}, "outputs": [], "source": [ "temp3 = np.array([[1], [2], [3], [4]])" ] }, { "cell_type": "code", "execution_count": null, "id": "58932222-bb83-43b7-8cef-b9574898dbb5", "metadata": {}, "outputs": [], "source": [ "temp1 + temp3" ] }, { "cell_type": "code", "execution_count": null, "id": "74b6c376-05ac-4049-a4b9-5f0614de780e", "metadata": {}, "outputs": [], "source": [ "temp4 = np.array([1 ,2, 3])\n", "temp5 = np.array([[3], [2], [1]])" ] }, { "cell_type": "code", "execution_count": null, "id": "eefe6354-4614-4dfb-aa94-3d146b770b3b", "metadata": {}, "outputs": [], "source": [ "temp4.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "04dd44a5-6bea-41eb-aad0-b0a18d64a512", "metadata": {}, "outputs": [], "source": [ "temp5.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "764eb0c3-991d-4a5f-a4eb-7da2c0b445bd", "metadata": {}, "outputs": [], "source": [ "temp4 + temp5" ] }, { "cell_type": "markdown", "id": "8f1022cb-c07a-4149-aafd-9f53d235da4f", "metadata": {}, "source": [ "### 矩阵" ] }, { "cell_type": "code", "execution_count": null, "id": "8a16c29a-088a-473b-b856-3dbd6660f9cd", "metadata": {}, "outputs": [], "source": [ "m1 = np.array([[1, 0, 2], [-1, 3, 1]])\n", "m2 = np.array([[3, 1], [2, 1], [1, 0]])" ] }, { "cell_type": "code", "execution_count": null, "id": "74a54196-ca95-4425-9212-62869795aed7", "metadata": {},
"outputs": [], "source": [ "m1.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "1afbceea-782d-49c9-b6d4-d0848f3fdd99", "metadata": {}, "outputs": [], "source": [ "m2.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "bc65a37a-84eb-4aac-b63f-5625a0c15a3a", "metadata": {}, "outputs": [], "source": [ "m1 @ m2" ] }, { "cell_type": "code", "execution_count": null, "id": "67be1c96-a20c-4862-87cb-8579d3a303f5", "metadata": {}, "outputs": [], "source": [ "np.matmul(m1, m2)" ] }, { "cell_type": "markdown", "id": "5fd602a8-61fc-4c5c-94a6-a930bcf6fb2f", "metadata": {}, "source": [ "$$\n", "\\begin{cases}\n", "x_1 + 2x_2 + x_3 = 8 \\\\\n", "3x_1 + 7x_2 + 2x_3 = 23 \\\\\n", "2x_1 + 2x_2 + x_3 = 9\n", "\\end{cases}\n", "$$" ] }, { "cell_type": "markdown", "id": "d41b4856-79c3-4f48-9d8e-58c8d6045884", "metadata": {}, "source": [ "$$\n", "\\boldsymbol{A} = \\begin{bmatrix}\n", "1 & 2 & 1\\\\\n", "3 & 7 & 2\\\\\n", "2 & 2 & 1\n", "\\end{bmatrix}, \\quad\n", "\\boldsymbol{x} = \\begin{bmatrix}\n", "x_1 \\\\\n", "x_2\\\\\n", "x_3\n", "\\end{bmatrix}, \\quad\n", "\\boldsymbol{b} = \\begin{bmatrix}\n", "8 \\\\\n", "23\\\\\n", "9\n", "\\end{bmatrix}\n", "$$" ] }, { "cell_type": "code", "execution_count": null, "id": "edb85b44-50b3-4115-8539-5023a19bb2a1", "metadata": {}, "outputs": [], "source": [ "m3 = np.arange(1, 10, dtype='f8').reshape(3, 3)\n", "m3[-1, -1] = 8\n", "m3" ] }, { "cell_type": "code", "execution_count": null, "id": "0b90dbf4-05f7-43ad-a37d-20d48d03dd3a", "metadata": {}, "outputs": [], "source": [ "# 计算矩阵的秩\n", "np.linalg.matrix_rank(m3)" ] }, { "cell_type": "code", "execution_count": null, "id": "a442ab40-97a9-4cf5-8d61-bfcaa3561655", "metadata": {}, "outputs": [], "source": [ "# 逆矩阵 - 奇异矩阵不能求逆矩阵\n", "# LinAlgError: Singular matrix\n", "np.linalg.inv(m3)" ] }, { "cell_type": "code", "execution_count": null, "id": "19be7ff8-6a71-40ad-8e37-20008c56be7a", "metadata": {}, "outputs": [], "source": [ "# 
有唯一解的条件：系数矩阵的秩等于增广矩阵的秩，同时跟未知数的个数相同。\n", "# 秩（rank）：线性无关的行或者列的数量。\n", "# 线性相关：一个向量可以通过其他向量做线性组合（数乘和加法）得到，那么它们就是线性相关的。" ] }, { "cell_type": "code", "execution_count": null, "id": "87043db3-e2bd-4a70-950a-d74163afc4d1", "metadata": {}, "outputs": [], "source": [ "A = np.array([[1, 2, 1], [3, 7, 2], [2, 2, 1]])\n", "b = np.array([8, 23, 9]).reshape(-1, 1)" ] }, { "cell_type": "code", "execution_count": null, "id": "479d272b-2b46-4374-93f3-54e13af52d59", "metadata": {}, "outputs": [], "source": [ "# 系数矩阵的秩\n", "np.linalg.matrix_rank(A)" ] }, { "cell_type": "code", "execution_count": null, "id": "faa35591-09da-4232-9b38-72ee2fb824cc", "metadata": {}, "outputs": [], "source": [ "# 增广矩阵的秩\n", "np.linalg.matrix_rank(np.hstack((A, b)))" ] }, { "cell_type": "code", "execution_count": null, "id": "e8d02dee-7d86-4f9d-8e33-77b5a76784a3", "metadata": {}, "outputs": [], "source": [ "# 解线性方程组\n", "np.linalg.solve(A, b)" ] }, { "cell_type": "markdown", "id": "336ee288-5be1-41e5-89cb-e22f465efdd2", "metadata": {}, "source": [ "$$\n", "A \\cdot x = b\n", "$$\n", "$$\n", "A^{-1} \\cdot A \\cdot x = A^{-1} \\cdot b\n", "$$\n", "$$\n", "I \\cdot x = A^{-1} \\cdot b\n", "$$\n", "$$\n", "x = A^{-1} \\cdot b\n", "$$" ] }, { "cell_type": "code", "execution_count": null, "id": "7c2bd2fd-8867-4dad-8bbc-b5b175f690b7", "metadata": {}, "outputs": [], "source": [ "# 通过逆矩阵解线性方程组\n", "np.linalg.inv(A) @ b" ] }, { "cell_type": "markdown", "id": "b876a47b-a2ab-497a-b564-69750ddb8666", "metadata": {}, "source": [ "#### 补充 - 用矩阵运算实现图像处理" ] }, { "cell_type": "code", "execution_count": null, "id": "95e138b5-ed1d-40c3-acbc-821b6ae8cf41", "metadata": {}, "outputs": [], "source": [ "# 安装opencv库\n", "# %pip install opencv-python" ] }, { "cell_type": "code", "execution_count": null, "id": "669c59e5-a477-4bf3-b87b-7486e9d2e9ef", "metadata": {}, "outputs": [], "source": [ "def basic_matrix(translation):\n", " \"\"\"基础变换矩阵\"\"\"\n", " return np.array([[1, 0, translation[0]], [0, 1, translation[1]], [0, 0, 1]])"
] }, { "cell_type": "code", "execution_count": null, "id": "0338ad44-3142-428f-b1b6-45e691962900", "metadata": {}, "outputs": [], "source": [ "import copy\n", "\n", "def adjust_transform_for_image(img, trans_matrix):\n", " \"\"\"根据图像调整变换矩阵\"\"\"\n", " height, width, *_ = img.shape\n", " center = np.array([0.5 * width, 0.5 * height])\n", " return basic_matrix(center) @ trans_matrix @ basic_matrix(-center)" ] }, { "cell_type": "code", "execution_count": null, "id": "368578be-3f94-4a37-be51-c6a5dbce1512", "metadata": {}, "outputs": [], "source": [ "import cv2\n", "\n", "def apply_transform(img, transform, border_value=(204, 204, 204)):\n", " \"\"\"仿射变换\"\"\"\n", " return cv2.warpAffine(\n", " img,\n", " transform[:2, :],\n", " dsize=(img.shape[1], img.shape[0]),\n", " flags=cv2.INTER_LINEAR,\n", " borderMode=cv2.BORDER_CONSTANT,\n", " borderValue=border_value\n", " )" ] }, { "cell_type": "code", "execution_count": null, "id": "28b22755-4aed-4cd2-a3a2-57bd73f8eb00", "metadata": {}, "outputs": [], "source": [ "def apply(img, trans_matrix):\n", " \"\"\"应用变换\"\"\"\n", " temp_matrix = adjust_transform_for_image(img, trans_matrix)\n", " out_img = apply_transform(img, temp_matrix)\n", " return out_img" ] }, { "cell_type": "code", "execution_count": null, "id": "1526fbcd-ad60-4d7c-892c-5c0ae2b7ef3f", "metadata": {}, "outputs": [], "source": [ "def scale(img, x_ratio, y_ratio):\n", " \"\"\"缩放\"\"\"\n", " scale_matrix = np.array([\n", " [x_ratio, 0, 0], \n", " [0, y_ratio, 0], \n", " [0, 0, 1]\n", " ])\n", " return apply(img, scale_matrix)" ] }, { "cell_type": "code", "execution_count": null, "id": "5e443448-9b6c-488b-83a5-0de7ac4fca18", "metadata": {}, "outputs": [], "source": [ "def rotate(img, degree):\n", " \"\"\"旋转\"\"\"\n", " rad = np.deg2rad(degree)\n", " rotate_matrix = np.array([\n", " [np.cos(rad), -np.sin(rad), 0], \n", " [np.sin(rad), np.cos(rad), 0], \n", " [0, 0, 1]\n", " ])\n", " return apply(img, rotate_matrix)" ] }, { "cell_type": "code", "execution_count": 
null, "id": "2a09187e-2707-49c2-8b8d-84ffa28b6f7e", "metadata": {}, "outputs": [], "source": [ "def transvect(img, ratio):\n", " \"\"\"剪切影射\"\"\"\n", " transvect_matrix = np.array([\n", " [1, ratio, 0],\n", " [0, 1, 0],\n", " [0, 0, 1]\n", " ])\n", " return apply(img, transvect_matrix)" ] }, { "cell_type": "code", "execution_count": null, "id": "9561e506-3b64-49e6-9e9d-8731a2951646", "metadata": {}, "outputs": [], "source": [ "scaled_img = scale(guido_image, 1.25, 0.75)\n", "plt.imshow(scaled_img)" ] }, { "cell_type": "code", "execution_count": null, "id": "56cb960b-b3f9-4109-95ba-4388a2b1762e", "metadata": {}, "outputs": [], "source": [ "rotated_img = rotate(guido_image, -45)\n", "plt.imshow(rotated_img)" ] }, { "cell_type": "code", "execution_count": null, "id": "cc2ac604-0fd7-43d5-be18-407b78e46f58", "metadata": {}, "outputs": [], "source": [ "transvected_img = transvect(guido_image, -0.3)\n", "plt.imshow(transvected_img)" ] }, { "cell_type": "markdown", "id": "cd91252a-31d4-40d5-82ef-71f22f0bd39c", "metadata": {}, "source": [ "#### 补充 - 用scipy处理图像" ] }, { "cell_type": "code", "execution_count": null, "id": "9052864d-a32d-4cd3-9e2a-8df1654b8a67", "metadata": {}, "outputs": [], "source": [ "from scipy.ndimage import gaussian_filter, sobel\n", "\n", "# 获取灰度图\n", "guido_image = plt.imread('res/guido.jpg')\n", "gray_image = np.mean(guido_image, axis=2)\n", "\n", "plt.figure(figsize=(12, 4))\n", "\n", "# 灰度图\n", "plt.subplot(1, 4, 1)\n", "plt.imshow(gray_image, cmap=plt.cm.gray)\n", "\n", "# 模糊和锐化\n", "plt.subplot(1, 4, 2)\n", "blurred_image = gaussian_filter(gray_image, 3)\n", "plt.imshow(blurred_image, cmap=plt.cm.gray)\n", "\n", "plt.subplot(1, 4, 3)\n", "filtered_image = gaussian_filter(blurred_image, 1)\n", "sharpen_image = blurred_image + 32 * (blurred_image - filtered_image)\n", "plt.imshow(sharpen_image, cmap=plt.cm.gray)\n", "\n", "# 边缘图\n", "plt.subplot(1, 4, 4)\n", "# 使用索贝尔算子（邻点灰度加权差）进行边缘检测\n", "edge_image = sobel(gray_image)\n", "plt.imshow(edge_image, 
cmap=plt.cm.gray)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "dbbc6edf-c643-4cb4-a98b-05e806413478", "metadata": {}, "outputs": [], "source": [ "from scipy.ndimage import rotate, zoom\n", "\n", "plt.figure(figsize=(12, 4))\n", "\n", "# 旋转\n", "plt.subplot(1, 3, 1)\n", "rotated_image = rotate(guido_image, -16, reshape=True)\n", "plt.imshow(rotated_image)\n", "\n", "# 旋转\n", "plt.subplot(1, 3, 2)\n", "rotated_image = rotate(guido_image, -16, reshape=False)\n", "plt.imshow(rotated_image)\n", "\n", "# 缩放\n", "plt.subplot(1, 3, 3)\n", "scaled_image = zoom(guido_image, zoom=(0.8, 1.25, 1))\n", "plt.imshow(scaled_image)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "331862d6-60cd-442f-97a4-bbbfbabfc3fa", "metadata": {}, "source": [ "#### 补充 - 视频流人脸识别" ] }, { "cell_type": "code", "execution_count": null, "id": "383fa323-db82-4a8b-a13b-9cbe9b8c48d3", "metadata": {}, "outputs": [], "source": [ "# 安装face_recognition库\n", "# %pip install face_recognition" ] }, { "cell_type": "code", "execution_count": null, "id": "9c1cf7b8-9d89-43d4-8954-616bce780183", "metadata": {}, "outputs": [], "source": [ "import cv2\n", "import face_recognition\n", "# from PIL import Image\n", "\n", "plt.figure(figsize=(12, 8))\n", "\n", "image = face_recognition.load_image_file('res/Solvay.jpg')\n", "locations = face_recognition.face_locations(image)\n", "for location in locations:\n", " top, right, bottom, left = location\n", " # Image.fromarray(image[top:bottom, left:right]).show()\n", " cv2.rectangle(image, (left, top), (right, bottom), (255, 0, 0), 2)\n", "plt.imshow(image)" ] }, { "cell_type": "code", "execution_count": null, "id": "fa76d9c2-c8c3-4cec-8c91-b6b54fe6e32e", "metadata": {}, "outputs": [], "source": [ "# import cv2\n", "# import face_recognition\n", "# import numpy as np\n", "\n", "# # 获取摄像头\n", "# video_capture = cv2.VideoCapture(0)\n", "\n", "# # 加载图片获取脸部特征\n", "# obama_image = 
face_recognition.load_image_file(\"res/obama.jpg\")\n", "# obama_face_encoding = face_recognition.face_encodings(obama_image)[0]\n", "# luohao_image = face_recognition.load_image_file(\"res/luohao.png\")\n", "# luohao_face_encoding = face_recognition.face_encodings(luohao_image)[0]\n", "# guido_image = face_recognition.load_image_file(\"res/guido.jpg\")\n", "# guido_face_encoding = face_recognition.face_encodings(guido_image)[0]\n", "\n", "# # 保存脸部特征和对应的名字\n", "# known_face_encodings = [\n", "# obama_face_encoding,\n", "# luohao_face_encoding,\n", "# guido_face_encoding\n", "# ]\n", "# known_face_names = [\n", "# \"Barack\",\n", "# \"Hao\",\n", "# \"Guido\"\n", "# ]\n", "\n", "# face_locations = []\n", "# face_encodings = []\n", "# face_names = []\n", "# process_this_frame = True\n", "\n", "# while True:\n", "# # 从视频中读取一帧数据\n", "# ret, frame = video_capture.read()\n", "\n", "# # 调整为原始尺寸的四分之一（加速处理）\n", "# small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)\n", "\n", "# # BGR转成RGB\n", "# rgb_small_frame = small_frame[:, :, ::-1]\n", "\n", "# if process_this_frame:\n", "# # 找到所有的人脸位置和脸部特征保存在列表中\n", "# face_locations = face_recognition.face_locations(rgb_small_frame)\n", "# face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)\n", "\n", "# face_names = []\n", "# for face_encoding in face_encodings:\n", "# # 比较脸部特征\n", "# matches = face_recognition.compare_faces(known_face_encodings, face_encoding)\n", "# name = \"Unknown\"\n", "\n", "# # 通过距离判定最佳匹配并获取对应的名字\n", "# face_distances = face_recognition.face_distance(known_face_encodings, face_encoding)\n", "# best_match_index = np.argmin(face_distances)\n", "# if matches[best_match_index]:\n", "# name = known_face_names[best_match_index]\n", "\n", "# face_names.append(name)\n", "\n", "# process_this_frame = not process_this_frame\n", "\n", "# # 显示结果\n", "# for (top, right, bottom, left), name in zip(face_locations, face_names):\n", "# # 恢复正常的尺寸\n", "# top, right, bottom, left = top * 4, 
right * 4, bottom * 4, left * 4\n", "# # 绘制一个标识人脸的矩形框\n", "# cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)\n", "# # 绘制一个填写名字的矩形框\n", "# cv2.rectangle(frame, (left, bottom - 35), (right, bottom), (0, 0, 255), cv2.FILLED)\n", "# # 绘制识别出的人脸对应的名字\n", "# cv2.putText(frame, name, (left + 6, bottom - 6), cv2.FONT_HERSHEY_DUPLEX, 1.0, (255, 255, 255), 1)\n", " \n", "# cv2.imshow('Video', frame)\n", " \n", "# # 按键盘上的q键退出窗口 \n", "# if cv2.waitKey(1) & 0xFF == ord('q'):\n", "# break\n", "\n", "# video_capture.release()\n", "# cv2.destroyAllWindows()" ] }, { "cell_type": "markdown", "id": "b6989fb7-a3ab-4889-80be-f4ecce033e5c", "metadata": {}, "source": [ "### 多项式" ] }, { "cell_type": "code", "execution_count": null, "id": "0c719ca9-de74-4c2e-aaf2-d0a9c833f512", "metadata": {}, "outputs": [], "source": [ "# Older NumPy versions represent polynomials with poly1d (coefficients from the highest power down)\n", "p1 = np.poly1d([3, 0, 2, 1])\n", "p2 = np.poly1d([1, 2, 3])\n", "print(p1)\n", "print(p2)" ] }, { "cell_type": "code", "execution_count": null, "id": "e6aab279-fa9f-4d4e-b597-b3c51b478777", "metadata": {}, "outputs": [], "source": [ "# Polynomial addition\n", "print(p1 + p2)" ] }, { "cell_type": "code", "execution_count": null, "id": "375a1989-7231-432c-bf33-06fce490cacf", "metadata": {}, "outputs": [], "source": [ "# Polynomial multiplication\n", "print(p1 * p2)" ] }, { "cell_type": "code", "execution_count": null, "id": "4df5662a-c0bc-434d-b53f-623e09b51a76", "metadata": {}, "outputs": [], "source": [ "# Evaluate the polynomial at x = 2\n", "p2(2)" ] }, { "cell_type": "code", "execution_count": null, "id": "df150df6-9904-45b7-8704-dbb033552dcb", "metadata": {}, "outputs": [], "source": [ "# Derivative\n", "print(p1.deriv())" ] }, { "cell_type": "code", "execution_count": null, "id": "d2d3002c-59d6-4e00-b00f-69f3f6ac6609", "metadata": {}, "outputs": [], "source": [ "# Indefinite integral\n", "print(p1.integ())" ] }, { "cell_type": "code", "execution_count": null, "id": "0965b9e9-42fc-425c-90cb-d265e3f6e42f", "metadata": {}, "outputs": [], "source": [ "p3 = np.poly1d([1, 3, 2])\n", "print(p3)\n", "# 
Set the polynomial equal to 0 and solve for x\n", "print(p3.roots)" ] }, { "cell_type": "code", "execution_count": null, "id": "c5bda2d2-0d61-46db-ac48-0f7ec3438660", "metadata": {}, "outputs": [], "source": [ "type(p3)" ] }, { "cell_type": "code", "execution_count": null, "id": "a71c4799-183e-4096-9903-7edf9464feb7", "metadata": {}, "outputs": [], "source": [ "from numpy.polynomial import Polynomial\n", "\n", "# Newer NumPy versions represent polynomials with Polynomial; coefficients run from the constant term up,\n", "# so [1, 2, 0, 3] is the same 3x^3 + 2x + 1 as the earlier np.poly1d([3, 0, 2, 1]). NOTE: this rebinds p1.\n", "p1 = Polynomial([1, 2, 0, 3])\n", "print(p1)" ] }, { "cell_type": "code", "execution_count": null, "id": "0002c6e3-697b-425c-a17c-802e10f07981", "metadata": {}, "outputs": [], "source": [ "print(p1.deriv())" ] }, { "cell_type": "code", "execution_count": null, "id": "1f9dde21-0854-4ae6-9703-9599a4204003", "metadata": {}, "outputs": [], "source": [ "print(p1.integ())" ] }, { "cell_type": "code", "execution_count": null, "id": "eb191b28-fb2c-460b-9c87-0041b128ba16", "metadata": {}, "outputs": [], "source": [ "# Degree of the polynomial (the exponent of its highest-order term, not the term itself)\n", "p1.degree()" ] }, { "cell_type": "markdown", "id": "2d6c3103-c5b0-413f-b2de-324b0394e3ae", "metadata": {}, "source": [ "### 最小二乘解" ] }, { "cell_type": "code", "execution_count": null, "id": "7163cb46-44bb-487c-8dd2-7c530574ecc0", "metadata": {}, "outputs": [], "source": [ "# Monthly income (NOTE: rebinds x, which held the np.linspace grid used for the function plots earlier)\n", "x = np.array([3200, 4811, 5386, 5564, 6120, 6691, 6906, 7483, 7587, 7890,\n", " 8090, 8300, 8650, 8835, 8975, 9070, 9100, 9184, 9247, 9313, \n", " 9465, 9558, 9853, 9938, 10020, 10242, 10343, 10731, 10885, 10990, \n", " 11100, 11227, 11313, 11414, 11630, 11806, 11999, 12038, 12400, 12547, \n", " 12890, 13050, 13360, 13850, 14890, 14990, 15500, 16899, 17010, 19880])\n", "# Monthly online-shopping spending, paired element-wise with x\n", "y = np.array([1761, 882, 1106, 182, 1532, 1978, 2174, 2117, 2134, 1924, \n", " 2207, 2876, 2617, 2683, 3054, 3277, 3345, 3462, 3401, 3591,\n", " 3596, 3671, 3829, 3907, 3852, 4288, 4359, 4099, 4300, 4367,\n", " 5019, 4873, 4674, 5174, 4666, 5797, 5782, 5451, 5487, 5448,\n", " 6002, 6439, 6309, 6045, 5935, 6928, 7356, 6682, 6672, 6582])" ] }, { "cell_type": "code", "execution_count": 
null, "id": "a68a1d0d-0dc9-4437-bed9-6e9aff589df1", "metadata": {}, "outputs": [], "source": [ "# Qualitative analysis - scatter plot of income vs. spending\n", "plt.scatter(x, y)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "89030441-eefd-4a02-ad41-ba154dbf87b2", "metadata": {}, "outputs": [], "source": [ "from scipy import stats\n", "\n", "# Shapiro-Wilk test (normality check) on both samples\n", "stats.shapiro(x), stats.shapiro(y)" ] }, { "cell_type": "code", "execution_count": null, "id": "93573426-e1c0-4025-8b8f-2d83d69e103c", "metadata": {}, "outputs": [], "source": [ "# Quantitative analysis - correlation coefficient\n", "# Pearson correlation coefficient (standardized covariance, range [-1, 1]); it assumes:\n", "# 1. continuous values that come in pairs\n", "# 2. no outliers\n", "# 3. samples drawn from a normal population\n", "np.corrcoef(x, y)" ] }, { "cell_type": "code", "execution_count": null, "id": "4b6e47de-8e3b-4be4-a5e4-f1d95649e6f9", "metadata": {}, "outputs": [], "source": [ "# Pearson correlation coefficient computed with SciPy\n", "stats.pearsonr(x, y)" ] }, { "cell_type": "code", "execution_count": null, "id": "f8e53b1e-b2b1-4222-a69a-dcbca0041d0e", "metadata": {}, "outputs": [], "source": [ "# Lookup table: monthly income -> monthly online spending\n", "history_data = dict(zip(x, y))\n", "# Duplicate incomes would silently overwrite each other in the dict, so check that none were lost\n", "assert len(history_data) == len(x), 'duplicate incomes in x'\n", "len(history_data)" ] }, { "cell_type": "code", "execution_count": null, "id": "1530478e-ec70-4f87-828f-1d6798c53c5e", "metadata": {}, "outputs": [], "source": [ "data = np.random.randint(1, 100, 15).tolist()\n", "data" ] }, { "cell_type": "code", "execution_count": null, "id": "7d246563-86fd-418f-b058-4aae08d9b9c1", "metadata": {}, "outputs": [], "source": [ "import heapq\n", "\n", "# A heap finds the top-N elements quickly\n", "print(heapq.nsmallest(3, data))\n", "print(heapq.nlargest(5, data))" ] }, { "cell_type": "code", "execution_count": null, "id": "866f9fd9-58aa-4c03-9415-99aa236958d4", "metadata": {}, "outputs": [], "source": [ "# Goal: monthly income and online spending are strongly correlated, so spending can be predicted from income\n", "# Approach 1: for a given income, take the N records with the closest incomes and predict with their mean spending\n", "# KNN - k-nearest neighbours (find the k nearest neighbours and predict from their data)\n", "import heapq\n", "\n", "\n", "def predicate_by_knn(income, k=5):\n", "    \"\"\"Predict monthly online spending for `income` with k-nearest neighbours.\n", "\n", "    The k incomes in `history_data` closest to `income` (by squared difference)\n", "    are selected and the mean of their recorded spending is returned.\n", "\n", "    :param income: monthly income to predict for\n", "    :param k: number of neighbours to average, must be >= 1\n", "    :return: mean spending of the k neighbours, rounded to 2 decimals\n", "    :raises ValueError: if k < 1 (an empty neighbour set would silently average to nan)\n", "    \"\"\"\n", "    if k < 1:\n", "        raise ValueError('k must be a positive integer')\n", "    # `known` instead of `x`, so the lambda no longer shadows the global sample array x\n", "    nearest = heapq.nsmallest(k, history_data, key=lambda known: (known - income) 
** 2)\n", "    return np.mean([history_data[key] for key in nearest]).round(2)" ] }, { "cell_type": "code", "execution_count": null, "id": "399b9b6a-6625-4dc2-a09c-c0699c538a46", "metadata": {}, "outputs": [], "source": [ "predicate_by_knn(12800)" ] }, { "cell_type": "code", "execution_count": null, "id": "db0ee4bf-83f8-489f-a543-80854571da60", "metadata": {}, "outputs": [], "source": [ "predicate_by_knn(6800)" ] }, { "cell_type": "code", "execution_count": null, "id": "6e514003-c143-4823-91eb-57f7608f6137", "metadata": {}, "outputs": [], "source": [ "predicate_by_knn(20000, k=3)" ] }, { "cell_type": "markdown", "id": "aa44d3ad-aa3f-44db-889a-8a69b9a9be65", "metadata": {}, "source": [ "回归模型：\n", "$$ Y = aX + b $$\n", "\n", "损失函数：\n", "$$ MSE = \\frac{1} {N} \\sum (\\hat{y}_i - y_i)^2 $$" ] }, { "cell_type": "code", "execution_count": null, "id": "032eb740-ac9a-4526-9eb4-b6770907b07a", "metadata": {}, "outputs": [], "source": [ "# MSE - Mean Squared Error\n", "def get_loss(a, b):\n", "    \"\"\"Return the mean squared error of the line a * x + b against y (module-level samples).\"\"\"\n", "    y_hat = a * x + b\n", "    return np.mean((y_hat - y) ** 2)" ] }, { "cell_type": "code", "execution_count": null, "id": "0d56ccc4-5a0d-4b3a-89ab-01cf690c4fba", "metadata": {}, "outputs": [], "source": [ "# Monte Carlo simulation (random guessing of a and b)\n", "import random\n", "\n", "# Seed the generator so that Restart & Run All reproduces the same search\n", "random.seed(42)\n", "\n", "min_loss = np.inf\n", "ba, bb = None, None\n", "\n", "for _ in range(10000):\n", "    # candidate slope in [0.5, 1.0) and candidate intercept in [-2000, -1000)\n", "    a = random.random() * 0.5 + 0.5\n", "    b = random.random() * 1000 - 2000\n", "    curr_loss = get_loss(a, b)\n", "    if curr_loss < min_loss:\n", "        min_loss = curr_loss\n", "        ba, bb = a, b\n", "        print(min_loss)\n", "print(ba, bb)" ] }, { "cell_type": "code", "execution_count": null, "id": "af5f7d02-f921-4712-82e3-1ce43c86c708", "metadata": {}, "outputs": [], "source": [ "plt.scatter(x, y)\n", "plt.plot(x, ba * x + bb, color='r', linewidth=4)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "207c2da0-371f-40ea-b19f-f56cca8a7d30", "metadata": {}, "outputs": [], "source": [ "def predicate_by_regression(income):\n", "    \"\"\"Predict monthly online spending for `income` from the fitted line ba * income + bb (2 decimals).\"\"\"\n", "    return 
round(ba * income + bb, 2)" ] }, { "cell_type": "code", "execution_count": null, "id": "a9e3af32-654c-4704-9f32-cfe41f067d06", "metadata": {}, "outputs": [], "source": [ "predicate_by_regression(6800)" ] }, { "cell_type": "code", "execution_count": null, "id": "5133079a-59f8-4435-af1e-fb5bc13c457d", "metadata": {}, "outputs": [], "source": [ "predicate_by_regression(12800)" ] }, { "cell_type": "markdown", "id": "200b9821-b8c8-41df-ae8e-d96bf5049415", "metadata": {}, "source": [ "将回归模型代入损失函数：\n", "$$ f(a, b) = \\frac {1} {N} \\sum_{i=1}^{N}(y_i - (ax_i + b))^2 $$\n", "\n", "如何让$f(a, b)$取到最小值？？？\n", "\n", "求偏导数，并令其等于0。\n", "$$ \\frac {\\partial {f(a, b)}} {\\partial {a}} = \\frac {2} {N} \\sum_{i=1}^{N}(-x_iy_i + x_i^2a + x_ib) = 0 $$ \n", "$$ \\frac {\\partial {f(a, b)}} {\\partial {b}} = \\frac {2} {N} \\sum_{i=1}^{N}(-y_i + x_ia + b) = 0 $$\n", "\n", "求解得到：\n", "$$a = \\frac{\\sum(x_{i} - \\bar{x})(y_{i} - \\bar{y})}{\\sum(x_{i} - \\bar{x})^{2}}$$\n", "$$b = \\bar{y} - a\\bar{x}$$" ] }, { "cell_type": "code", "execution_count": null, "id": "9ab79db5-85c7-44ad-8ffa-2684fb499f7a", "metadata": {}, "outputs": [], "source": [ "x_bar, y_bar = np.mean(x), np.mean(y)" ] }, { "cell_type": "code", "execution_count": null, "id": "332777b5-bc3f-44d2-bd54-f8e9a19497aa", "metadata": {}, "outputs": [], "source": [ "# Closed-form least-squares solution; rebinds ba/bb, replacing the Monte Carlo estimate\n", "ba = np.dot((x - x_bar), (y - y_bar)) / np.sum((x - x_bar) ** 2)\n", "bb = y_bar - ba * x_bar\n", "ba, bb" ] }, { "cell_type": "code", "execution_count": null, "id": "795f6a40-136a-4772-9e2b-5798022e408d", "metadata": {}, "outputs": [], "source": [ "plt.scatter(x, y)\n", "plt.plot(x, ba * x + bb, color='r', linewidth=4)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "2bfac239-7bf0-4784-a124-bbfdb4cdafe8", "metadata": {}, "outputs": [], "source": [ "# Fit a linear regression model (degree-1 polynomial)\n", "np.polyfit(x, y, deg=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "9d5adc92-73e2-40df-843b-02efb28c6ae3", "metadata": {}, "outputs": [], "source": [ "# 
Fit a quadratic (degree-2) polynomial regression model\n", "# a, b, c are unpacked highest power first, matching a * x ** 2 + b * x + c in the plot cell; this rebinds a and b\n", "a, b, c = np.polyfit(x, y, deg=2)\n", "a, b, c" ] }, { "cell_type": "code", "execution_count": null, "id": "c42ee939-e4b9-479e-ad73-66e806e29c6c", "metadata": {}, "outputs": [], "source": [ "plt.scatter(x, y)\n", "plt.plot(x, a * x ** 2 + b * x + c, color='r', linewidth=4)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "3363f359-3615-4ffa-a040-41e39c83b38f", "metadata": {}, "outputs": [], "source": [ "# Same degree-1 fit through the Polynomial class; convert() maps the fitted series back to the\n", "# unscaled x domain and coef lists coefficients from lowest to highest degree (intercept first)\n", "Polynomial.fit(x, y, deg=1).convert().coef" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }