From 25830d3753084b19f8113380dce67d592524faf3 Mon Sep 17 00:00:00 2001
From: jackfrued
Date: Sat, 2 Jun 2018 13:19:54 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86=E7=88=AC=E8=99=AB?=
 =?UTF-8?q?=E9=83=A8=E5=88=86=E7=9A=84=E5=86=85=E5=AE=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Day66-75/02.数据采集和解析.md |   5 +-
 Day66-75/03.存储数据.md       |  12 ++-
 Day66-75/code/example10.py    |  33 ++++++++
 Day66-75/code/main.py         | 132 ++++++++++++++++++++++++++++++
 Day66-75/code/main_redis.py   | 150 ++++++++++++++++++++++++++++++++++
 玩转PyCharm(上).md            |   4 +-
 6 files changed, 331 insertions(+), 5 deletions(-)
 create mode 100644 Day66-75/code/example10.py
 create mode 100644 Day66-75/code/main.py
 create mode 100644 Day66-75/code/main_redis.py

diff --git a/Day66-75/02.数据采集和解析.md b/Day66-75/02.数据采集和解析.md
index a99a46d..cfe99d3 100644
--- a/Day66-75/02.数据采集和解析.md
+++ b/Day66-75/02.数据采集和解析.md
@@ -5,8 +5,9 @@
 1. Downloading data - urllib / requests / aiohttp.
 2. Parsing data - re / lxml / beautifulsoup4 (bs4) / pyquery.
 3. Caching and persistence - pymysql / redis / sqlalchemy / peewee / pymongo.
-4. Serialization and compression - pickle / json / zlib.
-5. Scheduling - processes / threads / coroutines.
+4. Generating digests - hashlib.
+5. Serialization and compression - pickle / json / zlib.
+6. Scheduling - processes / threads / coroutines.
 
 ### Analyzing HTML pages
 
diff --git a/Day66-75/03.存储数据.md b/Day66-75/03.存储数据.md
index d6f0875..b491130 100644
--- a/Day66-75/03.存储数据.md
+++ b/Day66-75/03.存储数据.md
@@ -191,8 +191,14 @@ b'admin'
 
 #### Introduction to MongoDB
 
+MongoDB is a document-oriented database management system that appeared in 2009. It is written in C++ and aims to give web applications a scalable, high-performance data storage solution. Although MongoDB is usually classified as a NoSQL product, it sits somewhere between relational and non-relational databases: among non-relational databases it is the most feature-rich and the closest to a relational database.
+
+MongoDB stores data as documents; a document consists of a series of key-value pairs and resembles a JSON object. MongoDB currently supports many platforms, including Windows, macOS, Linux and Solaris, and ships drivers for a variety of programming languages, Python among them.
+
 #### Installing and configuring MongoDB
 
+
+
 #### Implementing CRUD operations with MongoDB
 
 
@@ -226,13 +232,15 @@ def main():
     # Create a BeautifulSoup object and specify lxml as the parser
     soup = BeautifulSoup(resp.text, 'lxml')
     href_regex = re.compile(r'^/question')
+    # Turn URLs into SHA1 digests (fixed length, more compact)
+    hasher_proto = sha1()
     # Find all <a> tags whose href attribute starts with /question
     for a_tag in soup.find_all('a', {'href': href_regex}):
         # Get the href attribute of the <a> tag and build the full URL
         href = a_tag.attrs['href']
         full_url = urljoin(base_url, href)
-        # Turn URLs into SHA1 digests (fixed length, more compact)
-        hasher = sha1()
+        # Feed the URL into the hasher to produce its SHA1 digest
+        hasher = hasher_proto.copy()
         hasher.update(full_url.encode('utf-8'))
         field_key = hasher.hexdigest()
         # If the hash under the Redis key 'zhihu' has no entry for this digest, fetch the page and cache it
diff --git a/Day66-75/code/example10.py b/Day66-75/code/example10.py
new file mode 100644
index 0000000..3204c72
--- /dev/null
+++ b/Day66-75/code/example10.py
@@ -0,0 +1,33 @@
+import requests
+from bs4 import BeautifulSoup
+# selenium is an automated testing tool; it can drive a real browser
+# and therefore load pages the way a user's browser would
+from selenium import webdriver
+
+
+def main():
+    # Download chromedriver first and put the executable on the PATH
+    # Create a Chrome browser instance
+    driver = webdriver.Chrome()
+    # Load the page through the browser (dynamically generated content is rendered)
+    driver.get('https://www.taobao.com/markets/mm/mm2017')
+    # driver.page_source contains the content created dynamically by JavaScript
+    soup = BeautifulSoup(driver.page_source, 'lxml')
+    all_images = soup.select('img[src]')
+    for image in all_images:
+        url = image.get('src')
+        try:
+            if not str(url).startswith('http'):
+                url = 'http:' + url
+            filename = url[url.rfind('/') + 1:]
+            print(filename)
+            resp = requests.get(url)
+            with open('c:/images/' + filename, 'wb') as f:
+                f.write(resp.content)
+        except OSError:
+            print(filename + ' download failed!')
+    print('Image download finished!')
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
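Note: the "Implementing CRUD operations with MongoDB" section added above is still empty. Below is a minimal sketch of the kind of example it could eventually hold, assuming a MongoDB server on the default local port; the database and collection names (`zhihu`, `webpages`) are placeholders, not names taken from the patch.

```Python
from pymongo import MongoClient

# Assumption: a MongoDB server is running locally on the default port
client = MongoClient('mongodb://127.0.0.1:27017')
db = client.zhihu                # hypothetical database name
pages_coll = db.webpages         # hypothetical collection name

# Create - insert a document (a set of key-value pairs, much like a JSON object)
result = pages_coll.insert_one({'url': 'http://www.zhihu.com/', 'title': 'Discover'})
print(result.inserted_id)

# Read - fetch a single matching document or iterate over all of them
print(pages_coll.find_one({'url': 'http://www.zhihu.com/'}))
for doc in pages_coll.find():
    print(doc)

# Update - modify fields of the matching document
pages_coll.update_one({'url': 'http://www.zhihu.com/'},
                      {'$set': {'title': 'Discover - Zhihu'}})

# Delete - remove the matching document
pages_coll.delete_one({'url': 'http://www.zhihu.com/'})
```

The methods `insert_one` / `find_one` / `update_one` / `delete_one` map directly onto the CRUD verbs, which is why pymongo appears in the "caching and persistence" step of the pipeline listed above.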
diff --git a/Day66-75/code/main.py b/Day66-75/code/main.py
new file mode 100644
index 0000000..571a14d
--- /dev/null
+++ b/Day66-75/code/main.py
@@ -0,0 +1,132 @@
+from enum import Enum, unique
+from queue import Queue
+from random import random
+from threading import Thread, current_thread
+from time import sleep
+from urllib.parse import urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+
+@unique
+class SpiderStatus(Enum):
+    IDLE = 0
+    WORKING = 1
+
+
+def decode_page(page_bytes, charsets=('utf-8',)):
+    page_html = None
+    for charset in charsets:
+        try:
+            page_html = page_bytes.decode(charset)
+            break
+        except UnicodeDecodeError:
+            pass
+    return page_html
+
+
+class Retry(object):
+
+    def __init__(self, *, retry_times=3,
+                 wait_secs=5, errors=(Exception, )):
+        self.retry_times = retry_times
+        self.wait_secs = wait_secs
+        self.errors = errors
+
+    def __call__(self, fn):
+
+        def wrapper(*args, **kwargs):
+            for _ in range(self.retry_times):
+                try:
+                    return fn(*args, **kwargs)
+                except self.errors as e:
+                    print(e)
+                    sleep((random() + 1) * self.wait_secs)
+            return None
+
+        return wrapper
+
+
+class Spider(object):
+
+    def __init__(self):
+        self.status = SpiderStatus.IDLE
+
+    @Retry()
+    def fetch(self, current_url, *, charsets=('utf-8', ),
+              user_agent=None, proxies=None):
+        thread_name = current_thread().name
+        print(f'[{thread_name}]: {current_url}')
+        headers = {'user-agent': user_agent} if user_agent else {}
+        resp = requests.get(current_url,
+                            headers=headers, proxies=proxies)
+        return decode_page(resp.content, charsets) \
+            if resp.status_code == 200 else None
+
+    def parse(self, html_page, *, domain='m.sohu.com'):
+        soup = BeautifulSoup(html_page, 'lxml')
+        url_links = []
+        for a_tag in soup.body.select('a[href]'):
+            parser = urlparse(a_tag.attrs['href'])
+            scheme = parser.scheme or 'http'
+            netloc = parser.netloc or domain
+            if scheme != 'javascript' and netloc == domain:
+                path = parser.path
+                query = '?' + parser.query if parser.query else ''
+                full_url = f'{scheme}://{netloc}{path}{query}'
+                if full_url not in visited_urls:
+                    url_links.append(full_url)
+        return url_links
+
+    def extract(self, html_page):
+        pass
+
+    def store(self, data_dict):
+        pass
+
+
+class SpiderThread(Thread):
+
+    def __init__(self, name, spider, tasks_queue):
+        super().__init__(name=name, daemon=True)
+        self.spider = spider
+        self.tasks_queue = tasks_queue
+
+    def run(self):
+        while True:
+            current_url = self.tasks_queue.get()
+            visited_urls.add(current_url)
+            self.spider.status = SpiderStatus.WORKING
+            html_page = self.spider.fetch(current_url)
+            if html_page not in [None, '']:
+                url_links = self.spider.parse(html_page)
+                for url_link in url_links:
+                    self.tasks_queue.put(url_link)
+            self.spider.status = SpiderStatus.IDLE
+
+
+def is_any_alive(spider_threads):
+    return any([spider_thread.spider.status == SpiderStatus.WORKING
+                for spider_thread in spider_threads])
+
+
+visited_urls = set()
+
+
+def main():
+    task_queue = Queue()
+    task_queue.put('http://m.sohu.com/')
+    spider_threads = [SpiderThread('thread-%d' % i, Spider(), task_queue)
+                      for i in range(10)]
+    for spider_thread in spider_threads:
+        spider_thread.start()
+
+    while not task_queue.empty() or is_any_alive(spider_threads):
+        pass
+
+    print('Over!')
+
+
+if __name__ == '__main__':
+    main()
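Note: `main()` in `main.py` above waits for the crawl to finish by spinning in `while not task_queue.empty() or is_any_alive(spider_threads): pass`, which keeps one CPU core busy doing nothing and can also exit early if a worker has just taken the last URL but not yet flipped its status to WORKING. Below is a sketch of an alternative shutdown scheme built on the `queue` module's own bookkeeping (`task_done()` / `join()`); the worker body is a placeholder, not the `Spider` logic from the patch.

```Python
from queue import Queue
from threading import Thread


def worker(tasks_queue):
    while True:
        url = tasks_queue.get()
        try:
            print('processing', url)  # fetching/parsing would go here
        finally:
            # Mark the item as finished even if processing raised an exception
            tasks_queue.task_done()


def main():
    tasks_queue = Queue()
    tasks_queue.put('http://m.sohu.com/')
    for i in range(10):
        # Daemon threads exit automatically once the main thread returns
        Thread(target=worker, args=(tasks_queue,), daemon=True).start()
    # Blocks until every queued item has been marked done - no busy loop
    tasks_queue.join()
    print('Over!')


if __name__ == '__main__':
    main()
```

Every `put()` increments the queue's unfinished-task counter and every `task_done()` decrements it, so `join()` returns exactly when all discovered URLs have been processed.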
diff --git a/Day66-75/code/main_redis.py b/Day66-75/code/main_redis.py
new file mode 100644
index 0000000..b73a6bc
--- /dev/null
+++ b/Day66-75/code/main_redis.py
@@ -0,0 +1,150 @@
+import pickle
+import zlib
+from enum import Enum, unique
+from hashlib import sha1
+from random import random
+from threading import Thread, current_thread
+from time import sleep
+from urllib.parse import urlparse
+
+import pymongo
+import redis
+import requests
+from bs4 import BeautifulSoup
+from bson import Binary
+
+
+@unique
+class SpiderStatus(Enum):
+    IDLE = 0
+    WORKING = 1
+
+
+def decode_page(page_bytes, charsets=('utf-8',)):
+    page_html = None
+    for charset in charsets:
+        try:
+            page_html = page_bytes.decode(charset)
+            break
+        except UnicodeDecodeError:
+            pass
+    return page_html
+
+
+class Retry(object):
+
+    def __init__(self, *, retry_times=3,
+                 wait_secs=5, errors=(Exception, )):
+        self.retry_times = retry_times
+        self.wait_secs = wait_secs
+        self.errors = errors
+
+    def __call__(self, fn):
+
+        def wrapper(*args, **kwargs):
+            for _ in range(self.retry_times):
+                try:
+                    return fn(*args, **kwargs)
+                except self.errors as e:
+                    print(e)
+                    sleep((random() + 1) * self.wait_secs)
+            return None
+
+        return wrapper
+
+
+class Spider(object):
+
+    def __init__(self):
+        self.status = SpiderStatus.IDLE
+
+    @Retry()
+    def fetch(self, current_url, *, charsets=('utf-8', ),
+              user_agent=None, proxies=None):
+        thread_name = current_thread().name
+        print(f'[{thread_name}]: {current_url}')
+        headers = {'user-agent': user_agent} if user_agent else {}
+        resp = requests.get(current_url,
+                            headers=headers, proxies=proxies)
+        return decode_page(resp.content, charsets) \
+            if resp.status_code == 200 else None
+
+    def parse(self, html_page, *, domain='m.sohu.com'):
+        soup = BeautifulSoup(html_page, 'lxml')
+        for a_tag in soup.body.select('a[href]'):
+            parser = urlparse(a_tag.attrs['href'])
+            scheme = parser.scheme or 'http'
+            netloc = parser.netloc or domain
+            if scheme != 'javascript' and netloc == domain:
+                path = parser.path
+                query = '?' + parser.query if parser.query else ''
+                full_url = f'{scheme}://{netloc}{path}{query}'
+                if not redis_client.sismember('visited_urls', full_url):
+                    redis_client.rpush('m_sohu_task', full_url)
+
+    def extract(self, html_page):
+        pass
+
+    def store(self, data_dict):
+        pass
+
+
+class SpiderThread(Thread):
+
+    def __init__(self, name, spider):
+        super().__init__(name=name, daemon=True)
+        self.spider = spider
+
+    def run(self):
+        while True:
+            current_url = redis_client.lpop('m_sohu_task')
+            while not current_url:
+                current_url = redis_client.lpop('m_sohu_task')
+            self.spider.status = SpiderStatus.WORKING
+            current_url = current_url.decode('utf-8')
+            if not redis_client.sismember('visited_urls', current_url):
+                redis_client.sadd('visited_urls', current_url)
+                html_page = self.spider.fetch(current_url)
+                if html_page not in [None, '']:
+                    hasher = hasher_proto.copy()
+                    hasher.update(current_url.encode('utf-8'))
+                    doc_id = hasher.hexdigest()
+                    if not sohu_data_coll.find_one({'_id': doc_id}):
+                        sohu_data_coll.insert_one({
+                            '_id': doc_id,
+                            'url': current_url,
+                            'page': Binary(zlib.compress(pickle.dumps(html_page)))
+                        })
+                    self.spider.parse(html_page)
+            self.spider.status = SpiderStatus.IDLE
+
+
+def is_any_alive(spider_threads):
+    return any([spider_thread.spider.status == SpiderStatus.WORKING
+                for spider_thread in spider_threads])
+
+
+redis_client = redis.Redis(host='120.77.222.217',
+                           port=6379, password='1qaz2wsx')
+mongo_client = pymongo.MongoClient(host='120.77.222.217', port=27017)
+db = mongo_client.msohu
+sohu_data_coll = db.webpages
+hasher_proto = sha1()
+
+
+def main():
+    if not redis_client.exists('m_sohu_task'):
+        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
+    spider_threads = [SpiderThread('thread-%d' % i, Spider())
+                      for i in range(10)]
+    for spider_thread in spider_threads:
+        spider_thread.start()
+
+    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
+        pass
+
+    print('Over!')
+
+
+if __name__ == '__main__':
+    main()
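Note: `SpiderThread.run()` in `main_redis.py` above polls Redis with `lpop` in a tight loop whenever the `m_sohu_task` list is empty. redis-py also offers `blpop`, which blocks until an element arrives or a timeout expires, so the waiting happens on the Redis side instead of in a busy loop. A sketch of that variant is below; the local host address is a placeholder, not the server used in the patch.

```Python
import redis

# Assumption: a local Redis server holding the same 'm_sohu_task' list
redis_client = redis.Redis(host='127.0.0.1', port=6379)


def next_task(timeout=60):
    """Return the next URL to crawl, or None if the list stays empty for `timeout` seconds."""
    # blpop blocks server-side and returns a (key, value) tuple, or None on timeout
    item = redis_client.blpop('m_sohu_task', timeout=timeout)
    return item[1].decode('utf-8') if item else None
```

A worker thread could call `next_task()` in place of the inner `while not current_url` loop and treat a `None` result as "nothing left to crawl".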
diff --git a/玩转PyCharm(上).md b/玩转PyCharm(上).md
index 0ab57ed..b3295b5 100644
--- a/玩转PyCharm(上).md
+++ b/玩转PyCharm(上).md
@@ -6,13 +6,15 @@ PyCharm is an integrated development environment that JetBrains built for professional Python developers
 
 You can find the [download link](https://www.jetbrains.com/pycharm/download/) for PyCharm on the [official JetBrains website](). Two editions are available: the Community edition, released under the [Apache License](https://zh.wikipedia.org/wiki/Apache%E8%AE%B8%E5%8F%AF%E8%AF%81), and the Professional edition, released under a proprietary license (you need to buy a license; after downloading it can be tried for 30 days) and offering many additional features. Installing PyCharm requires a JRE (Java Runtime Environment); if you do not have one, you can choose to download and install it online during installation.
 
+> Note: if you are a student and want to buy PyCharm, have a look at the [official guide to applying for the education discount](https://sales.jetbrains.com/hc/zh-cn/articles/207154369).
+
 ### Settings on first use
 
 The first time you run PyCharm, a wizard offers to import settings. If you have not used PyCharm before or have never saved any settings, simply choose "Do not import settings" and continue to the next step.
 
 ![](./res/pycharm-import-settings.png)
 
-The Professional edition of PyCharm has to be activated. We strongly recommend paying for good software; if it is not for commercial use, you can start with the 30-day trial or use the Community edition of PyCharm for now.
+The Professional edition of PyCharm has to be activated. **We strongly recommend paying for good software**; if it is not for commercial use, you can start with the 30-day trial or use the Community edition of PyCharm for now.
 
 ![](./res/pycharm-activate.png)
 