Updated the crawler Day 1 code
parent 402e056498
commit 452b6f1441
@@ -0,0 +1,60 @@
from urllib.error import URLError
from urllib.request import urlopen

import re
import pymysql


def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
    """Fetch a page and try to decode it with each charset in turn."""
    try:
        for charset in charsets:
            try:
                html = urlopen(start_url).read().decode(charset)
                break
            except UnicodeDecodeError:
                html = None
    except URLError as ex:
        print('Error:', ex)
        # Retry on network errors until the retry budget is used up.
        return get_page_code(start_url, retry_times=retry_times - 1, charsets=charsets) if \
            retry_times > 0 else None
    return html


def main():
    # Seed URL queue and the set of pages that have already been visited.
    url_list = ['http://sports.sohu.com/nba_a.shtml']
    visited_list = set({})
    while len(url_list) > 0:
        current_url = url_list.pop(0)
        visited_list.add(current_url)
        print(current_url)
        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
        if html:
            # Collect every href on the page and append it to the queue.
            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            link_list = re.findall(link_regex, html)
            url_list += link_list
            conn = pymysql.connect(host='localhost', port=3306,
                                   db='crawler', user='root',
                                   passwd='123456', charset='utf8')
            try:
                for link in link_list:
                    if link not in visited_list:
                        visited_list.add(link)
                        print(link)
                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
                        if html:
                            # Grab the article title and store it together with the URL.
                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
                            match_list = title_regex.findall(html)
                            if len(match_list) > 0:
                                title = match_list[0]
                                with conn.cursor() as cursor:
                                    cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
                                                   (title, link))
                                    conn.commit()
            finally:
                conn.close()
    print('执行完成!')


if __name__ == '__main__':
    main()
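The crawler above writes into a tb_result table in a crawler database that is assumed to already exist; the schema is not part of this commit, but a minimal setup sketch consistent with the insert statement might look like the following (the column names come from the code, while the id column and the varchar sizes are assumptions):

# One-off setup sketch, not part of this commit: create the table the crawler
# inserts into. rtitle/rurl match the insert statement above; the rid column
# and the column sizes are assumptions.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, db='crawler',
                       user='root', passwd='123456', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute('''
            create table if not exists tb_result (
                rid int auto_increment primary key,
                rtitle varchar(1024) not null,
                rurl varchar(1024) not null
            )
        ''')
    conn.commit()
finally:
    conn.close()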
@@ -0,0 +1,50 @@
from bs4 import BeautifulSoup

import re


def main():
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>首页</title>
    </head>
    <body>
        <h1>Hello, world!</h1>
        <p>Good!!!</p>
        <hr>
        <div>
            <h2>这是一个例子程序</h2>
            <p>静夜思</p>
            <p class="foo">床前明月光</p>
            <p id="bar">疑似地上霜</p>
            <p class="foo">举头望明月</p>
            <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
        </div>
        <a class="foo" href="http://www.qq.com">腾讯网</a>
        <img src="./img/pretty-girl.png" alt="美女">
        <img src="./img/hellokitty.png" alt="凯蒂猫">
        <img src="./static/img/pretty-girl.png" alt="美女">
        <goup>Hello, Goup!</goup>
    </body>
    </html>
    """
    # resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # html = resp.content.decode('gbk')
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)
    # JavaScript: document.body.h1
    # JavaScript: document.forms[0]
    print(soup.body.h1)
    # Tags whose name ends with "p" (matches <p> and the custom <goup> tag).
    print(soup.find_all(re.compile(r'p$')))
    # <img> tags whose src sits directly under ./img/ (hyphens allowed in the file name).
    print(soup.find_all('img', {'src': re.compile(r'\./img/[\w-]+\.png')}))
    # Tags that carry exactly two attributes.
    print(soup.find_all(lambda x: len(x.attrs) == 2))
    # <p> tags with class "foo".
    print(soup.find_all('p', {'class': 'foo'}))
    # Every <a> that has an href attribute.
    for elem in soup.select('a[href]'):
        print(elem.attrs['href'])


if __name__ == '__main__':
    main()
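Most of the find_all() calls above can also be written as CSS selectors with select()/select_one(); a small self-contained sketch, using a fragment that mirrors the sample markup above:

# Sketch only: CSS-selector equivalents of a few of the lookups above,
# built on a small fragment of the same sample markup.
from bs4 import BeautifulSoup

fragment = '''
<div>
    <p class="foo">床前明月光</p>
    <p id="bar">疑似地上霜</p>
    <p class="foo">举头望明月</p>
</div>
'''
soup = BeautifulSoup(fragment, 'lxml')
print(soup.select('p.foo'))      # same result set as find_all('p', {'class': 'foo'})
print(soup.select_one('p#bar'))  # single element lookup by id
print(soup.select('div > p'))    # direct <p> children of the <div>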
@@ -0,0 +1,27 @@
from bs4 import BeautifulSoup

import requests
import re


def main():
    # Fetch the page with the get method of the third-party requests library.
    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # Decode the response byte string (some Sohu pages use GBK encoding).
    html = resp.content.decode('gbk')
    # Create a BeautifulSoup object to parse the page (comparable to the DOM in JavaScript).
    bs = BeautifulSoup(html, 'lxml')
    # Find elements with CSS selector syntax and process them in a loop.
    # for elem in bs.find_all(lambda x: 'test' in x.attrs):
    for elem in bs.select('a[test]'):
        # Read the element's attribute value through the attrs property (a dict).
        link_url = elem.attrs['href']
        resp = requests.get(link_url)
        bs_sub = BeautifulSoup(resp.text, 'lxml')
        # Post-process the extracted data with a regular expression.
        print(re.sub(r'[\r\n]', '', bs_sub.find('h1').text))


if __name__ == '__main__':
    main()
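The decode('gbk') above is hard-coded for this particular Sohu page; when the encoding is not known in advance, requests can also guess it from the response body. A minimal sketch of that variation (best-effort detection, not a guarantee):

# Sketch only: let requests guess the encoding instead of hard-coding 'gbk'.
# apparent_encoding is detected from the response content, so it is a
# best-effort guess rather than a guarantee.
import requests

resp = requests.get('http://sports.sohu.com/nba_a.shtml')
resp.encoding = resp.apparent_encoding
html = resp.text  # decoded with the detected encoding
print(html[:200])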