请求网页信息
blog_spider.py
import requests
# 生成列表
from bs4 import BeautifulSoup
# Target pages: the cnblogs "sitehome" listing, pages 1 through 50 inclusive.
urls = [f"https://www.cnblogs.com/sitehome/p/{n}" for n in range(1, 51)]
def craw(url):
    """Fetch *url* and return the response body as text.

    A timeout is set so a stalled or unreachable server raises
    ``requests.exceptions.Timeout`` instead of hanging the whole
    crawl (and, in the threaded driver, leaking a blocked thread).
    """
    r = requests.get(url, timeout=10)
    return r.text
def parse(html):
    """Extract every post link from a listing page.

    Returns a list of ``(href, title)`` tuples, one per
    ``<a class="post-item-title">`` anchor found in *html*.
    """
    document = BeautifulSoup(html, 'html.parser')
    anchors = document.find_all("a", class_='post-item-title')
    results = []
    for anchor in anchors:
        results.append((anchor['href'], anchor.get_text()))
    return results
if __name__ == '__main__':
    # Smoke test: crawl the third listing page and print each (href, title).
    page_html = craw(urls[2])
    for href_and_title in parse(page_html):
        print(href_and_title)
multi_thread_craw.py
import threading
import time
import blog_spider
def single_thread():
    """Crawl every page sequentially on the calling thread (the baseline)."""
    print('single_thread begin')
    for page_url in blog_spider.urls:
        blog_spider.craw(page_url)
    print('single_thread end')
def multi_thread():
    """Crawl every page concurrently, one thread per URL.

    The work is I/O-bound, so the GIL is released while each request
    waits on the network and the downloads overlap.
    """
    print('multi_thread begin')
    workers = [
        threading.Thread(target=blog_spider.craw, args=(page_url,))
        for page_url in blog_spider.urls
    ]
    for worker in workers:
        worker.start()
    # Wait for every download to finish before reporting completion.
    for worker in workers:
        worker.join()
    print('multi_thread end')
if __name__ == '__main__':
    # Time the sequential baseline, then the threaded version, and
    # print the wall-clock cost of each for comparison.
    t0 = time.time()
    single_thread()
    print("single thread cost:", time.time() - t0, 'seconds')

    t0 = time.time()
    multi_thread()
    print("multi thread cost:", time.time() - t0, 'seconds')
运行结果:
single_thread begin
single_thread end
single thread cost: 56.829020261764526 seconds
multi_thread begin
multi_thread end
multi thread cost: 1.360729455947876 seconds
可以看出多线程的速度约为单线程的 41.77 倍（56.83 ÷ 1.36 ≈ 41.77）。