Code structure
![Project structure](https://img-blog.csdnimg.cn/20210608221051996.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NTg5NTg3Mw==,size_16,color_FFFFFF,t_70)
Code for the relevant files
Spider file
# mid.py — spider file
import scrapy
from selenium import webdriver

from middle.items import MiddleItem


class MidSpider(scrapy.Spider):
    name = 'mid'
    start_urls = ['https://news.163.com/']

    def __init__(self):
        super().__init__()
        # Relative path, resolved from the spiders directory (see the notes at the end)
        self.bro = webdriver.Chrome(executable_path='../chromedriver_win32/chromedriver.exe')
        # URLs of the target sections, shared with the downloader middleware
        self.models_url = []

    # Parse the home page and extract the links to the target sections
    def parse(self, response):
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        # Crawl only two sections: 国内 (domestic) and 国际 (international)
        i_list = [2, 3]
        for i in i_list:
            # Extract the section URL and name
            url = li_list[i].xpath('./a/@href').extract_first()
            model = li_list[i].xpath('./a/text()').extract_first()
            # Create an item and store the section name
            item = MiddleItem()
            item['model'] = model
            # Collect the target section URLs so the downloader middleware
            # can decide which responses to intercept
            self.models_url.append(url)
            yield scrapy.Request(url, callback=self.parse_model, meta={'item': item})

    # Parse a section page and extract the URL of each article
    def parse_model(self, response):
        div_list = response.xpath('//div[@class="ndi_main"]/div')
        for div in div_list:
            url = div.xpath('.//div[@class="news_title"]/h3/a/@href').extract_first()
            # Retrieve the item passed down from the previous level
            item = response.meta['item']
            yield scrapy.Request(url, callback=self.parse_detail, meta={'item': item})

    # Parse an article page and extract the title and body text
    def parse_detail(self, response):
        title = response.xpath('//h1[@class="post_title"]/text()').extract_first()
        if title is not None:
            # Strip characters (half- and full-width) that are illegal in file names
            title = title.replace(' ', '').replace(':', '').replace('?', '').replace('”', '').replace('"', '').replace('？', '')
        else:
            title = '标题'  # fallback title
        content = response.xpath('//div[@class="post_body"]//text()').extract()
        content = '\n'.join(content)
        # Retrieve the item passed down from the previous level and fill it in
        item = response.meta['item']
        item['title'] = title
        item['content'] = content
        # Hand the item over to the pipeline
        yield item

    # Override the parent method, called when the spider closes
    def close(self, spider, reason):
        # Quit the browser
        self.bro.quit()
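A compatibility note on the driver setup in `__init__`: newer Selenium 4 releases removed the `executable_path` keyword from `webdriver.Chrome`. If your environment runs Selenium 4, a minimal equivalent sketch (same relative driver path assumed) looks like this:

```python
# Selenium 4 style driver setup; assumes the same relative chromedriver path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service(executable_path='../chromedriver_win32/chromedriver.exe')
bro = webdriver.Chrome(service=service)
```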
Item file
# items.py
import scrapy


class MiddleItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # section name
    model = scrapy.Field()
    # article title
    title = scrapy.Field()
    # article body
    content = scrapy.Field()
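For reference, `scrapy.Item` objects behave like dicts. The spider fills the fields one by one, but they can also be set at construction time or read uniformly through `ItemAdapter`; a hypothetical snippet, not part of the project code:

```python
from itemadapter import ItemAdapter

from middle.items import MiddleItem

# Dict-style construction and subscript access both work on Items
item = MiddleItem(model='国内', title='示例标题', content='...')
adapter = ItemAdapter(item)  # uniform wrapper over Item/dict/dataclass items
print(adapter['model'], adapter.get('title'))
```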
Middleware file (only the downloader middleware is used, so only that part of the code is shown)
# middlewares.py — middleware file
import random
from time import sleep

from scrapy.http import HtmlResponse


class MiddleDownloaderMiddleware:
    # Pool of user-agent strings for random UA spoofing
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    ]

    # Intercept every outgoing request; UA spoofing is typically done here
    def process_request(self, request, spider):
        # Set a random user agent from the pool
        request.headers['User-Agent'] = random.choice(self.user_agents)
        # Returning None lets the request continue through the middleware chain
        return None

    # Intercept every response
    def process_response(self, request, response, spider):
        # Grab the browser instance created in the spider
        # (spider attributes, such as models_url, are reachable through the spider argument)
        bro = spider.bro
        # This method sees every response, but only the section pages are
        # dynamically loaded and need Selenium; everything else can pass
        # through untouched. Each request maps to exactly one response,
        # so checking request.url tells us which responses to tamper with.
        if request.url in spider.models_url:
            bro.get(request.url)
            sleep(3)
            page_text = bro.page_source
            # Swap in a new response built from the Selenium-rendered page
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            # No tampering needed: return the original response
            return response
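The fixed `sleep(3)` above wastes three seconds on fast loads and may still be too short on slow ones. An explicit wait is more reliable; below is a sketch of the interception branch rewritten with `WebDriverWait`, assuming the section pages keep their `ndi_main` container:

```python
from scrapy.http import HtmlResponse
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class MiddleDownloaderMiddleware:
    def process_response(self, request, response, spider):
        if request.url in spider.models_url:
            bro = spider.bro
            bro.get(request.url)
            # Block until the dynamically loaded article list appears (10 s cap)
            WebDriverWait(bro, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'ndi_main'))
            )
            return HtmlResponse(url=request.url, body=bro.page_source,
                                encoding='utf-8', request=request)
        return response
```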
Pipeline file
# pipelines.py — pipeline file
import os


class MiddlePipeline:
    def process_item(self, item, spider):
        # Persist each article to disk
        # Relative path, resolved from the spiders directory (see the notes at the end)
        path = '../data/'
        if not os.path.exists(path):
            os.mkdir(path)
        path += str(item['model']) + '/'
        if not os.path.exists(path):
            os.mkdir(path)
        path += str(item['title']) + '.txt'
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(str(item['content']))
        return item
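The exists-then-mkdir pattern above can race if items are ever written concurrently, and building paths by string concatenation is brittle. A sketch of an equivalent `process_item` using `os.makedirs` with `exist_ok=True` and `os.path.join` (same directory layout assumed):

```python
import os


class MiddlePipeline:
    def process_item(self, item, spider):
        # Build ../data/<section>/<title>.txt, creating directories as needed
        dir_path = os.path.join('..', 'data', str(item['model']))
        os.makedirs(dir_path, exist_ok=True)  # no explicit exists() checks needed
        file_path = os.path.join(dir_path, str(item['title']) + '.txt')
        with open(file_path, 'w', encoding='utf-8') as fp:
            fp.write(str(item['content']))
        return item
```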
Settings file
# settings.py
# Ignore robots.txt
ROBOTSTXT_OBEY = False
# Log errors only
LOG_LEVEL = 'ERROR'
# Enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'middle.middlewares.MiddleDownloaderMiddleware': 543,
}
# Enable the item pipeline
ITEM_PIPELINES = {
    'middle.pipelines.MiddlePipeline': 300,
}
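The settings above are the minimum the project needs. If the site starts throttling or banning, Scrapy's built-in politeness settings can be appended to the same file; these lines are optional additions, not part of the original configuration:

```python
# Optional throttling additions to settings.py (not in the original project)
DOWNLOAD_DELAY = 1        # pause roughly one second between requests
CONCURRENT_REQUESTS = 8   # halve the default concurrency of 16
```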
Crawl results
![Crawl results](https://img-blog.csdnimg.cn/20210608232348796.png)
![Crawl results](https://img-blog.csdnimg.cn/20210608232405392.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NTg5NTg3Mw==,size_16,color_FFFFFF,t_70)
Notes
1. When running the mid spider, it is best to do so from the spiders directory (e.g. run `scrapy crawl mid` from that path), because both the browser-driver path and the persistence path are written as relative paths.
2. The bundled browser driver may not match your Chrome version; if it does not, download the matching version and replace chromedriver.exe in the directory.
3. This technique follows an instructor on a certain video site, but the instructor's code has some problems, so I wrote this version myself as further practice. Questions are welcome via private message or comments; I reply when I can.
4. If you need it, the complete code is available in my uploaded resources.