#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl.utils.dataframe import dataframe_to_rows

# One shared Session so every request reuses the same connection, cookies
# and headers.
session = requests.Session()
# ========================= generate urls ==========================
def generate_home_url(city):  # build the city's second-hand-home landing url
    return 'http://' + city + '.lianjia.com/ershoufang/'


def generate_area_page_url(page_count, city, path):  # yield the paged urls for one area
    url = 'http://' + city + '.lianjia.com' + path + 'pg{}/'
    # note: range(1, page_count) stops at page_count - 1, so with
    # page_count = 15 this yields pg1 through pg14
    for page_index in range(1, page_count):
        yield url.format(page_index)
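# For illustration: generate_home_url('bj') returns
# 'http://bj.lianjia.com/ershoufang/', and
# generate_area_page_url(3, 'bj', '/ershoufang/dongcheng/') would yield
# 'http://bj.lianjia.com/ershoufang/dongcheng/pg1/' and '.../pg2/'
# ('/ershoufang/dongcheng/' is an example path, not hard-coded anywhere).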
# ========================= session headers ==========================
def update_session():
    # Mimic a real browser's request headers (these were captured from an
    # actual browser); without them the server answers 403 -- a simple
    # anti-scraping check on Lianjia's side. The Cookie below is tied to one
    # browser session and will likely need refreshing from your own browser.
    headers = {
        'Host': 'bj.lianjia.com',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': 'TY_SESSION_ID=25a21767-af26-4543-b2b4-b92f7d6028b5; TY_SESSION_ID=f5cecba1-d783-4d40-b86d-72ee2accfccf; select_city=110000; lianjia_ssid=7ea6e0a0-dd03-48c2-9031-987bda2481c2; lianjia_uuid=435b41db-4268-4e59-9852-c4cd50e86646; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216ff914d8d8522-08cf45a790e359-5e130c17-1024000-16ff914d8d9a86%22%2C%22%24device_id%22%3A%2216ff914d8d8522-08cf45a790e359-5e130c17-1024000-16ff914d8d9a86%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D'
    }
    session.headers.clear()
    session.headers.update(headers)
def get_all_area_path(area_url):  # parse the landing page for each area's path
    update_session()
    urls = {}
    res = session.get(area_url)
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'lxml')
        areas = soup.find_all('div', attrs={'data-role': 'ershoufang'})
        for item in areas:
            for a in item.find_all(name='a'):
                urls[a.text] = a.attrs['href']
    # empty dict on failure, so the caller's .items() loop still works
    return urls
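# For illustration, the returned mapping looks roughly like
# {'东城': '/ershoufang/dongcheng/', '西城': '/ershoufang/xicheng/', ...}
# (the names and paths here are examples of the page's area filter entries).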
def get_all_page_urls(page_url):  # parse one result page for its detail-page urls
    update_session()
    urls = []
    res = session.get(page_url)
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'html.parser')
        infos = soup.find_all('div', attrs={'class': 'info clear'})
        for item in infos:
            urls.append(item.a.attrs['href'])
    # empty list on failure, so the caller's for loop still works
    return urls
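# Each detail url has the form
# 'https://bj.lianjia.com/ershoufang/101106686239.html' (this id is the one
# from the sample row in create_sheet below).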
def get_page_by_url(page_url):  # parse a detail page for the fields we want
    print("get_page_by_url:" + page_url)
    update_session()
    # Fields of the "basic info" and "transaction info" blocks; each <li>
    # starts with a <span class="label"> naming the field.
    base_labels = ['房屋户型', '所在楼层', '建筑面积', '户型结构', '套内面积',
                   '建筑类型', '房屋朝向', '建筑结构', '装修情况', '梯户比例',
                   '供暖方式', '配备电梯', '产权年限']
    transaction_labels = ['挂牌时间', '交易权属', '上次交易', '房屋用途',
                          '房屋年限', '产权所属', '抵押信息', '房本备件']
    try:
        res = session.get(page_url, timeout=(30, 120))
        if res.status_code == 200:
            info = {}
            soup = BeautifulSoup(res.text, 'lxml')
            info['标题'] = soup.select('.main')[0].text
            info['总价'] = soup.select('.total')[0].text + '万'
            info['每平方售价'] = soup.select('.unitPriceValue')[0].text
            info['参考总价'] = soup.select('.taxtext')[0].text
            info['建造时间'] = soup.select('.subInfo')[2].text
            info['小区名称'] = soup.select('.info')[0].text
            info['所在区域'] = soup.select('.info a')[0].text + ':' + soup.select('.info a')[1].text
            # The listing id is the file-name part of the url,
            # e.g. .../ershoufang/101106686239.html -> 101106686239; this is
            # more robust than the original fixed [34:] slice, which assumed
            # one exact url prefix length.
            info['链家编号'] = page_url.rsplit('/', 1)[-1].split('.html')[0]
            for ul in soup.find_all('div', attrs={'class': 'base'}):
                for li in ul.find_all(name='li'):
                    span = li.find('span', attrs={'class': 'label'})
                    if span is not None and span.text in base_labels:
                        info[span.text] = li.text.replace(span.text, '')
            for ul in soup.find_all('div', attrs={'class': 'transaction'}):
                for li in ul.find_all(name='li'):
                    span = li.find('span', attrs={'class': 'label'})
                    if span is not None and span.text in transaction_labels:
                        # these values carry newlines and padding; strip them
                        info[span.text] = li.text.replace(span.text, '').replace('\n', '').strip()
            return info
    except Exception as e:
        print(str(e))
    return None
def do_write_workbook(ws, data):
    if data is None:
        return
    # Round-trip through a one-row DataFrame so the values come out in key
    # order. Caveat (unchanged from the original): if a detail page lacked
    # some field, the remaining columns shift left relative to the header.
    frame = pd.DataFrame(data, index=['0'])
    for r in dataframe_to_rows(frame, index=False, header=True):
        if '标题' in str(r):
            continue  # skip the header row that dataframe_to_rows yields first
        ws.append(r)
def create_sheet(workbook_file, wb, sheet_name):
    # A sample listing used purely as a template: only its keys matter, as
    # the header row of the new sheet; the break below means its values are
    # never written.
    data = {'标题': '西四环,珠江峰景精装两居,正对小区花园,看房方便', '总价': '590万', '每平方售价': '57399元/平米',
            '参考总价': '首付及贷款情况请咨询经纪人', '建造时间': '2007年建/板塔结合', '小区名称': '珠江峰景',
            '所在区域': '丰台:岳各庄', '链家编号': '101106686239', '房屋户型': '房屋户型2室1厅1厨1卫',
            '所在楼层': '所在楼层高楼层 (共11层)', '建筑面积': '建筑面积102.79㎡', '户型结构': '户型结构平层',
            '套内面积': '套内面积84.24㎡', '建筑类型': '建筑类型板塔结合', '房屋朝向': '房屋朝向南',
            '建筑结构': '建筑结构钢混结构', '装修情况': '装修情况其他', '梯户比例': '梯户比例一梯四户',
            '供暖方式': '供暖方式集中供暖', '配备电梯': '配备电梯有', '产权年限': '产权年限70年',
            '挂牌时间': '2020-01-04', '交易权属': '商品房', '上次交易': '2008-07-07', '房屋用途': '普通住宅',
            '房屋年限': '满五年', '产权所属': '非共有', '抵押信息': '暂无数据', '房本备件': '已上传房本照片'}
    frame = pd.DataFrame(data, index=['0'])
    ws = wb.create_sheet(sheet_name)
    for r in dataframe_to_rows(frame, index=False, header=True):
        print('row:' + str(r))
        ws.append(r)
        break  # the first yielded row is the header; stop there
    return ws
def has_sheet(wb, key):
    # wb.get_sheet_names() is deprecated in current openpyxl;
    # wb.sheetnames is the supported spelling
    return key in wb.sheetnames
def fetch_all_area():
    city = 'bj'
    page_count = 15
    workbook_file = '链家二手房.xlsx'
    area_path_map = get_all_area_path(generate_home_url(city))
    # Note: load_workbook() fails if the file does not exist; there is no
    # creation logic here, so create an empty .xlsx first (or see the sketch
    # after the listing).
    wb = openpyxl.load_workbook(workbook_file)
    for key, val in area_path_map.items():
        print('key:' + str(key) + ' val:' + str(val))
        # Fetching fails now and then and the script gets rerun, so skip any
        # area whose sheet already exists.
        if has_sheet(wb, key):
            continue
        ws = create_sheet(workbook_file, wb, key)
        for area_url in generate_area_page_url(page_count, city, val):
            for page_url in get_all_page_urls(area_url):
                do_write_workbook(ws, get_page_by_url(page_url))
        # Save after each area and reload, so a crash loses at most one area.
        wb.save(workbook_file)
        wb = openpyxl.load_workbook(workbook_file)


if __name__ == '__main__':
    fetch_all_area()
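As the comment in `fetch_all_area` says, the script assumes `链家二手房.xlsx` already exists and fails otherwise. If you would rather not create the file by hand, a minimal sketch like the following could run first (`ensure_workbook` is a helper added here for illustration, not part of the original script):

```python
import os
import openpyxl

def ensure_workbook(workbook_file):
    # Create an empty .xlsx on first run so openpyxl.load_workbook() succeeds.
    # Note: openpyxl.Workbook() starts with one default sheet named 'Sheet',
    # which will simply sit unused next to the per-area sheets.
    if not os.path.exists(workbook_file):
        openpyxl.Workbook().save(workbook_file)

ensure_workbook('链家二手房.xlsx')  # call this before fetch_all_area()
```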
The draft of this post failed to save and I don't feel like writing it all again, so I'll just post the code.

It builds on the code from https://blog.csdn.net/liujiayu2/article/details/86007384.

The data captured here is a bit different: it is stored one sheet per area, whereas the original post simply scraped page by page. Open the Lianjia second-hand-home page and you will see the area/subway filters below; from the areas listed there you get each area's path, join it into a url, and then fetch the data page by page. The detail pages are also mined for many extra fields, such as heating type and listing date.
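Since requests fail often enough that the per-area sheets double as a resume mechanism, it may also help to let the session retry transient failures on its own. A sketch using requests' built-in retry support (the parameter values are arbitrary, not tuned; this is not something the original script does):

```python
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry connection errors and these status codes up to three times,
# backing off between attempts, before the request finally raises.
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))
```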
![](https://img-blog.csdnimg.cn/20200201155022276.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2FyY2hrbw==,size_16,color_FFFFFF,t_70)