利用scrapy-redis爬取链家百万成交记录

新年的钟声敲响之际,我也敲定了链家成交记录的爬虫代码,从北京开始,一共近百万条成交记录。代码总共用了两天:关键代码12月31号写完,元旦完成全部代码并修补了几个小bug。先亮一下爬取的字段:

{
	"_id" : "101100313085",
	"index_url" : "https://bj.lianjia.com/chengjiao/dongcheng/pg1p2l2a2/",
	"detail_url" : "https://bj.lianjia.com/chengjiao/101100313085.html",
	"house_code" : "101100313085",
	"detail_title" : "定安里 2室1厅 54.34平米",
	"detail_dirFurnish" : "南 北 | 简装",
	"detail_dealDate" : "2016.07.21",
	"detail_floor" : "中楼层(共6层) 1980年建板楼",
	"detail_totalPrice" : "240",
	"detail_unitPrice" : "44167",
	"detail_listPrice" : "挂牌240万",
	"aprt_dealCycle" : "成交周期7天",
	"detail_agentName" : "李玉军",
	"detail_agentId" : "1000000010080328",
	"detail_dealInfo" : "定安里 2室1厅 54.34平米2016.07.21 成交",
	"detail_dealBread" : "北京房产网北京二手房成交东城二手房成交永定门二手房成交定安里二手房成交",
	"detail_priceChangeTimes" : "0",
	"detail_visitTimes" : "",
	"detail_followers" : "7",
	"detail_viewTimes" : "66",
	"detail_basic_info" : {
		"房屋户型" : "2室1厅1厨1卫",
		"所在楼层" : "中楼层(共6层)",
		"建筑面积" : "54.34㎡",
		"户型结构" : "平层",
		"套内面积" : "暂无数据",
		"建筑类型" : "板楼",
		"房屋朝向" : "南 北",
		"建成年代" : "1980",
		"装修情况" : "简装",
		"建筑结构" : "混合结构",
		"供暖方式" : "集中供暖",
		"梯户比例" : "一梯两户",
		"配备电梯" : "无"
	},
	"detail_transaction_info" : {
		"链家编号" : "101100313085",
		"交易权属" : "商品房",
		"挂牌时间" : "2016-07-15",
		"房屋用途" : "普通住宅",
		"房屋年限" : "满五年",
		"房权所属" : "非共有"
	},
	"detail_transaction_history" : "240万单价44167元/平,2016-07成交",
	"community_name" : "定安里",
	"community_url" : "https://bj.lianjia.com/chengjiao/c1111027376735",
	"community_info" : {
		
	},
	"detail_features" : {
		"房源标签" : "房本满五年"
	},
	"resblockPosition" : "116.418443,39.866651",
	"city_id" : "110000",
	"city_name" : "city_name: '北京'",
	"resblockId" : "1111027376735"
}

实际在生产项目中是需要把html文件保存下来的,但是我的服务器只有区区50G空间,可用空间只有10G多点儿了,吃不消,所以爬取时尽量地采集更多的字段。

链家成交记录是有反爬的,需要使用大量代理IP,不然爬取速度会受限。

  • https://bj.lianjia.com/chengjiao/ 是入口。很明显,每页只有30个房源,并且最多只显示100页,因此需要通过不同的分类条件才能全面地获取数据。我专门写了个脚本,利用区域、售价、房型、面积等限制条件来构造所有的url,这样就能采集到全部房源。例如 https://bj.lianjia.com/chengjiao/haidian/l3a3p4/ 就是海淀、300-400万、三室、70-90平的url,可以通过解析下面的页面构造出来。
链家成交记录页面
  • 禁用cookie和redirect,设置timeout=3, retry_times=5,单IP并发数16,单机并发数我控制在了80,用了6个拨号服务器做代理,每20秒拨号一次,这样scrapy每秒能抓取10个房源,基本上一天多就能采集完北京站的交易记录。
  • 关于adsl vps,我博客讲过多次,即使是少量,也要尽量用不同地区甚至省份的VPS,确保IP多样性和高可用率。以我的经验,江浙和广东一带的服务器IP最多,毕竟网络发达程度和互联网发达程度成正比,推荐杭州,景德镇,中山这类城市。
  • 关键代码如下。实际上,中间件至关重要,请求头, referer和代理一定要设置好并做好测试。
#!/usr/bin/env python3
# # -*- coding: utf-8 -*-
import json
import logging
import uuid
import pickle
import scrapy
from scrapy_redis import spiders
from scrapy.utils.project import get_project_settings
from scrapy_redis.utils import bytes_to_str
import redis
import random
from scrapy_redis.spiders import RedisSpider
from lianjia.items import LianjiaItem
from lianjia.log import logger
import re
import sys

class DealsSpider(RedisSpider):
    name = 'deals'
    allowed_domains = ['lianjia.com']
    # start_urls = ['http://lianjia.com/']
    redis_key = 'lianjia:start_urls'
    
    def __init__(self, *args, **kwargs):
        """Create the spider, forwarding every argument to the parent class."""
        super().__init__(*args, **kwargs)
            
    def parse(self, response):
        """Parse one index (listing) page of closed deals.

        For every listing row that has a detail link, yield a detail-page
        request whose ``meta`` carries the fields scraped from the row (they
        are read back by ``parse_item``). Afterwards yield a request for the
        next index page while more pages remain.

        Args:
            response: Response of a ``/chengjiao/`` index page.

        Yields:
            scrapy.Request: detail-page requests (callback ``parse_item``) and
            at most one next-index-page request (callback ``parse``).
        """
        page_size = 30   # listings shown per index page
        max_pages = 100  # the site shows at most 100 pages per query

        index_url = response.url
        num_found_text = response.xpath('//div[@class="total fl"]/span/text()').extract_first()
        try:
            num_found = int(num_found_text)
        except (TypeError, ValueError):
            # The counter is absent or not numeric, so this is not a regular
            # listing page and there is nothing to parse.
            logger.warning(f'total count not found in {index_url}: {num_found_text!r}')
            return
        logger.info(f'num of apartments found in {index_url}: {num_found}')
        if num_found <= 0:
            return

        try:
            logger.debug(f'index request.meta: {response.request.meta} {index_url}')
            logger.debug(f'index request.headers: {response.request.headers} {index_url}')
            # Ceiling division: an exact multiple of page_size must not
            # produce a trailing empty page.
            total_pages = min(-(-num_found // page_size), max_pages)
            aprt_list = response.xpath('//ul[@class="listContent"]/li')
            logger.info(f'num of apartments in the current_pgNum: {len(aprt_list)}')

            # The current page number is embedded in the page source as
            # "curPage":N.
            cur_page_match = re.search(r'"curPage":(\d+)', response.text)
            if cur_page_match is None:
                current_pgNum = None
                logger.warning(f'curPage not found in {index_url}; pagination skipped')
            else:
                current_pgNum = int(cur_page_match.group(1))
                logger.info(f'curPage matched: {current_pgNum}')
            logger.debug(f'debug index_url: {index_url}')

            for li in aprt_list:
                aprt_link = li.xpath('./a/@href').extract_first()
                if not aprt_link:
                    # A request with an empty URL would raise and abort the
                    # remaining rows and the pagination; skip this row only.
                    logger.warning(f'listing without detail link skipped in {index_url}')
                    continue

                info = li.xpath('./div[@class="info"]')
                deal_cycle = './div[@class="dealCycleeInfo"]/span[@class="dealCycleTxt"]'
                agent_btn = './div[@class="agentInfoList"]/div[@class="agent_chat_btn im-talk LOGCLICKDATA"]'
                meta = {
                    'detail_url': aprt_link,
                    'detail_title': self.strJoin(
                        info.xpath('./div[@class="title"]/a/text()').extract()),
                    'detail_dirFurnish': self.strJoin(
                        info.xpath('./div[@class="address"]/div[@class="houseInfo"]/text()').extract()),
                    'detail_dealDate': self.strJoin(
                        info.xpath('.//div[@class="dealDate"]/text()').extract()),
                    'detail_floor': self.strJoin(
                        info.xpath('./div[@class="flood"]/div[@class="positionInfo"]/text()').extract()),
                    'detail_totalPrice': self.eleMissing(
                        info.xpath('./div[@class="address"]/div[@class="totalPrice"]/span[@class="number"]/text()').extract_first()),
                    'detail_unitPrice': self.eleMissing(
                        info.xpath('./div[@class="flood"]/div[@class="unitPrice"]/span[@class="number"]/text()').extract_first()),
                    'detail_sellpoint': self.eleMissing(
                        info.xpath('./div[@class="dealHouseInfo"]/span[@class="dealHouseTxt"]/span/text()').extract_first()),
                    'detail_listPrice': self.strJoin(
                        info.xpath(deal_cycle + '/span[1]/text()').extract()),
                    'aprt_dealCycle': self.eleMissing(
                        info.xpath(deal_cycle + '/span[2]/text()').extract_first()),
                    'index_url': index_url,
                    'detail_agent_name': self.eleMissing(
                        info.xpath('./div[@class="agentInfoList"]/a/text()').extract_first()),
                    'detail_agent_id': self.eleMissing(
                        info.xpath(agent_btn + '/@data-lj_action_agent_id').extract_first()),
                    'dont_redirect': True,
                    'referer': index_url,
                }
                yield scrapy.Request(url=aprt_link, meta=meta, callback=self.parse_item, dont_filter=False)

            if current_pgNum is not None and current_pgNum < total_pages:
                # (?!\d) keeps "/pg1" from matching inside "/pg10".
                next_url, replaced = re.subn(
                    rf'/pg{current_pgNum}(?!\d)', f'/pg{current_pgNum + 1}', index_url, count=1)
                if replaced:
                    logger.debug(f'next_url: {next_url}')
                    yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=False,
                                         meta={'dont_redirect': True, 'referer': index_url})
                else:
                    # Without a /pgN segment the "next" URL would equal the
                    # current one, so no request is issued.
                    logger.warning(f'no /pg{current_pgNum} segment in {index_url}; cannot build next page url')
        except Exception:
            # Best-effort boundary: one malformed page must not stop the
            # crawl, but the traceback is kept for diagnosis.
            logger.exception(f'failed to parse index page {index_url}')
    def parse_item(self, response):
        """Build one ``LianjiaItem`` from a deal detail page.

        Fields scraped from the index page arrive through ``response.meta``;
        the rest is read from the detail page via XPath and regular
        expressions over the page source. A field that cannot be found is
        stored as an empty string (or an empty dict) rather than aborting the
        whole item.

        Args:
            response: Response of a deal detail page, requested by ``parse``.

        Yields:
            LianjiaItem: the populated item.
        """
        meta = response.meta
        page_text = response.text

        def first_text(xpath):
            """Return the first node selected by *xpath*, or '' if none."""
            return self.eleMissing(response.xpath(xpath).extract_first())

        def all_texts(xpath):
            """Return every node selected by *xpath*, stripped."""
            return self.stripList(response.xpath(xpath).extract())

        def first_group(pattern):
            """Return group 1 of the first *pattern* match in the page, or ''."""
            found = re.search(pattern, page_text)
            return found.group(1) if found else ""

        logger.debug(f'request.meta: {response.request.meta} {response.url}')
        logger.debug(f'request.headers: {response.request.headers} {response.url}')

        msg_label = '//section[@class="wrapper"]//div[@class="msg"]/span[{}]/label/text()'
        base_li = '//section[@class="houseContentBox"]//div[@class="base"]/div[@class="content"]/ul/li'
        transaction_li = '//div[@class="transaction"]//div[@class="content"]/ul/li'
        community_div = ('//*[@id="resblockCardContainer"]/div[@class="newwrap"]/div[@class="xiaoquCard"]'
                         '/div[@class="xiaoqu_content clear"]/div[@class="xiaoqu_main fl"]/div')
        feature_div = '//*[@id="house_feature"]/div[@class="introContent showbasemore"]/div'

        item = LianjiaItem()
        item['index_url'] = meta['index_url']
        item['detail_url'] = meta['detail_url']
        # The house code is the file name of the detail URL without ".html".
        item['house_code'] = meta['detail_url'].split('/')[-1].split('.')[0]
        item['_id'] = item['house_code']
        item['detail_title'] = meta['detail_title']
        item['detail_dirFurnish'] = meta['detail_dirFurnish']
        item['detail_dealDate'] = meta['detail_dealDate']
        item['detail_floor'] = meta['detail_floor']
        item['detail_totalPrice'] = meta['detail_totalPrice']
        item['detail_unitPrice'] = meta['detail_unitPrice']
        # NOTE: meta also carries 'detail_sellpoint'; it is not stored on the item.
        item['detail_listPrice'] = meta['detail_listPrice']
        if not item['detail_listPrice']:
            # Fall back to the detail page when the index row had no list price.
            item['detail_listPrice'] = first_text(msg_label.format(1))
        item['aprt_dealCycle'] = meta['aprt_dealCycle']
        # Not every listing has an agent, so these may be empty strings.
        item['detail_agentName'] = meta['detail_agent_name']
        item['detail_agentId'] = meta['detail_agent_id']
        # Default each part separately so a missing node cannot break the
        # concatenation.
        item['detail_dealInfo'] = (first_text('//div[@class="wrapper"]/text()')
                                   + first_text('//div[@class="wrapper"]/span/text()'))
        item['detail_dealBread'] = self.strJoin(
            response.xpath('//section[@class="wrapper"]/div[@class="deal-bread"]/a/text()').extract())
        item['detail_priceChangeTimes'] = first_text(msg_label.format(3))
        item['detail_visitTimes'] = first_text(msg_label.format(4))
        item['detail_followers'] = first_text(msg_label.format(5))
        item['detail_viewTimes'] = first_text(msg_label.format(6))
        item['detail_basic_info'] = dict(zip(all_texts(base_li + '/span/text()'),
                                             all_texts(base_li + '/text()')))
        item['detail_transaction_info'] = dict(zip(all_texts(transaction_li + '/span/text()'),
                                                   all_texts(transaction_li + '/text()')))
        item['detail_transaction_history'] = self.strJoin(
            response.xpath('//*[@id="chengjiao_record"]/ul/li//text()').extract())
        # The community name is the first space-separated token of the title.
        item['community_name'] = item['detail_title'].split(' ')[0]
        # Accept any city sub-domain, not only "bj".
        item['community_url'] = first_group(r'(https://[a-z]+\.lianjia\.com/chengjiao/c\d+)')
        item['community_info'] = dict(zip(all_texts(community_div + '/label/text()'),
                                          all_texts(community_div + '/span/text()')))
        item['detail_features'] = dict(zip(all_texts(feature_div + '/div[@class="name"]/text()'),
                                           all_texts(feature_div + '/div[@class="content"]/a/text()')))
        # The values below are embedded in the page source as key:'value' pairs.
        item['resblockPosition'] = first_group(r"resblockPosition:'(\d+\.\d+,\d+\.\d+)")
        item['city_id'] = first_group(r"city_id: '(\d+)")
        # Capture only the quoted value, not the "city_name: '...'" wrapper.
        item['city_name'] = first_group(r"city_name: '([^']*)'")
        item['resblockId'] = first_group(r"resblockId:'(\d+)'")
        yield item
    def strJoin(self, element_list):
        """Concatenate the strings in *element_list* with no separator."""
        separator = ''
        return separator.join(element_list)
    def eleMissing(self, element):
        """Return *element* unchanged, or an empty string when it is None."""
        return "" if element is None else element
    def stripList(self, eleList):
        """Return a new list holding each entry of *eleList* after ``strip()``."""
        return [text.strip() for text in eleList]

发表评论

电子邮件地址不会被公开。 必填项已用*标注