import logging

import redis
from scrapy.exceptions import IgnoreRequest

from myproject import settings  # adjust to your project package; must define REDIS_HOST / REDIS_PWD


class DedupeMiddleware(object):  # custom dedupe downloader middleware
    logger = logging.getLogger(__name__)  # convenient handle for Scrapy log output
    client = redis.Redis(host=settings.REDIS_HOST, port=6379, db=0,
                         password=settings.REDIS_PWD)
    # redis.Redis() connects lazily and the instance is always truthy, so a
    # plain "if client:" check proves nothing; ping() actually verifies the
    # connection.
    try:
        client.ping()
        logger.info('redis connected for dedupe')
    except redis.exceptions.ConnectionError:
        logger.error('redis connect failed for dedupe')

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # Compare the request URL against the 'existing_url' hash in Redis:
        # if it is there, the page was already stored in MySQL, so drop the
        # request; otherwise return None and let it continue downstream.
        if self.client.hexists('existing_url', request.url):
            self.logger.info(f'{request.url} already in mysql')
            raise IgnoreRequest(f'{request.url} already in mysql')
        return None
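For this to take effect, the middleware has to be registered in the project settings, and something has to populate the existing_url hash; typically that is the pipeline that writes items into MySQL. A minimal sketch, assuming a project package named myproject and an item with a url field (both placeholders, not from the source):

# settings.py -- enable the middleware (module path is an assumption)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.DedupeMiddleware': 543,
}

# pipelines.py -- hypothetical pipeline that marks a URL as seen
# only after the item was successfully written to MySQL
import redis

from myproject import settings


class MysqlDedupePipeline(object):
    client = redis.Redis(host=settings.REDIS_HOST, port=6379, db=0,
                         password=settings.REDIS_PWD)

    def process_item(self, item, spider):
        # ... INSERT the item into MySQL here (placeholder) ...
        self.client.hset('existing_url', item['url'], 1)
        return item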
splash:
  image: scrapinghub/splash
  ports:
    - "8050:8050"
  deploy:
    mode: replicated
    replicas: 2
    labels: [APP=SPLASH]
    # service resource management
    resources:
      # Hard limit - Docker does not allow the service to allocate more
      limits:
        cpus: '0.25'
        memory: 2048M
      # Soft limit - Docker makes a best effort to return to it;
      # the reservation must not exceed the hard limit above
      reservations:
        cpus: '0.25'
        memory: 1024M
    # service restart policy
    restart_policy:
      condition: any
      delay: 5s
      max_attempts: 3
      window: 120s
    # placement constraint - in this case on 'worker' nodes only
    placement:
      constraints: [node.role == worker]
(boss) [chen@VM_0_3_centos aquarium]$ docker stack deploy -c docker-compose.yml splash
Updating service splash_visualizer (id: lcfw3l45xkvewmly5yz7ukywh)
Updating service splash_splash (id: v1s5gbl9nfzmmp1hct6c8okce)
# HAProxy 1.7 config for Splash. It assumes Splash instances are executed
# on the same machine and connected to HAProxy using Docker links.
global
    # raise it if necessary
    maxconn 512
    # required for stats page
    stats socket /tmp/haproxy

userlist users
    user USER insecure-password PASSWD
defaults
    log global
    mode http

    # remove requests from a queue when clients disconnect;
    # see https://cbonte.github.io/haproxy-dconv/1.7/configuration.html#4.2-option%20abortonclose
    option abortonclose

    # gzip can save quite a lot of traffic with json, html or base64 data
    compression algo gzip
    compression type text/html text/plain application/json

    # increase these values if you want to
    # allow longer request queues in HAProxy
    timeout connect 3600s
    timeout client 3600s
    timeout server 3600s
# visit 0.0.0.0:8036 to see HAProxy stats page
listen stats
    bind *:8036
    mode http
    stats enable
    stats hide-version
    stats show-legends
    stats show-desc Splash Cluster
    stats uri /
    stats refresh 10s
    stats realm Haproxy\ Statistics
    stats auth admin:adminpass
# Splash Cluster configuration
# the proxy listens globally on port 8050
frontend http-in
    bind *:8050

    # To enable access authentication for Splash, comment out
    # "default_backend splash-cluster" below and uncomment the lines
    # above it; the credentials are the USER / PASSWD pair defined
    # in the userlist above.
#    acl auth_ok http_auth(users)
#    http-request auth realm Splash if !auth_ok
#    http-request allow if auth_ok
#    http-request deny

#    acl staticfiles path_beg /_harviewer/
#    acl misc path / /info /_debug /debug

#    use_backend splash-cluster if auth_ok !staticfiles !misc
#    use_backend splash-misc if auth_ok staticfiles
#    use_backend splash-misc if auth_ok misc
    default_backend splash-cluster
backend splash-cluster
    option httpchk GET /
    balance leastconn
    # try another instance when connection is dropped
    retries 2
    option redispatch
    # replace the IP addresses below with your own Splash hosts and ports,
    # and add any remaining Splash servers one by one in the same format
    server splash-0 SPLASH0_IP:8050 check maxconn 50 inter 2s fall 10 observe layer4
    server splash-1 SPLASH1_IP:8050 check maxconn 50 inter 2s fall 10 observe layer4
backend splash-misc
    balance roundrobin
    # replace the IP addresses below with your own Splash hosts and ports,
    # and add any remaining Splash servers one by one in the same format
    server splash-0 SPLASH0_IP:8050 check fall 15
    server splash-1 SPLASH1_IP:8050 check fall 15
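Once HAProxy is up, every request that hits port 8050 is balanced across the Splash replicas. A quick smoke test with the requests library, as a sketch (HAPROXY_HOST is a placeholder; drop the auth argument if authentication stays disabled):

import requests

# render a page through the load-balanced Splash endpoint
resp = requests.get(
    'http://HAPROXY_HOST:8050/render.html',
    params={'url': 'https://example.com', 'wait': 2},
    auth=('USER', 'PASSWD'),  # only needed once the auth block is uncommented
)
print(resp.status_code, len(resp.text))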
Important: with authentication enabled, pass the credentials when launching the spider:

scrapy crawl SPIDER -a http_user='USER' -a http_pass='PASSWD'
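The same credentials can also live in the spider itself instead of on the command line; a minimal sketch, assuming scrapy-splash is in use and pointed at the HAProxy front end (MySpider and HAPROXY_HOST are placeholders):

# settings.py -- point scrapy-splash at the HAProxy front end
SPLASH_URL = 'http://HAPROXY_HOST:8050'

# spider -- class attributes picked up by Scrapy's built-in
# HttpAuthMiddleware, equivalent to the -a flags above
import scrapy


class MySpider(scrapy.Spider):
    name = 'SPIDER'
    http_user = 'USER'
    http_pass = 'PASSWD'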