How Scrapy scrapes a million items a day on a single machine

With incremental dedup solved, a proxy pool built from two ADSL dial-up VPSes can sustain 10-12 products per second, which works out to roughly a million items per day on a single machine (10/s ≈ 864,000/day, 12/s ≈ 1,036,800/day). To go faster you need more proxy IPs, which a few extra dial-up servers will cover. I covered further optimizations on my blog recently, so I won't repeat them here.
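For reference, a minimal settings.py sketch matching the benchmark runs below. The scrapy-redis scheduler/dupefilter, CLOSESPIDER_ITEMCOUNT and memdebug flags are inferred from the stats output; the Redis URL is an assumption.

# settings.py (sketch; values inferred from the stats below, Redis URL assumed)
CONCURRENT_REQUESTS = 24          # varied per run: 16 / 20 / 24 / 28 / 32
DOWNLOAD_TIMEOUT = 5              # fail fast on dead proxy IPs

# scrapy-redis: shared request queue + fingerprint set for incremental dedup
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"

# stop each benchmark run after ~1000 items ('finish_reason': 'closespider_itemcount')
CLOSESPIDER_ITEMCOUNT = 1000

# memory debugging, which produces the memdebug/* entries in the stats
MEMDEBUG_ENABLED = True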

CONCURRENT_REQUESTS = 20, DOWNLOAD_TIMEOUT = 5

{'downloader/exception_count': 83,
 'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 44,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 39,
 'downloader/request_bytes': 1126708,
 'downloader/request_count': 3208,
 'downloader/request_method_count/GET': 3208,
 'downloader/response_bytes': 3437494,
 'downloader/response_count': 3169,
 'downloader/response_status_count/200': 3169,
 'elapsed_time_seconds': 119.389706,  ##################################
 'finish_reason': 'closespider_itemcount',
 'finish_time': datetime.datetime(2020, 12, 20, 13, 29, 0, 592259),
 'item_scraped_count': 1002,
 'log_count/DEBUG': 4212,
 'log_count/INFO': 1231,
 'memdebug/gc_garbage_count': 0,
 'memdebug/live_refs/Product1Spider': 1,
 'memdebug/live_refs/ProductItem': 3,
 'memdebug/live_refs/Request': 1,
 'memusage/max': 81879040,
 'memusage/startup': 55734272,
 'request_depth_max': 161,
 'response_received_count': 3169,
 'retry/count': 39,
 'retry/reason_count/twisted.internet.error.TimeoutError': 39,
 'scheduler/dequeued/redis': 3252,
 'scheduler/enqueued/redis': 2633,
 'start_time': datetime.datetime(2020, 12, 20, 13, 27, 1, 202553)}

CONCURRENT_REQUESTS = 16, DOWNLOAD_TIMEOUT = 5

{'downloader/exception_count': 42,
 'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 2,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 40,
 'downloader/request_bytes': 1093102,
 'downloader/request_count': 3165,
 'downloader/request_method_count/GET': 3165,
 'downloader/response_bytes': 3268017,
 'downloader/response_count': 3125,
 'downloader/response_status_count/200': 3125,
 'elapsed_time_seconds': 148.237574, ####################################
 'finish_reason': 'closespider_itemcount',
 'finish_time': datetime.datetime(2020, 12, 20, 13, 40, 57, 779393),
 'item_scraped_count': 1004,
 'log_count/DEBUG': 4171,
 'log_count/INFO': 1201,
 'memdebug/gc_garbage_count': 0,
 'memdebug/live_refs/Product1Spider': 1,
 'memdebug/live_refs/ProductItem': 5,
 'memdebug/live_refs/Request': 1,
 'memusage/max': 89427968,
 'memusage/startup': 55738368,
 'request_depth_max': 159,
 'response_received_count': 3125,
 'retry/count': 40,
 'retry/reason_count/twisted.internet.error.TimeoutError': 40,
 'scheduler/dequeued/redis': 3167,
 'scheduler/enqueued/redis': 2565,
 'start_time': datetime.datetime(2020, 12, 20, 13, 38, 29, 541819)}

CONCURRENT_REQUESTS = 24, DOWNLOAD_TIMEOUT = 5

{'downloader/exception_count': 35,
 'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 11,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 24,
 'downloader/request_bytes': 909800,
 'downloader/request_count': 2550,
 'downloader/request_method_count/GET': 2550,
 'downloader/response_bytes': 2190768,
 'downloader/response_count': 2526,
 'downloader/response_status_count/200': 2526,
 'elapsed_time_seconds': 79.041858, ###########################
 'finish_reason': 'closespider_itemcount',
 'finish_time': datetime.datetime(2020, 12, 20, 13, 49, 15, 852147),
 'item_scraped_count': 1009,
 'log_count/DEBUG': 3561,
 'log_count/INFO': 1083,
 'memdebug/gc_garbage_count': 0,
 'memdebug/live_refs/Product1Spider': 1,
 'memdebug/live_refs/ProductItem': 10,
 'memdebug/live_refs/Request': 1,
 'memusage/max': 80408576,
 'memusage/startup': 55742464,
 'request_depth_max': 160,
 'response_received_count': 2526,
 'retry/count': 24,
 'retry/reason_count/twisted.internet.error.TimeoutError': 24,
 'scheduler/dequeued/redis': 2561,
 'scheduler/enqueued/redis': 1752,
 'start_time': datetime.datetime(2020, 12, 20, 13, 47, 56, 810289)}

CONCURRENT_REQUESTS = 28, DOWNLOAD_TIMEOUT = 5

{'downloader/exception_count': 73,
 'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 46,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 27,
 'downloader/request_bytes': 1116668,
 'downloader/request_count': 3260,
 'downloader/request_method_count/GET': 3260,
 'downloader/response_bytes': 3951091,
 'downloader/response_count': 3233,
 'downloader/response_status_count/200': 3233,
 'elapsed_time_seconds': 81.211134, ###################################
 'finish_reason': 'closespider_itemcount',
 'finish_time': datetime.datetime(2020, 12, 20, 13, 51, 51, 976231),
 'item_scraped_count': 1004,   ########################################
 'log_count/DEBUG': 4266,
 'log_count/INFO': 1115,
 'memdebug/gc_garbage_count': 0,
 'memdebug/live_refs/Product1Spider': 1,
 'memdebug/live_refs/ProductItem': 5,
 'memdebug/live_refs/Request': 1,
 'memusage/max': 88412160,
 'memusage/startup': 55717888,
 'request_depth_max': 160,
 'response_received_count': 3233,
 'retry/count': 27,
 'retry/reason_count/twisted.internet.error.TimeoutError': 27,
 'scheduler/dequeued/redis': 3306,
 'scheduler/enqueued/redis': 2600,
 'start_time': datetime.datetime(2020, 12, 20, 13, 50, 30, 765097)}

CONCURRENT_REQUESTS = 32, DOWNLOAD_TIMEOUT = 5

{'downloader/exception_count': 83,
 'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 8,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 75,
 'downloader/request_bytes': 1603675,
 'downloader/request_count': 4645,
 'downloader/request_method_count/GET': 4645,
 'downloader/response_bytes': 5979015,
 'downloader/response_count': 4570,
 'downloader/response_status_count/200': 4570,
 'elapsed_time_seconds': 163.676522,
 'finish_reason': 'closespider_itemcount',
 'finish_time': datetime.datetime(2020, 12, 20, 13, 58, 20, 996155),
 'item_scraped_count': 1004,
 'log_count/DEBUG': 5651,
 'log_count/INFO': 1170,
 'memdebug/gc_garbage_count': 0,
 'memdebug/live_refs/Product1Spider': 1,
 'memdebug/live_refs/ProductItem': 5,
 'memdebug/live_refs/Request': 1,
 'memusage/max': 80977920,
 'memusage/startup': 55742464,
 'request_depth_max': 158,
 'response_received_count': 4570,
 'retry/count': 75,
 'retry/reason_count/twisted.internet.error.TimeoutError': 75,
 'scheduler/dequeued/redis': 4653,
 'scheduler/enqueued/redis': 4321,
 'start_time': datetime.datetime(2020, 12, 20, 13, 55, 37, 319633)}
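A quick back-of-the-envelope comparison of the five runs above, computed from item_scraped_count and elapsed_time_seconds:

# throughput.py - items/second for each CONCURRENT_REQUESTS run, numbers taken from the stats above
runs = {
    16: (1004, 148.237574),
    20: (1002, 119.389706),
    24: (1009, 79.041858),
    28: (1004, 81.211134),
    32: (1004, 163.676522),
}

for concurrency, (items, seconds) in runs.items():
    print(f"CONCURRENT_REQUESTS={concurrency}: {items / seconds:.1f} items/s")

# 16 -> 6.8, 20 -> 8.4, 24 -> 12.8, 28 -> 12.4, 32 -> 6.1
# 24-28 is the sweet spot here; pushing to 32 mostly adds timeouts and retries through the proxy pool.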

nginx access authentication for scrapyd – address already in use

Two commands are useful here for finding what is already listening on the port: sudo netstat -ntpl and sudo netstat -tulpn:

(boss) [chen@VM_0_3_centos conf.d]$ sudo netstat -ntpl
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address           Foreign Address         State       PID/Program name    
tcp        0      0 0.0.0.0:80              0.0.0.0:*               LISTEN      5013/nginx: worker  
tcp        0      0 0.0.0.0:6801            0.0.0.0:*               LISTEN      5013/nginx: worker  
tcp        0      0 0.0.0.0:22              0.0.0.0:*               LISTEN      22677/sshd          
tcp        0      0 127.0.0.1:25            0.0.0.0:*               LISTEN      1310/master         
tcp6       0      0 :::80                   :::*                    LISTEN      5013/nginx: worker  
tcp6       0      0 ::1:25                  :::*                    LISTEN      1310/master         
tcp6       0      0 :::9000                 :::*                    LISTEN      29245/dockerd       
tcp6       0      0 :::2377                 :::*                    LISTEN      29245/dockerd       
tcp6       0      0 :::7946                 :::*                    LISTEN      29245/dockerd

(boss) [chen@VM_0_3_centos conf.d]$ sudo netstat -tulpn
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address           Foreign Address         State       PID/Program name    
tcp        0      0 127.0.0.1:6800          0.0.0.0:*               LISTEN      5237/python         
tcp        0      0 0.0.0.0:22              0.0.0.0:*               LISTEN      22677/sshd          
tcp        0      0 127.0.0.1:25            0.0.0.0:*               LISTEN      1310/master         
tcp6       0      0 ::1:25                  :::*                    LISTEN      1310/master         
tcp6       0      0 :::9000                 :::*                    LISTEN      29245/dockerd       
tcp6       0      0 :::2377                 :::*                    LISTEN      29245/dockerd       
tcp6       0      0 :::7946                 :::*                    LISTEN      29245/dockerd       
udp        0      0 0.0.0.0:4789            0.0.0.0:*                           -                   
udp        0      0 0.0.0.0:68              0.0.0.0:*                           927/dhclient        
udp        0      0 172.16.0.3:123          0.0.0.0:*                           4636/ntpd           
udp        0      0 127.0.0.1:123           0.0.0.0:*                           4636/ntpd           
udp        0      0 0.0.0.0:514             0.0.0.0:*                           27343/rsyslogd      
udp6       0      0 :::7946                 :::*                                29245/dockerd       
udp6       0      0 fe80::5054:ff:fe36::123 :::*                                4636/ntpd           
udp6       0      0 ::1:123                 :::*                                4636/ntpd           
udp6       0      0 :::514                  :::*                                27343/rsyslogd  

For technical problems, Stack Overflow is still the most reliable resource: https://stackoverflow.com/questions/42303401/nginx-will-not-start-address-already-in-use

The nginx reverse proxy that puts access authentication in front of scrapyd:

# For more information on configuration, see:
#   * Official English Documentation: http://nginx.org/en/docs/
#   * Official Russian Documentation: http://nginx.org/ru/docs/

user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log;
pid /var/run/nginx.pid;

# Load dynamic modules. See /usr/share/doc/nginx/README.dynamic.
include /usr/share/nginx/modules/*.conf;

events {
    worker_connections 1024;
}

http {
    log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
                      '$status $body_bytes_sent "$http_referer" '
                      '"$http_user_agent" "$http_x_forwarded_for"';

    access_log  /var/log/nginx/access.log  main;

    sendfile            on;
    tcp_nopush          on;
    tcp_nodelay         on;
    keepalive_timeout   65;
    types_hash_max_size 2048;

    include             /etc/nginx/mime.types;
    default_type        application/octet-stream;

    # Load modular configuration files from the /etc/nginx/conf.d directory.
    # See http://nginx.org/en/docs/ngx_core_module.html#include
    # for more information.
    include /etc/nginx/conf.d/*.conf;
    server {
        listen 6801;
        server_name localhost;
        #charset koi8-r;
        #access_log logs/host.access.log main;
        location / {
            proxy_pass http://127.0.0.1:6800/;
            auth_basic "Restricted";
            auth_basic_user_file /etc/nginx/conf.d/pswdScrapyd;
        }
    }

# Settings for a TLS enabled server.
#
#    server {
#        listen       443 ssl http2 default_server;
#        listen       [::]:443 ssl http2 default_server;
#        server_name  _;
#        root         /usr/share/nginx/html;
#
#        ssl_certificate "/etc/pki/nginx/server.crt";
#        ssl_certificate_key "/etc/pki/nginx/private/server.key";
#        ssl_session_cache shared:SSL:1m;
#        ssl_session_timeout  10m;
#        ssl_ciphers HIGH:!aNULL:!MD5;
#        ssl_prefer_server_ciphers on;
#
#        # Load configuration files for the default server block.
#        include /etc/nginx/default.d/*.conf;
#
#        location / {
#        }
#
#        error_page 404 /404.html;
#        location = /404.html {
#        }
#
#        error_page 500 502 503 504 /50x.html;
#        location = /50x.html {
#        }
#    }

}

The project's scrapy.cfg then needs a [deploy] section pointing at the proxied port:

[deploy]
url = http://127.0.0.1:6801/
project = jingdong
username = user
password = password

The username and password here are the ones generated with htpasswd, e.g. sudo htpasswd -c /etc/nginx/conf.d/pswdScrapyd user (the file path has to match the auth_basic_user_file above).
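To confirm the proxy and basic auth are working, a quick check against scrapyd's daemonstatus endpoint; the host/port and user/password pair are whatever you configured above:

# check_scrapyd.py - hit scrapyd through the nginx proxy with HTTP Basic auth
import requests

resp = requests.get(
    "http://127.0.0.1:6801/daemonstatus.json",
    auth=("user", "password"),   # the htpasswd credentials
    timeout=5,
)
resp.raise_for_status()
print(resp.json())   # e.g. {"status": "ok", "running": 0, "pending": 0, "finished": 0}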