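解决了增量去重的问题后,由两台 ADSL VPS 搭建的代理池可以支撑每秒 10-12 个商品的爬取速度,基本达到了单机日均百万的采集量。想要更快就需要更多代理 IP,再加几台拨号服务器即可解决。更多优化细节我最近在博客里已经提到过,这里就不多说了。
以下是 DOWNLOAD_TIMEOUT = 5 时,不同 CONCURRENT_REQUESTS 设置下各采集约 1000 个商品的 Scrapy 统计结果(以 # 标记的行是重点关注的指标):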
CONCURRENT_REQUESTS = 20, DOWNLOAD_TIMEOUT = 5
{'downloader/exception_count': 83,
'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 44,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 39,
'downloader/request_bytes': 1126708,
'downloader/request_count': 3208,
'downloader/request_method_count/GET': 3208,
'downloader/response_bytes': 3437494,
'downloader/response_count': 3169,
'downloader/response_status_count/200': 3169,
'elapsed_time_seconds': 119.389706, ##################################
'finish_reason': 'closespider_itemcount',
'finish_time': datetime.datetime(2020, 12, 20, 13, 29, 0, 592259),
'item_scraped_count': 1002,
'log_count/DEBUG': 4212,
'log_count/INFO': 1231,
'memdebug/gc_garbage_count': 0,
'memdebug/live_refs/Product1Spider': 1,
'memdebug/live_refs/ProductItem': 3,
'memdebug/live_refs/Request': 1,
'memusage/max': 81879040,
'memusage/startup': 55734272,
'request_depth_max': 161,
'response_received_count': 3169,
'retry/count': 39,
'retry/reason_count/twisted.internet.error.TimeoutError': 39,
'scheduler/dequeued/redis': 3252,
'scheduler/enqueued/redis': 2633,
'start_time': datetime.datetime(2020, 12, 20, 13, 27, 1, 202553)}
CONCURRENT_REQUESTS = 16, DOWNLOAD_TIMEOUT = 5
{'downloader/exception_count': 42,
'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 2,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 40,
'downloader/request_bytes': 1093102,
'downloader/request_count': 3165,
'downloader/request_method_count/GET': 3165,
'downloader/response_bytes': 3268017,
'downloader/response_count': 3125,
'downloader/response_status_count/200': 3125,
'elapsed_time_seconds': 148.237574, ####################################
'finish_reason': 'closespider_itemcount',
'finish_time': datetime.datetime(2020, 12, 20, 13, 40, 57, 779393),
'item_scraped_count': 1004,
'log_count/DEBUG': 4171,
'log_count/INFO': 1201,
'memdebug/gc_garbage_count': 0,
'memdebug/live_refs/Product1Spider': 1,
'memdebug/live_refs/ProductItem': 5,
'memdebug/live_refs/Request': 1,
'memusage/max': 89427968,
'memusage/startup': 55738368,
'request_depth_max': 159,
'response_received_count': 3125,
'retry/count': 40,
'retry/reason_count/twisted.internet.error.TimeoutError': 40,
'scheduler/dequeued/redis': 3167,
'scheduler/enqueued/redis': 2565,
'start_time': datetime.datetime(2020, 12, 20, 13, 38, 29, 541819)}
CONCURRENT_REQUESTS = 24, DOWNLOAD_TIMEOUT = 5
{'downloader/exception_count': 35,
'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 11,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 24,
'downloader/request_bytes': 909800,
'downloader/request_count': 2550,
'downloader/request_method_count/GET': 2550,
'downloader/response_bytes': 2190768,
'downloader/response_count': 2526,
'downloader/response_status_count/200': 2526,
'elapsed_time_seconds': 79.041858, ###########################
'finish_reason': 'closespider_itemcount',
'finish_time': datetime.datetime(2020, 12, 20, 13, 49, 15, 852147),
'item_scraped_count': 1009,
'log_count/DEBUG': 3561,
'log_count/INFO': 1083,
'memdebug/gc_garbage_count': 0,
'memdebug/live_refs/Product1Spider': 1,
'memdebug/live_refs/ProductItem': 10,
'memdebug/live_refs/Request': 1,
'memusage/max': 80408576,
'memusage/startup': 55742464,
'request_depth_max': 160,
'response_received_count': 2526,
'retry/count': 24,
'retry/reason_count/twisted.internet.error.TimeoutError': 24,
'scheduler/dequeued/redis': 2561,
'scheduler/enqueued/redis': 1752,
'start_time': datetime.datetime(2020, 12, 20, 13, 47, 56, 810289)}
CONCURRENT_REQUESTS = 28, DOWNLOAD_TIMEOUT = 5
{'downloader/exception_count': 73,
'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 46,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 27,
'downloader/request_bytes': 1116668,
'downloader/request_count': 3260,
'downloader/request_method_count/GET': 3260,
'downloader/response_bytes': 3951091,
'downloader/response_count': 3233,
'downloader/response_status_count/200': 3233,
'elapsed_time_seconds': 81.211134, ###################################
'finish_reason': 'closespider_itemcount',
'finish_time': datetime.datetime(2020, 12, 20, 13, 51, 51, 976231),
'item_scraped_count': 1004, ########################################
'log_count/DEBUG': 4266,
'log_count/INFO': 1115,
'memdebug/gc_garbage_count': 0,
'memdebug/live_refs/Product1Spider': 1,
'memdebug/live_refs/ProductItem': 5,
'memdebug/live_refs/Request': 1,
'memusage/max': 88412160,
'memusage/startup': 55717888,
'request_depth_max': 160,
'response_received_count': 3233,
'retry/count': 27,
'retry/reason_count/twisted.internet.error.TimeoutError': 27,
'scheduler/dequeued/redis': 3306,
'scheduler/enqueued/redis': 2600,
'start_time': datetime.datetime(2020, 12, 20, 13, 50, 30, 765097)}
CONCURRENT_REQUESTS = 32, DOWNLOAD_TIMEOUT = 5
{'downloader/exception_count': 83,
'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 8,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 75,
'downloader/request_bytes': 1603675,
'downloader/request_count': 4645,
'downloader/request_method_count/GET': 4645,
'downloader/response_bytes': 5979015,
'downloader/response_count': 4570,
'downloader/response_status_count/200': 4570,
'elapsed_time_seconds': 163.676522,
'finish_reason': 'closespider_itemcount',
'finish_time': datetime.datetime(2020, 12, 20, 13, 58, 20, 996155),
'item_scraped_count': 1004,
'log_count/DEBUG': 5651,
'log_count/INFO': 1170,
'memdebug/gc_garbage_count': 0,
'memdebug/live_refs/Product1Spider': 1,
'memdebug/live_refs/ProductItem': 5,
'memdebug/live_refs/Request': 1,
'memusage/max': 80977920,
'memusage/startup': 55742464,
'request_depth_max': 158,
'response_received_count': 4570,
'retry/count': 75,
'retry/reason_count/twisted.internet.error.TimeoutError': 75,
'scheduler/dequeued/redis': 4653,
'scheduler/enqueued/redis': 4321,
'start_time': datetime.datetime(2020, 12, 20, 13, 55, 37, 319633)}