Crawler Proxy Monitoring – IP proxy monitor

If you run a proxy pool on top of many ADSL dial-up VPSes, monitoring their logs is essential. Based on my dialing script, I wrote a monitor that tracks the running status of my 7 servers, and it works nicely. See the figures:

[Figure: a fairly ideal IP proxy server]
[Figure: an IP proxy server of acceptable quality]
[Figure: a relatively poor IP proxy server]

The visualized charts make it easy to see the quality of the IPs each server dials out. Code at https://github.com/chenxuzhen/AdslProxy

# Jupyter notebook magic: select the Qt backend for interactive plots
%matplotlib qt
import re
import pymysql
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import paramiko
import numpy as np

# %matplotlib notebook

# log_file = r'E:\splash\AdslProxy\proxy_reboot.log'
log_file = '/root/proxy_reboot.log'

# MySQL connection placeholders – fill in your own host and password
HOST_IP = 'your.mysql.host'
PASSWORD = 'your_mysql_password'

class PROXY_MON(object):
    def __init__(self, hostname, port, username, password, adsl_num):
        # Server info: hostname (IP address), port, username and password
        self.adsl = adsl_num
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname, port, username, password, compress=True, timeout=10)
        self.sftp_client = client.open_sftp()
    def log_check(self):
        try:
            with self.sftp_client.open(log_file, 'r') as file:
                contents = file.read().decode()
                print(contents[-200:])
                dial_times = re.findall('Dial started', contents)
                print(f'total IPs dialed: {len(dial_times)}')
                repeat_ips = re.findall('2 times', contents)
                print(f'num of repeat IPs: {len(repeat_ips)}')
                success_ips = re.findall('Successfully set', contents)
                print(f'num of successful IPs set to redis: {len(success_ips)}')
                dial_failed = re.findall('Get IP failed', contents)
                print(f'num of failed dialings: {len(dial_failed)}')
                valid_ip = re.findall('Valid proxy', contents)
                print(f'num of valid proxy IPs: {len(valid_ip)}')
                invalid_ip = re.findall('Proxy invalid', contents)
                print(f'num of invalid proxy IPs: {len(invalid_ip)}')
                consec_ip_repeat = re.findall('IP和上次相同', contents)
                print(f'num of consecutive repeat IPs dialed: {len(consec_ip_repeat)}')
                reboot_ip_del_failure = re.findall('删除IP失败!从代理池删除IP并重启系统', contents)
                print(f'num of reboots due to deletion failure from redis: {len(reboot_ip_del_failure)}')
                reboot_ip_3dial_failure = re.findall('连续三次拨号失败!从代理池删除IP并重启系统', contents)
                print(f'num of reboots due to 3 consecutive dial failures: {len(reboot_ip_3dial_failure)}')
        except Exception as e:
            # the with-block closes the file automatically; bail out so the
            # code below does not reference undefined variables
            print(e)
            print(self.adsl)
            return

        proxy_stats = [len(dial_times), len(repeat_ips), len(success_ips), len(dial_failed), len(valid_ip), len(invalid_ip), len(consec_ip_repeat), len(reboot_ip_del_failure), len(reboot_ip_3dial_failure)]
        column_names = ['dial_times', 'repeat_ips', 'success_ips', 'dial_failed', 'valid_ip', 'invalid_ip', 'consec_ip_repeat', 'reboot_ip_del_failure', 'reboot_ip_3dial_failure']
        df = pd.DataFrame([proxy_stats, column_names]).transpose()
        df.columns = ['proxy_stats', 'stats_names']
        # the transpose leaves the counts as object dtype; barplot needs numbers
        df['proxy_stats'] = df['proxy_stats'].astype(int)

        # one row for the MySQL upsert: server name followed by the nine counts
        proxy_stats = [self.adsl] + proxy_stats

        # Write the log summary to MySQL; this can run locally or on a remote server
        db_conn = pymysql.connect(host=HOST_IP, port=3306, user='root', passwd=PASSWORD, db='proxy', charset='utf8mb4')
        cur = db_conn.cursor()
        # upsert keyed on `server`; the VALUES ... AS new alias requires MySQL 8.0.19+
        insert_sql = """INSERT INTO stats(server, dial_times, repeat_ips, success_ips, dial_failed, valid_ip, invalid_ip,
                consec_ip_repeat, reboot_ip_del_failure, reboot_ip_3dial_failure)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) AS new
                ON DUPLICATE KEY UPDATE
                dial_times=new.dial_times, repeat_ips=new.repeat_ips, success_ips=new.success_ips,
                dial_failed=new.dial_failed, valid_ip=new.valid_ip, invalid_ip=new.invalid_ip,
                consec_ip_repeat=new.consec_ip_repeat, reboot_ip_del_failure=new.reboot_ip_del_failure,
                reboot_ip_3dial_failure=new.reboot_ip_3dial_failure"""
        cur.execute(insert_sql, proxy_stats)
        db_conn.commit()
        cur.close()
        db_conn.close()
        figure, ax = plt.subplots(1, 1, figsize=(16, 8))
        plt.ion()
        sns.barplot(x='stats_names',
                    y='proxy_stats',
                    data=df,
                    ax=ax).set_title(self.adsl + '_proxy_quality_monitor')

        plt.xticks(rotation=30)
        plt.tight_layout()
        self.show_values_on_bars(ax)
        # Show the plot briefly, save a snapshot, then close the figure
        figure.show()
        plt.pause(10)
        figure.savefig('E:/splash/AdslProxy/' + self.adsl + '_proxy_quality_monitor' + '.jpg')
        figure.clf()
        plt.close()
    def show_values_on_bars(self, axs):
        # annotate each bar with its count, placed just above the bar top
        def _show_on_single_plot(ax):
            for p in ax.patches:
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + p.get_height()
                value = '{:.0f}'.format(p.get_height())
                ax.text(_x, _y, value, ha="center")

        if isinstance(axs, np.ndarray):
            for idx, ax in np.ndenumerate(axs):
                _show_on_single_plot(ax)
        else:
            _show_on_single_plot(axs)

        
if __name__ == "__main__":
    # one tuple per ADSL server: (hostname, port, username, password, name);
    # the values below are placeholders – 22 assumes the default SSH port
    servers = [('IP', 22, 'root', 'PASSWD', 'adsl5'),
               ]

    for server in servers:
        proxy_monitor = PROXY_MON(*server)
        proxy_monitor.log_check()
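The upsert assumes a stats table in the proxy database with a unique key on server. The post does not show the schema, so here is a minimal sketch that satisfies the INSERT ... ON DUPLICATE KEY UPDATE above; the table and column names are taken from the query, while the column types are assumptions:

import pymysql

# minimal schema sketch: `server` is the primary key, so the
# ON DUPLICATE KEY UPDATE overwrites the previous run's counts
create_sql = """CREATE TABLE IF NOT EXISTS stats(
        server VARCHAR(32) PRIMARY KEY,
        dial_times INT, repeat_ips INT, success_ips INT, dial_failed INT,
        valid_ip INT, invalid_ip INT, consec_ip_repeat INT,
        reboot_ip_del_failure INT, reboot_ip_3dial_failure INT)"""

# HOST_IP and PASSWORD are the same placeholders used above
db_conn = pymysql.connect(host=HOST_IP, port=3306, user='root', passwd=PASSWORD,
                          db='proxy', charset='utf8mb4')
with db_conn.cursor() as cur:
    cur.execute(create_sql)
db_conn.commit()
db_conn.close()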

scrapyd – Couldn’t bind: 24: Too many open files

Google turns up very little about this error; only the two links below were of any help:

http://stackoverflow.com/questions/19995855/increase-max-open-files-for-

https://groups.google.com/g/scrapy-users/c/rArexq4tI7I

The root cause is that my scrapyd is started as a service (set up that way originally to make process management and restarts easier). Started like this, scrapyd does not inherit the shell's ulimit settings, so the usual online advice of raising ulimit's open files has no effect. My system's open files limit, for instance, is already high enough that even high-concurrency scrapy should in theory be fine:

(boss) [chen@VM_0_2_centos product_1]$ ulimit -a
core file size          (blocks, -c) 4194304
data seg size           (kbytes, -d) unlimited
scheduling priority             (-e) 0
file size               (blocks, -f) unlimited
pending signals                 (-i) 31143
max locked memory       (kbytes, -l) 64
max memory size         (kbytes, -m) unlimited
open files                      (-n) 1000001
pipe size            (512 bytes, -p) 8
POSIX message queues     (bytes, -q) 819200
real-time priority              (-r) 0
stack size              (kbytes, -s) 8192
cpu time               (seconds, -t) unlimited
max user processes              (-u) 31143
virtual memory          (kbytes, -v) unlimited
file locks                      (-x) unlimited
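A quick way to see the mismatch is to print the limit from inside the Python process itself rather than from a shell. A minimal sketch using the standard-library resource module, runnable anywhere inside the scrapyd process (e.g. from a Scrapy extension):

import resource

# the limits the current *process* actually sees; for a service-started
# scrapyd these can differ from what `ulimit -a` shows in a login shell
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f'max open files: soft={soft}, hard={hard}')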

The fix – set the open file limit for scrapyd individually:

Find scrapyd's PID (1011 in this example), then run:

sudo prlimit --pid 1011 --nofile=1000000:1000000

Finding the PID:
(boss) [chen@VM_0_2_centos product_1]$ ps -ef|grep scrapyd
chen      1011     1  1 Dec23 ?        00:39:50 /home/chen/anaconda3/envs/boss/bin/python /home/chen/anaconda3/envs/boss/bin/scrapyd
chen      4476  1011 19 Dec25 ?        00:23:58 /home/chen/anaconda3/envs/boss/bin/python -m scrapyd.runner crawl jd_comment -a _job=e78327f046c611eba6e5bd9b7c1c3b18
chen     10838  1011 23 Dec25 ?        01:41:02 /home/chen/anaconda3/envs/boss/bin/python -m scrapyd.runner crawl product_1 -a _job=45e566ae469b11eba6e5bd9b7c1c3b18
chen     14434 21260  0 01:38 pts/3    00:00:00 grep --color=auto scrapyd
Checking the limits of the process with that PID:
(boss) [chen@VM_0_2_centos product_1]$ cat /proc/1011/limits
Limit                     Soft Limit           Hard Limit           Units     
Max cpu time              unlimited            unlimited            seconds   
Max file size             unlimited            unlimited            bytes     
Max data size             unlimited            unlimited            bytes     
Max stack size            8388608              unlimited            bytes     
Max core file size        0                    unlimited            bytes     
Max resident set          unlimited            unlimited            bytes     
Max processes             31143                31143                processes 
Max open files            1000000              1000000              files     
Max locked memory         65536                65536                bytes     
Max address space         unlimited            unlimited            bytes     
Max file locks            unlimited            unlimited            locks     
Max pending signals       31143                31143                signals   
Max msgqueue size         819200               819200               bytes     
Max nice priority         0                    0                    
Max realtime priority     0                    0                    
Max realtime timeout      unlimited            unlimited            us
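
One caveat: prlimit only modifies the already-running process, so the command has to be repeated whenever scrapyd restarts. As an alternative sketch, the soft limit can be raised from inside the process itself at startup; note that an unprivileged process can raise its soft limit only up to the existing hard limit:

import resource

# raise this process's soft open-files limit to its hard limit;
# only root may raise the hard limit itself
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
if soft < hard:
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))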