How to keep a Scrapy spider from being 302-redirected to mobile pages

Solving JD.com's 302 redirect to mobile pages by constructing a desktop-platform user_agent

In my previous post I briefly described the 302-redirect problem and a workaround, but that was a compromise, not the best solution. The best fix, of course, is to find the cause of the 302 redirect and deal with it directly. Let's analyze this bit of JD.com's JavaScript:

function jump_mobile() {
  if (is_sort_black_list()) {
    return;
  }

  var userAgent = navigator.userAgent || "";
  userAgent = userAgent.toUpperCase();
  if (userAgent == "" || userAgent.indexOf("PAD") > -1) {
    return;
  }

  if (window.location.hash == '#m') {
    var exp = new Date();
    exp.setTime(exp.getTime() + 30 * 24 * 60 * 60 * 1000);
    document.cookie = "pcm=1;expires=" + exp.toGMTString() + ";path=/;domain=jd.com";
    window.showtouchurl = true;
    return;
  }

  if (/MOBILE/.test(userAgent) && /(MICROMESSENGER|QQ\/)/.test(userAgent)) {
    var paramIndex = location.href.indexOf("?");
    window.location.href = "//item.m.jd.com/product/11494732.html" + (paramIndex > 0 ? location.href.substring(paramIndex, location.href.length) : '');
    return;
  }
  // ... (remainder of jump_mobile omitted)

It is actually quite simple: the script looks at the user_agent to decide which platform you are on, and if it decides you are on a mobile platform it forces a 302 redirect to the mobile page. The problem is that the mobile page is usually missing information, so it cannot satisfy the spider. Many people's first thought is to forge the request headers and include a user_agent. That sounds simple, but in practice the forged user_agent often does not work, especially against this JD.com code. The hard part is forging a believable PC-platform user_agent.
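
For context, supplying the header in Scrapy is the easy part; here is a minimal sketch with a placeholder spider name, UA string, and item URL (the desktop URL is my guess based on the mobile one in the JS above):

import scrapy

# placeholder desktop UA; the rest of this post is about generating better ones
DESKTOP_UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36')

class JdSpider(scrapy.Spider):
    name = 'jd_item'
    # the same string can also go into USER_AGENT / DEFAULT_REQUEST_HEADERS in settings.py
    custom_settings = {'USER_AGENT': DESKTOP_UA}

    def start_requests(self):
        # example item URL, guessed from the mobile URL in the JS above
        url = 'https://item.jd.com/11494732.html'
        yield scrapy.Request(url, headers={'User-Agent': DESKTOP_UA}, callback=self.parse)

    def parse(self, response):
        self.logger.info('final url: %s', response.url)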

I tried several approaches and several packages, including fake_ua and user_agent. In the end only user_agent met the requirement. Its algorithm is easy to follow: pick an OS platform and assemble the string piece by piece. The core code is here:

https://github.com/lorien/user_agent/blob/master/user_agent/base.py

If you would rather not install the package, you can use my code that mimics what user_agent does:

# usage of the user_agent package itself:
from user_agent import generate_user_agent

print(generate_user_agent(os='win', navigator='chrome', device_type='desktop'))
# e.g. 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.120 Safari/537.36'

# standalone version that mimics the package (no install needed):
import random

CHROME_BUILD = '''
80.0.3987.132
80.0.3987.149
80.0.3987.99
81.0.4044.117
81.0.4044.138
83.0.4103.101
83.0.4103.106
83.0.4103.96
84.0.4147.105
84.0.4147.111
84.0.4147.125
84.0.4147.135
84.0.4147.89
85.0.4183.101
85.0.4183.102
85.0.4183.120
85.0.4183.121
85.0.4183.127
85.0.4183.81
85.0.4183.83
86.0.4240.110
86.0.4240.111
86.0.4240.114
86.0.4240.183
86.0.4240.185
86.0.4240.75
86.0.4240.78
86.0.4240.80
86.0.4240.96
86.0.4240.99
'''.strip().splitlines()


OS_PLATFORM = {
    'win': (
        'Windows NT 5.1', # Windows XP
        'Windows NT 6.1', # Windows 7
        'Windows NT 6.2', # Windows 8
        'Windows NT 6.3', # Windows 8.1
        'Windows NT 10.0', # Windows 10
    ),
    'mac': (
        'Macintosh; Intel Mac OS X 10.8',
        'Macintosh; Intel Mac OS X 10.9',
        'Macintosh; Intel Mac OS X 10.10',
        'Macintosh; Intel Mac OS X 10.11',
        'Macintosh; Intel Mac OS X 10.12',
    ),
}
OS_CPU = {
    'win': (
        '', # 32bit
        'Win64; x64', # 64bit
        'WOW64', # 32bit process on 64bit system
    )}
platform = random.choice(OS_PLATFORM["win"])    # pick a Windows version string
cpu = random.choice(OS_CPU["win"])              # pick a CPU token (empty string for plain 32-bit)
chrome_build = random.choice(CHROME_BUILD)      # pick a real Chrome build number
if len(cpu) > 0:
    tmp = f'Mozilla/5.0 ({platform}; {cpu}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{chrome_build} Safari/537.36'
else:
    tmp = f'Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{chrome_build} Safari/537.36'
print(tmp)
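
To have every Scrapy request carry a freshly generated desktop UA, the snippet above can be wrapped in a small downloader middleware. A minimal sketch, assuming the generation code is exposed as random_desktop_ua() in myproject/ua.py (those names are mine, not from the post):

# myproject/middlewares.py -- sketch only
from myproject.ua import random_desktop_ua  # wraps the generation code above

class DesktopUserAgentMiddleware:
    def process_request(self, request, spider):
        # stamp a desktop User-Agent onto every outgoing request
        request.headers['User-Agent'] = random_desktop_ua()

# settings.py:
# DOWNLOADER_MIDDLEWARES = {
#     'myproject.middlewares.DesktopUserAgentMiddleware': 543,
# }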

After testing, JD.com no longer 302-redirects the spider.
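
A quick way to sanity-check a generated UA outside of Scrapy is to fire a single request with redirects disabled and look at the status code. A rough sketch using requests (the item URL is only an illustration, guessed from the mobile URL in the JS above):

import requests

# reuse the UA string produced above (hard-coded here for brevity)
ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36')

resp = requests.get('https://item.jd.com/11494732.html',
                    headers={'User-Agent': ua},
                    allow_redirects=False, timeout=10)
# 200 means the desktop page was served; 302 means we would still be bounced to mobile
print(resp.status_code, resp.headers.get('Location'))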

Find the large files on your computer in 5 seconds – walking folders with Python multi-threading

Walking folders with multiple Python threads, getting file sizes, and list thread safety

After using a computer for years you inevitably lose track of large files, and before you know it the disk is nearly full. Here is the big gun: walk 130,000 files in 5 seconds and dig out the large files you are after. The script is multi-threaded; in testing, more than 4 threads brought little extra benefit, since there are only a hundred-thousand-odd files.

The script is only a few dozen lines, but the way it uses threads is instructive: it sidesteps the shared-list thread-safety question by handing each thread its own slice of the file list, and writes to the shared output file are guarded by a lock (see the sketch below, then the full script).
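
The core pattern is small enough to show on its own. A minimal sketch of the two ideas, with made-up names; the real script follows:

import threading

lock = threading.Lock()
results = []  # shared between threads

def worker(chunk):
    # each thread owns its chunk, so iterating it needs no lock
    for item in chunk:
        if len(item) > 3:      # stand-in for the "file is too big" test
            with lock:         # serialize access to the shared list
                results.append(item)

items = ['a', 'bbbb', 'ccccc', 'dd', 'eeeee', 'ffff']
chunks = [items[i::2] for i in range(2)]  # split the work across 2 threads
threads = [threading.Thread(target=worker, args=(c,)) for c in chunks]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(results)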

# the script walks through folders and checks the sizes of all the files
# multi-threading is turned on to speed things up

import os
import threading
from sys import argv
import time

lock = threading.Lock()

def filewrite(filepath):
    # the output file is shared by all threads, so writes are serialized with the lock
    with lock:
        with open('filesize_list.txt', 'a') as f:
            f.write(os.path.abspath(filepath))
            f.write('\n')

def sizeCheck(files):
    thr_name = threading.current_thread().name
    print(f'{thr_name} is processing {len(files)} files')
    for file_path in files:
        # the list already contains full paths
        try:
            filesize = os.stat(file_path).st_size / 1024 / 1024
        except OSError:
            continue  # file vanished or is not accessible
        if filesize > 3072:  # report files larger than 3 GB
            print(f'{thr_name}: {file_path}, {int(filesize)}MB')
            filewrite(file_path)

def split_files(file_list, split_num):
    thread_list = []
    # number of files each thread has to process
    list_size = (len(file_list) // split_num) if (len(file_list) % split_num == 0) else ((len(file_list) // split_num) + 1)
    print(f'num of files per thread: {list_size}')
    # start the threads, each with its own slice of the file list
    for i in range(split_num):
        file_list_split = file_list[i * list_size:(i + 1) * list_size]
        thread = threading.Thread(target=sizeCheck, args=(file_list_split,), name="Thread" + str(i))
        thread_list.append(thread)
        thread.start()
    # wait for every thread to finish
    for _item in thread_list:
        _item.join()

if __name__ == "__main__":
    t1 = time.time()
    thread_num = 6
    # pass the directories to check on the command line, or fall back to the defaults
    if len(argv) > 1:
        dirs_to_check = argv[1:]
        print(f'num of folders to check: {dirs_to_check}  {len(dirs_to_check)}')
    else:
        dirs_to_check = ['D:\\', 'E:\\', 'F:\\', 'G:\\']
    file_list_ = []
    for dir_to_check in dirs_to_check:
        print(f'dir_to_check {dir_to_check}')
        for root, dirs, files in os.walk(dir_to_check):
            for i in files:
                file_list_.append(os.path.join(root, i))
    print(f'num of files to scan: {len(file_list_)}')
    split_files(file_list_, thread_num)
    t2 = time.time()
    print(f'time lapsed: {t2-t1}, num of threads used: {thread_num}')
            
       
130,000 files walked in 5 seconds.
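
Incidentally, the same split-the-work-across-threads idea can also be written with concurrent.futures from the standard library. A rough equivalent sketch, not the script above, and the function names are mine:

import concurrent.futures
import os

def size_of(path):
    # returns (path, size in bytes); 0 for files that cannot be stat'ed
    try:
        return path, os.stat(path).st_size
    except OSError:
        return path, 0

def find_big_files(paths, threshold_mb=3072, workers=4):
    big = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        for path, size in pool.map(size_of, paths):
            if size / 1024 / 1024 > threshold_mb:
                big.append((path, size))
    return big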