有趣的反爬虫经历


最近学校要求做爬虫,舍长问我咋做,我随便丢了一个某漏洞网站的爬虫的demo(别人写好的),在努力下终于运行起来了,竟然还用chrome headless了。结果是跑起来了,但是报错,我看了代码的流程,出现521了,第一次遇到,于是print response看一下,一看竟然是一串js,哈哈。心里想必然是遇见反爬虫了。又有东西玩咯,收藏,下课弄6


1.用curl打印一下吧。代码如下:



var x="1@toString@try@__p@@@length@for@setTimeout@charCodeAt@parseInt@8@Array@2@join@@K8@captcha@@var@@0xEDB88320@div@@onreadystatechange@chars@@location@@@3@https@@@@@@RegExp@JgSe0upZ@@36@challenge@@18@F@__jsl_clearance@else@4@window@a@0@document@_p@cookie@@addEventListener@@firstChild@f@split@@substr@rfA@1500@@@@@@@25@GMT@reverse@May@@search@@09@Expires@match@function@7@Path@if@@0xFF@@charAt@502@@T@attachEvent@@rOm9XFMtA3QKV7nYsPGT4lifyWwkq5vcjH2IdxUoCbhERLaz81DNB6@toLowerCase@@d@pathname@@6@@new@@1526286336@Mon@hantom@while@@createElement@g@fromCharCode@@return@eval@String@e@innerHTML@href@14@DOMContentLoaded@@as@@@false@catch@@replace".replace(/@*$/,"").split("@"),y="k 14=2f(){9('s.3j=s.2w+s.2a.3t(/[\\?|&]i-19/,\\'\\')',1v);1j.1l='1d=35.2n|1i|'+(2f(){k 39=[2f(14){3e 14},2f(39){3e 39},2f(14){3e 3f('3g.3c('+14+')')}],22=[[(-~{}<<-~-~(+![]))],[-~[]]+((+!'')+(+!'')+1f+[[]][1i]),[-~[]]+((+![])+[]+[]),([-~(+![])-~-~(+![])]*(-~(+![])-~-~(+![]))+[]+[[]][1i]),[-~[]],[-~[]]+([-~(+![])-~-~(+![])]*(-~(+![])-~-~(+![]))+[]+[[]][1i]),[-~[]]+(-~(+![])+2g+[]),[-~[]]+((-~{}+[e]>>e)+[]+[[]][1i]),(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),(-~-~(+![])+[]+[[]][1i])+((-~{}+[e]>>e)+[]+[[]][1i]),[-~![]+(-~-~(+![]))*[-~-~(+![])]],(-~-~(+![])+[]+[[]][1i])+((+![])+[]+[]),((-~{}+[e]>>e)+[]+[[]][1i]),((+!'')+(+!'')+1f+[[]][1i]),(-~-~(+![])+[]+[[]][1i])+[-~[]],[-~[]]+(-~-~(+![])+[]+[[]][1i]),[-~[]]+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),(-~(+![])+2g+[]),(-~-~(+![])+[]+[[]][1i])+(-~-~(+![])+[]+[[]][1i]),[-~[]]+[(-~{}<<-~-~(+![]))],[-~[]]+[-~![]+(-~-~(+![]))*[-~-~(+![])]],[-~[]]+[-~[]],(-~-~(+![])+[]+[[]][1i]),((+![])+[]+[])];8(k 
14=1i;14<22.7;14++){22[14]=39[[1,e,1i,e,1i,1,1i,e,1i,e,1i,1,e,1,e,1i,e,1,1i,e,1i,e,1,e][14]]([[((+!'')+(+!'')+1f+[[]][1i])+(-~(+![])+2g+[])],(-~-~(+![])+[]+[[]][1i]),'%v',[(-~(+![])+2g+[])+[(-~{}<<-~-~(+![]))]],'2p',(1g['4'+'37'+'3n']+[[]][1i]).2m(e-~[(-~(+![])<<-~(+![]))])+(!{}+[]+[[]][1i]).2m((e)*[e]),'h',(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),'1u',[(-~(+![])+2g+[])+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),((-~{}+[e]>>e)+[]+[[]][1i])+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])],[1g['1k'+'37']+[]+[[]][1i]][1i].2m(-~(+![])+v),[(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])+((+![])+[]+[]),((+!'')+(+!'')+1f+[[]][1i])+[-~![]+(-~-~(+![]))*[-~-~(+![])]]],(-~-~(+![])+[]+[[]][1i]),[((+!'')+(+!'')+1f+[[]][1i])+([-~(+![])-~-~(+![])]*(-~(+![])-~-~(+![]))+[]+[[]][1i]),[-~[]]+[-~[]]+(-~-~(+![])+[]+[[]][1i])],[(-~(+![])+2g+[])+[(-~{}<<-~-~(+![]))]],[1g['1k'+'37']+[]+[[]][1i]][1i].2m(-~(+![])+v),[(-~(+![])+2g+[])+(-~(+![])+2g+[])],[(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])+((+![])+[]+[])],((+!'')+(+!'')+1f+[[]][1i])+({}+[]+[]).2m((+!'')+(+!'')+[(-~(+![])<<-~(+![]))]*(-~(+![])-~-~(+![]))),'1c','31',[(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),((-~{}+[e]>>e)+[]+[[]][1i])+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])],(-~[]/~~[]+[]+[]).2m((-~(+![])<<-~(+![]))-~![]+(-~-~(+![]))*[-~-~(+![])]),[[-~[]]+((+![])+[]+[])+((-~{}+[e]>>e)+[]+[[]][1i])]][22[14]])};3e 22.f('')})()+';2d=36, 3k-28-1b 2c:25:18 26;2h=/;'};2i((2f(){3{3e !!1g.1n;}3r(3h){3e 3q;}})()){1j.1n('3l',14,3q)}1e{1j.2q('p',14)}",f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return c},z=f(y.match(/\w/g).sort(function(x,y){return f(x)-f(y)}).pop());while(z++)try{eval(y.replace(/\b\w+\b/g, function(y){return x[f(y,z)-1]||("_"+y)}));break}catch(_){}

2.看着很乱 放sublime格式化一下:



var x = "1@toString@try@__p@@@length@for@setTimeout@charCodeAt@parseInt@8@Array@2@join@@K8@captcha@@var@@0xEDB88320@div@@onreadystatechange@chars@@location@@@3@https@@@@@@RegExp@JgSe0upZ@@36@challenge@@18@F@__jsl_clearance@else@4@window@a@0@document@_p@cookie@@addEventListener@@firstChild@f@split@@substr@rfA@1500@@@@@@@25@GMT@reverse@May@@search@@09@Expires@match@function@7@Path@if@@0xFF@@charAt@502@@T@attachEvent@@rOm9XFMtA3QKV7nYsPGT4lifyWwkq5vcjH2IdxUoCbhERLaz81DNB6@toLowerCase@@d@pathname@@6@@new@@1526286336@Mon@hantom@while@@createElement@g@fromCharCode@@return@eval@String@e@innerHTML@href@14@DOMContentLoaded@@as@@@false@catch@@replace".replace(/@*$/, "").split("@"),
    y = "k 14=2f(){9('s.3j=s.2w+s.2a.3t(/[\\?|&]i-19/,\\'\\')',1v);1j.1l='1d=35.2n|1i|'+(2f(){k 39=[2f(14){3e 14},2f(39){3e 39},2f(14){3e 3f('3g.3c('+14+')')}],22=[[(-~{}<<-~-~(+![]))],[-~[]]+((+!'')+(+!'')+1f+[[]][1i]),[-~[]]+((+![])+[]+[]),([-~(+![])-~-~(+![])]*(-~(+![])-~-~(+![]))+[]+[[]][1i]),[-~[]],[-~[]]+([-~(+![])-~-~(+![])]*(-~(+![])-~-~(+![]))+[]+[[]][1i]),[-~[]]+(-~(+![])+2g+[]),[-~[]]+((-~{}+[e]>>e)+[]+[[]][1i]),(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),(-~-~(+![])+[]+[[]][1i])+((-~{}+[e]>>e)+[]+[[]][1i]),[-~![]+(-~-~(+![]))*[-~-~(+![])]],(-~-~(+![])+[]+[[]][1i])+((+![])+[]+[]),((-~{}+[e]>>e)+[]+[[]][1i]),((+!'')+(+!'')+1f+[[]][1i]),(-~-~(+![])+[]+[[]][1i])+[-~[]],[-~[]]+(-~-~(+![])+[]+[[]][1i]),[-~[]]+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),(-~(+![])+2g+[]),(-~-~(+![])+[]+[[]][1i])+(-~-~(+![])+[]+[[]][1i]),[-~[]]+[(-~{}<<-~-~(+![]))],[-~[]]+[-~![]+(-~-~(+![]))*[-~-~(+![])]],[-~[]]+[-~[]],(-~-~(+![])+[]+[[]][1i]),((+![])+[]+[])];8(k 14=1i;14<22.7;14++){22[14]=39[[1,e,1i,e,1i,1,1i,e,1i,e,1i,1,e,1,e,1i,e,1,1i,e,1i,e,1,e][14]]([[((+!'')+(+!'')+1f+[[]][1i])+(-~(+![])+2g+[])],(-~-~(+![])+[]+[[]][1i]),'%v',[(-~(+![])+2g+[])+[(-~{}<<-~-~(+![]))]],'2p',(1g['4'+'37'+'3n']+[[]][1i]).2m(e-~[(-~(+![])<<-~(+![]))])+(!{}+[]+[[]][1i]).2m((e)*[e]),'h',(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),'1u',[(-~(+![])+2g+[])+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),((-~{}+[e]>>e)+[]+[[]][1i])+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])],[1g['1k'+'37']+[]+[[]][1i]][1i].2m(-~(+![])+v),[(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])+((+![])+[]+[]),((+!'')+(+!'')+1f+[[]][1i])+[-~![]+(-~-~(+![]))*[-~-~(+![])]]],(-~-~(+![])+[]+[[]][1i]),[((+!'')+(+!'')+1f+[[]][1i])+([-~(+![])-~-~(+![])]*(-~(+![])-~-~(+![]))+[]+[[]][1i]),[-~[]]+[-~[]]+(-~-~(+![])+[]+[[]][1i])],[(-~(+![])+2g+[])+[(-~{}<<-~-~(+![]))]],[1g['1k'+'37']+[]+[[]][1i]][1i].2m(-~(+![])+v),[(-~(+![])+2g+[])+(-~(+![])+2g+[])],[(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]
+[]+[])+((+![])+[]+[])],((+!'')+(+!'')+1f+[[]][1i])+({}+[]+[]).2m((+!'')+(+!'')+[(-~(+![])<<-~(+![]))]*(-~(+![])-~-~(+![]))),'1c','31',[(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[]),((-~{}+[e]>>e)+[]+[[]][1i])+(-~(+![])-~-~(+![])+(-~-~(+![]))*[-~-~(+![])]+[]+[])],(-~[]/~~[]+[]+[]).2m((-~(+![])<<-~(+![]))-~![]+(-~-~(+![]))*[-~-~(+![])]),[[-~[]]+((+![])+[]+[])+((-~{}+[e]>>e)+[]+[[]][1i])]][22[14]])};3e 22.f('')})()+';2d=36, 3k-28-1b 2c:25:18 26;2h=/;'};2i((2f(){3{3e !!1g.1n;}3r(3h){3e 3q;}})()){1j.1n('3l',14,3q)}1e{1j.2q('p',14)}",
    // f(x, y): read the string x as a number written in base y (y defaults to
    // 99 when falsy). Each character is one digit:
    //   'A'..'Z' (char codes 65..90, i.e. |code - 77.5| < 13) -> code - 29 = 36..61
    //   anything else -> parseInt(a, 36), so '0'..'9' -> 0..9 and 'a'..'z' -> 10..35
    f = function(x, y) {
        var a = 0,
        b = 0,
        c = 0;
        x = x.split("");
        y = y || 99;
        // b is an integer minus 77.5, so it is never 0: the loop only stops when
        // shift() runs out of characters. c accumulates digit + base * previous.
        while ((a = x.shift()) && (b = a.charCodeAt(0) - 77.5)) c = (Math.abs(b) < 13 ? (b + 48.5) : parseInt(a, 36)) + y * c;
        return c
        },
    // z starts as the largest single-digit value among all \w characters of y
    // (the sort callback's x/y shadow the outer x/y; pop() takes the maximum).
    z = f(y.match(/\w/g).sort(function(x, y) {
        return f(x) - f(y)
    }).pop());
// Try successive bases: z is tested, then incremented, so the first base tried
// is (largest digit + 1). Every word token in y is decoded with f(token, z) and
// replaced by the 1-based entry of the dictionary x; an empty or missing entry
// falls back to "_" + token. The first eval that does not throw ends the loop;
// any exception is swallowed by the empty catch and the next base is tried.
while (z++) try {
    // The callback parameter y (the matched token) shadows the outer y string.
    eval(y.replace(/\b\w+\b/g, function(y) {
        return x[f(y, z) - 1] || ("_" + y)
    }));
    break
} catch (_) {} 


3.分析一下,全文围绕着 x、y 两个字符串:显然 x 存放的是各种方法名和字面量,z 应该是一个整型变量。看一下紧接着的 while 循环:用 eval 执行拼接出来的语句,执行成功一次就退出循环。整理一下:




var x= array[];
var y= String;

var f= function(x,y){...};

var Z= Int;

eval(expression);
 


4.关键在于eval 执行的到底是个啥,本地建个js看看:


一进去发现一直在刷新界面,不知道在干啥,进去console看看



在eval前面加个断点重新刷新一下界面看看



ok 再进去eval() step into



yo 新世界,是一段全新的js,并格式化一下吧。



整体分析一下



// Simplified outline of the decoded script; the body of _14 is elided here.
var _14=function(){}

// NOTE(review): exist(...) is not defined anywhere in this outline — it reads
// as shorthand for "window.addEventListener is available"; confirm against
// the decoded script.
if(exist(window.addEventListener)){

// Register _14 to run when the DOMContentLoaded event fires.
document.addEventListener('DOMContentLoaded', _14, false)

}else{

// Fallback branch: register _14 through attachEvent on 'onreadystatechange'.
document.attachEvent('onreadystatechange', _14)

}


很清楚咯,_14是个回调函数,当Dom完全载入成功的时候执行,继续看_14是个什么东西




setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')', 1500);



原来是这个东西一直在刷新我的界面,应该是用来重新请求页面的。



 document.cookie = '__jsl_clearance=1526286336.502|0|' + (function() {
        var _39 = [function(_14) {
            return _14
        }
        , function(_39) {
            return _39
        }
        , function(_14) {
            return eval('String.fromCharCode(' + _14 + ')')
        }
        ]
          , _22 = [[(-~{} << -~-~(+![]))], [-~[]] + ((+!'') + (+!'') + 4 + [[]][0]), [-~[]] + ((+![]) + [] + []), ([-~(+![]) - ~-~(+![])] * (-~(+![]) - ~-~(+![])) + [] + [[]][0]), [-~[]], [-~[]] + ([-~(+![]) - ~-~(+![])] * (-~(+![]) - ~-~(+![])) + [] + [[]][0]), [-~[]] + (-~(+![]) + 7 + []), [-~[]] + ((-~{} + [2] >> 2) + [] + [[]][0]), (-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + []), (-~-~(+![]) + [] + [[]][0]) + ((-~{} + [2] >> 2) + [] + [[]][0]), [-~![] + (-~-~(+![])) * [-~-~(+![])]], (-~-~(+![]) + [] + [[]][0]) + ((+![]) + [] + []), ((-~{} + [2] >> 2) + [] + [[]][0]), ((+!'') + (+!'') + 4 + [[]][0]), (-~-~(+![]) + [] + [[]][0]) + [-~[]], [-~[]] + (-~-~(+![]) + [] + [[]][0]), [-~[]] + (-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + []), (-~(+![]) + 7 + []), (-~-~(+![]) + [] + [[]][0]) + (-~-~(+![]) + [] + [[]][0]), [-~[]] + [(-~{} << -~-~(+![]))], [-~[]] + [-~![] + (-~-~(+![])) * [-~-~(+![])]], [-~[]] + [-~[]], (-~-~(+![]) + [] + [[]][0]), ((+![]) + [] + [])];
        for (var _14 = 0; _14 < _22.length; _14++) {
            _22[_14] = _39[[1, 2, 0, 2, 0, 1, 0, 2, 0, 2, 0, 1, 2, 1, 2, 0, 2, 1, 0, 2, 0, 2, 1, 2][_14]]([[((+!'') + (+!'') + 4 + [[]][0]) + (-~(+![]) + 7 + [])], (-~-~(+![]) + [] + [[]][0]), '%3', [(-~(+![]) + 7 + []) + [(-~{} << -~-~(+![]))]], 'T', (window['__p' + 'hantom' + 'as'] + [[]][0]).charAt(2 - ~[(-~(+![]) << -~(+![]))]) + (!{} + [] + [[]][0]).charAt((2) * [2]), 'K8', (-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + []), 'rfA', [(-~(+![]) + 7 + []) + (-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + []), ((-~{} + [2] >> 2) + [] + [[]][0]) + (-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + [])], [window['_p' + 'hantom'] + [] + [[]][0]][0].charAt(-~(+![]) + 3), [(-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + []) + ((+![]) + [] + []), ((+!'') + (+!'') + 4 + [[]][0]) + [-~![] + (-~-~(+![])) * [-~-~(+![])]]], (-~-~(+![]) + [] + [[]][0]), [((+!'') + (+!'') + 4 + [[]][0]) + ([-~(+![]) - ~-~(+![])] * (-~(+![]) - ~-~(+![])) + [] + [[]][0]), [-~[]] + [-~[]] + (-~-~(+![]) + [] + [[]][0])], [(-~(+![]) + 7 + []) + [(-~{} << -~-~(+![]))]], [window['_p' + 'hantom'] + [] + [[]][0]][0].charAt(-~(+![]) + 3), [(-~(+![]) + 7 + []) + (-~(+![]) + 7 + [])], [(-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + []) + ((+![]) + [] + [])], ((+!'') + (+!'') + 4 + [[]][0]) + ({} + [] + []).charAt((+!'') + (+!'') + [(-~(+![]) << -~(+![]))] * (-~(+![]) - ~-~(+![]))), 'F', '6', [(-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + []) + (-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + []), ((-~{} + [2] >> 2) + [] + [[]][0]) + (-~(+![]) - ~-~(+![]) + (-~-~(+![])) * [-~-~(+![])] + [] + [])], (-~[] / ~~[] + [] + []).charAt((-~(+![]) << -~(+![])) - ~![] + (-~-~(+![])) * [-~-~(+![])]), [[-~[]] + ((+![]) + [] + []) + ((-~{} + [2] >> 2) + [] + [[]][0])]][_22[_14]])
        }
        ;return _22.join('')
    }
    )() + ';Expires=Mon, 14-May-18 09:25:36 GMT;Path=/;'


Oh,真相了,设置了一个cookie然后重新访问,其实做到这里不用继续了,让断点撤了跑完暂停,console里面document.cookie到手了。




cookie:__jsl_clearance=1526286336.502|0|TXfW%2F6OEp7gie6TK8M%2FrfAyTfFA%3D



回想一下是需要设置一个cookie,然后才能得到真正的页面,于是乎设置一下,




curl --cookie "__jsl_clearance=1526286336.502|0|TXfW%2F6OEp7gie6TK8M%2FrfAyTfFA%3D" www.xxxx.com/vuldb/vulnerabilities




嗯?怎么还是加密的js?跟先前的不一样,但是跑了一遍,还是只设置了这个同名的cookie,只不过值不一样而已。我想再执行一遍,但是只看响应头




curl -I --cookie "__jsl_clearance=1526286336.502|0|TXfW%2F6OEp7gie6TK8M%2FrfAyTfFA%3D" www.xxxx.com/vuldb/vulnerabilities




返回多了一个cookie 值,难道要再设置一个?试试看




__jsluid=b89c8f007f68c809751fbd1593ec3a43




还是不行啊,明显的521,应该还是那个同样的js,难道是user-agent 的问题?我换了一个,结果还是一样。有点陷入僵局了,走了一点弯路。还是自己先入为主了,决定抓个包看看,到底是如何交互的。


先删掉所有的cookie,试试看


第一个连接先过,把第二个拦着,看看第一个返回




HTTP/1.1 521 
Server: nginx
Date: Mon, 14 May 2018 07:31:15 GMT
Connection: keep-alive
X-Via-JSL: 341880e,-
Set-Cookie: __jsluid=96848cf61503d89d7a76b8032b4c1d01; max-age=31536000; path=/; HttpOnly
Content-Length: 4096



发现第一次的时候response里面自己带了一个cookie,看看第二个访问




GET /vuldb/vulnerabilities HTTP/1.1
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0 Iceweasel/43.0.4
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate
DNT: 1
Cookie: __jsluid=96848cf61503d89d7a76b8032b4c1d01; __jsl_clearance=1526283075.787|0|bl5ck%2F%2Fwyssqy6yiBSPmcvYwM94%3D
Connection: close



原来要带上第一次访问时返回的cookie,这样确实能防住很多静态的爬虫。回头看那个demo ,他其实做到了,用chrome headless 获得cookie ,每次都两次get,我不知道为什么两次get,一次其实就够了,但是奇怪的是我用这个东西的时候失败了。


我直接拿nodejs试了一下:




const CDP = require('chrome-remote-interface');

// Load the target page through the Chrome DevTools Protocol and print the
// cookies the browser holds once the page's load event has fired.
// CDP() is called without options, so the library's default connection
// target is used.
CDP(async (client) => {
    const {Network, Page} = client;
    try {
        await Network.enable();
        await Page.enable();
        // Bypass the browser cache so the request really goes to the server.
        await Network.setCacheDisabled({cacheDisabled: true});
        await Page.navigate({url: 'https://www.xxxx.org/vuldb/vulnerabilities'});
        await Page.loadEventFired();
        const res = await Network.getCookies();
        console.log(res);
    } catch (err) {
        console.error(err);
    } finally {
        // close() returns a promise when no callback is passed; await it and
        // report a failed shutdown instead of leaving an unhandled rejection.
        try {
            await client.close();
        } catch (closeErr) {
            console.error(closeErr);
        }
    }
}).on('error', (err) => {
    // Connection-level failures (e.g. no reachable DevTools endpoint).
    console.error(err);
});



获得cookie:



并获取useragent,我试了必须useragent相同,可能与第一个cookie有关



最后curl 测试一下




成功,终于不是先前的521了... 用python管理的chrome headless 的 devtools protocol,分析一下网上下载的咋不行



def cookie_init():
    """Return a requests session seeded with cookies taken from headless Chrome.

    Starts a headless Chrome through Selenium, loads the target page twice,
    copies every cookie the browser holds into a new ``requests`` session and
    returns that session. ``headers`` is a module-level name defined outside
    this function and is applied to the session as its default headers.

    The cookies are read as soon as the second ``get()`` returns; no extra
    wait is performed.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    client = webdriver.Chrome(chrome_options=chrome_options)
    try:
        # NOTE(review): presumably the first load receives the challenge and
        # the second is meant to fetch the real page — confirm.
        client.get("https://www.xxxx.org/vuldb/vulnerabilities")
        client.get("https://www.xxxx.org/vuldb/vulnerabilities")
        cookie = {item["name"]: item["value"] for item in client.get_cookies()}
    finally:
        # Always shut the browser down, even if a page load or the cookie
        # read raises, so no Chrome process is left behind.
        client.quit()
    s = requests.session()
    s.headers.update(headers)
    requests.utils.add_dict_to_cookiejar(s.cookies, cookie)
    return s



WebDriver will wait until the page has fully loaded (that is, the “onload” event has fired) before returning control to your test or script.


上面是我查到的官方对于get的介绍:onload结束后就表示页面加载成功了,不管后面有没有额外的ajax什么的了。这个demo写的是get 结束之后立即收集cookie,我们先看一下在加载一个页面的时候触发了什么事件和先后顺序吧



const CDP = require('chrome-remote-interface');

// Same page load as before, but additionally print the name of every page
// lifecycle event so the order in which they fire can be observed.
// CDP() is called without options, so the library's default connection
// target is used.
CDP(async (client) => {
    const {Network, Page} = client;
    try {
        await Network.enable();
        await Page.enable();
        await Page.setLifecycleEventsEnabled({enabled: true});
        // Bypass the browser cache so the request really goes to the server.
        await Network.setCacheDisabled({cacheDisabled: true});

        // Subscribe before navigating so no lifecycle event is missed.
        Page.lifecycleEvent((event) => {
            console.log(event.name);
        });

        await Page.navigate({url: 'https://www.xxxx.org/vuldb/vulnerabilities'});
        await Page.loadEventFired();

        const res = await Network.getCookies();
        console.log(res);
    } catch (err) {
        console.error(err);
    } finally {
        // close() returns a promise when no callback is passed; await it and
        // report a failed shutdown instead of leaving an unhandled rejection.
        try {
            await client.close();
        } catch (closeErr) {
            console.error(closeErr);
        }
    }
}).on('error', (err) => {
    // Connection-level failures (e.g. no reachable DevTools endpoint).
    console.error(err);
});





可以看出load 不一定比 DOMContentLoaded 慢,而且页面是等到 DOMContentLoaded 触发之后才设置cookie,并设置了1.5秒的定时器去重新请求页面。如果load 先触发,这个时候你获取cookie,是空的,又如何能不是521呢?


如果让页面多wait 几秒可能更好一点,这样就不再是一个随机事件了;否则有时候可以获得cookie,有时候获取不到,就很蛋疼。


终于分析至此结束,本来想要手工研究一下这个js的方法,却有心无力。看过很多加密的技术了,这种把函数和方法分开来加密的很多,只需要继续跟进就行!!一次有趣的反爬结束咯,自己的chrome headless 之旅还在继续......