gpt4 book ai didi

javascript - 如何从网页中抓取 `window.initialState`?

转载 作者:行者123 更新时间:2023-11-30 06:15:26 26 4
gpt4 key购买 nike

这是我要抓取的页面:

<!DOCTYPE html><html dir="ltr" class="rezemp-ResumeViewLayout-html"><head>
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-WNSB8XG');</script>
<!-- End Google Tag Manager -->
<script src="https://cdn.optimizely.com/js/6377170661.js"></script>
<script>
window.createRecaptchaPromise = function () {
return new Promise(function(resolve) { resolve(''); });
};

window.createRecaptchaChallengePromise = function () {
return new Promise(function(resolve) { resolve(''); });
};
</script>
<title>Cat Sitter - Perkasie, PA | Indeed.com</title><meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"><link rel="stylesheet" type="text/css" href="/static/a965426693faf68209ad/styles/resume-view-app.css"></head><body class="rezemp-ResumeViewLayout-body">
<!-- Google Tag Manager (noscript) -->
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-WNSB8XG"
height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<!-- End Google Tag Manager (noscript) -->
<div id="content"><noscript>This page requires JavaScript.</noscript></div><script type="text/javascript">var _sift = window._sift = window._sift || []; _sift.push(['_setAccount', 'fb21e9c129']); _sift.push(['_setUserId', 'b90ff823cb1bcec9']); _sift.push(['_setSessionId', '1dcicspl2f8a9800']); _sift.push(['_trackPageview']);
(function() {
function ls() {
var e = document.createElement('script');
e.src = 'https://cdn.siftscience.com/s.js';
document.body.appendChild(e);
}
if (window.attachEvent) {
window.attachEvent('onload', ls);
} else {
window.addEventListener('load', ls, false);
}
})();
</script><script>window.initialState = JSON.parse('{\x22commonModel\x22:{\x22advertiser\x22:\x22Nyaa Studio\x22,\x22baseAdsUrl\x22:\x22https:\\u002F\\u002Fads.indeed.com\x22,\x22baseAnalyticsUrl\x22:\x22https:\\u002F\\u002Fanalytics.indeed.com\x22,\x22baseBillingUrl\x22:\x22https:\\u002F\\u002Fbilling.indeed.com\x22,\x22baseIndeedEmployerHelpUrl\x22:\x22https:\\u002F\\u002Findeedemployers.zendesk.com\x22,\x22baseIndeedUrl\x22:\x22https:\\u002F\\u002Fwww.indeed.com\x22,\x22baseMyIndeedUrl\x22:\x22https:\\u002F\\u002Fmy.indeed.com\x22,\x22basePieUrl\x22:\x22https:\\u002F\\u002Faccount.indeed.com\x22,\x22baseRozUrl\x22:\x22https:\\u002F\\u002Fresumes.indeed.com\x22,\x22baseSecureUrl\x22:\x22https:\\u002F\\u002Fsecure.indeed.com\x22,\x22billingIssue\x22:\x22CAN_PURCHASE\x22,\x22canSwitchAccount\x22:false,\x22confirmed\x22:true,\x22country\x22:\x22US\x22,\x22csrfParam\x22:\x22indeedcsrftoken\x22,\x22csrfToken\x22:\x22RonYXgzB6OxlClQV4QY9woqaatyPStN8\x22,\x22currentRelativeUrl\x22:\x22\\u002Fresume\\u002Fd53377828e23d884?s\x3dl%3D%26q%3Dcat%2520sitter%26searchFields%3Djt\x22,\x22currentUrl\x22:\x22https:\\u002F\\u002Fresumes.indeed.com\\u002Fresume\\u002Fd53377828e23d884?s\x3dl%3D%26q%3Dcat%2520sitter%26searchFields%3Djt\x22,\x22currentUserAccountKey\x22:\x2286c56776fbc49dff\x22,\x22emailAddress\x22:\x22nyaa.studio.apps@gmail.com\x22,\x22featuredEmployer\x22:false,\x22isMasquerade\x22:false,\x22language\x22:\x22en\x22,\x22locale\x22:\x22en_US\x22,\x22loggedIn\x22:true,\x22masquerade\x22:false,\x22moderated\x22:false,\x22nonMonetizedMarket\x22:false,\x22privileged\x22:false,\x22showLaunchBanner\x22:true,\x22subscriptionInfo\x22:{\x22admin\x22:true,\x22bulkContact\x22:false,\x22contactsRemaining\x22:0,\x22hasUnassignedSubscription\x22:false,\x22hasUnlimitedContacts\x22:false,\x22subscriptionAssigned\x22:false,\x22trial\x22:false},\x22subscriptionsEnabled\x22:true},\x22contactRecord\x22:{\x22allowContact\x22:true,\x22allowRepeatedContact\x22:false,\x22contactedByCoworkerDate\x22:\x22\x22,\x22contactedByCoworkerEmail\x22:\x22\x22,\x22contactedByUserDate\x22:\x22\x22,\x22responseStatus\x22:\x22UNRESPONDED\x22},\x22countryOfEligibility\x22:\x22United States\x22,\x22eligibility\x22:\x22ELG\x22,\x22isSavedResume\x22:false,\x22resumeModel\x22:{\x22accountKey\x22:\x22d53377828e23d884\x22,\x22additionalInformation\x22:\x22Skills\\nWord 2010 and 2013, Excel, Powerpoint, computer and typing skills, interpersonal skills,\\norganizational skills, some ASL knowledge, love of animals, previous animal care experience\x22,\x22assessments\x22:[],\x22awards\x22:[],\x22certifications\x22:[],\x22education\x22:[{\x22dateRange\x22:\x22December 2015 to Present\x22,\x22degree\x22:\x22Liberal Arts degree\x22,\x22field\x22:\x22Liberal Arts\x22,\x22id\x22:\x22EecYz-PgixmaTKmUQsuaQg\x22,\x22location\x22:\x22Newtown, PA\x22,\x22university\x22:\x22Bucks County Community College\x22}],\x22email\x22:\x22\x22,\x22firstName\x22:\x22Cat Sitter\x22,\x22fullName\x22:\x22Cat Sitter\x22,\x22groups\x22:[],\x22headline\x22:\x22Cat Sitter - Local Residence\x22,\x22highlightedWords\x22:[\x22sitters\x22,\x22cat\x22,\x22sitter\x22],\x22id\x22:\x22EecYz-PgixWaTKmUQsuaQg\x22,\x22licenses\x22:[],\x22links\x22:[],\x22location\x22:\x22Perkasie, PA\x22,\x22militaryService\x22:[],\x22patents\x22:[],\x22phoneNumber\x22:\x22\x22,\x22publications\x22:[],\x22skills\x22:[{\x22id\x22:\x22EecYz-PgixqaTKmUQsuaQg\x22,\x22monthsOfExperience\x22:12,\x22skill\x22:\x22Excel\x22},{\x22id\x22:\x22EecYz-PgixuaTKmUQsuaQg\x22,\x22monthsOfExperience\x22:120,\x22skill\x22:\x22organizational skills\x22},{\x22id\x22:\x22EecYz-PgixyaTKmUQsuaQg\x22,\x22monthsOfExperience\x22:12,\x22skill\x22:\x22Powerpoint\x22},{\x22id\x22:\x22EecYz-Pgix2aTKmUQsuaQg\x22,\x22monthsOfExperience\x22:24,\x22skill\x22:\x22typing\x22},{\x22id\x22:\x22EecYz-Pgix6aTKmUQsuaQg\x22,\x22monthsOfExperience\x22:24,\x22skill\x22:\x22Word\x22},{\x22id\x22:\x22Eehfj2sVJeqeoM7c3iCmnw\x22,\x22monthsOfExperience\x22:72,\x22skill\x22:\x22working with animals\x22}],\x22summary\x22:\x22\x22,\x22updatedDate\x22:\x22May 26, 2019\x22,\x22workExperience\x22:[{\x22company\x22:\x22Local Residence\x22,\x22dateRange\x22:\x222015 to Present\x22,\x22description\x22:\x22Feed cats\\n●\\tClean litter boxes\\n●\\tDaily check-ins on cats\x22,\x22id\x22:\x22EecYz-PgixaaTKmUQsuaQg\x22,\x22location\x22:\x22Quakertown, PA\x22,\x22title\x22:\x22Cat Sitter\x22},{\x22company\x22:\x22Local Residence - Dog Walker\x22,\x22dateRange\x22:\x22January 2014 to January 2016\x22,\x22description\x22:\x22Walk dogs\\n●\\tFeed dogs\\n●\\tCheck-ins and play time with dogs\x22,\x22id\x22:\x22EecYz-PgixeaTKmUQsuaQg\x22,\x22location\x22:\x22Quakertown, PA\x22,\x22title\x22:\x22Dog Sitter\x22}]},\x22tk\x22:\x221dckr2vhn3p22800\x22}');</script><script>window.proctorGroups = JSON.parse('[[5,null],[1,null],[1,null],[9,null],[7,null],[1,null],[0,null],[19,null],[-1,null],[0,null],[3,null],[1,null],[19,null],[-1,null],[-1,null],[-1,null],[-1,null],[0,null],[1,null],[-1,null],[-1,\x22${contactName} sent you a message about your resume on Indeed.\x22],[1,null],[1,null],[-1,null],[0,null],[-1,null],[-1,null],[-1,null],[1,null],[1,null],[-1,null],[-1,null],[1,null],[1,null],[-1,null],[1,null],[-1,null],[1,null],[1,{\x22recaptchaThreshold\x22:0.49}],[2,null],[-1,null],[1,null],[1,null],[0,null],[1,null],[-1,null],[1,null],[1,null],[-1,null],[-1,null],[-1,null],[-1,null],[-1,null],[1,null],[-1,null],[-1,null],[1,{\x22accountBlocks\x22:[371495985,371492945,371496796,371495142,371255403,180896675,402708456],\x22ipBlocks\x22:[\x22142.93.160.149\x22,\x22156.213.187.109\x22,\x2254.144.251.118\x22,\x2254.160.231.37\x22,\x2254.161.232.223\x22,\x2254.163.111.234\x22,\x2254.166.201.27\x22,\x2254.167.132.121\x22,\x2254.211.243.158\x22,\x2254.221.65.205\x22,\x2254.234.36.11\x22,\x2254.235.23.71\x22,\x2254.242.123.36\x22,\x2254.242.125.90\x22,\x2254.242.94.44\x22,\x2254.91.29.30\x22,\x2254.81.91.102\x22,\x2218.130.133.224\x22,\x2218.130.98.215\x22,\x223.8.18.212\x22,\x223.8.20.40\x22,\x2234.206.53.38\x22,\x2282.12.238.32\x22,\x22137.135.96.20\x22,\x2213.90.195.83\x22,\x22137.135.96.20\x22,\x22106.51.66.119\x22,\x22116.75.87.250\x22,\x2213.90.195.83\x22,\x22104.131.19.173\x22,\x22106.51.66.119\x22,\x22108.2.166.209\x22,\x2212.133.183.51\x22,\x22163.198.35.32\x22,\x22168.62.165.43\x22,\x2218.203.123.118\x22,\x2223.96.14.105\x22,\x2252.60.89.234\x22,\x2271.14.194.130\x22,\x2273.2.223.45\x22]}],[1,null],[1,null],[1,null],[1,null],[-1,null],[-1,null],[-1,null],[-1,null],[1,null],[1,null],[1,null],[1,null],[-1,null],[2,null],[1,null],[1,null],[-1,null],[3,null]]');</script><script type="text/javascript" src="/static/b9c32234bdbed298be40/scripts/vendor.js"></script><script type="text/javascript" src="/static/f38ebfd/en_US.js"></script><script>!function(n){function r(n){for(var r=a,t=n.length;t;)r=33*r^n.charCodeAt(--t);return r>>>0}var t=this['indeed.i18n.localeData'],e=t['']||{},a=e.salt;if(e.hasOwnProperty('salt'))for(var i in n)t[function(n){var t=r(n);return e.hasOwnProperty('id_length')&&(t=String(t).substring(0,e.id_length)),t}(i)]=n[i];else for(var i in n)t[i]=[null].concat(n[i])}({"Email {0} job seeker":["Contact {0} job seeker","Contact {0} job seekers"],"Email":["Contact"],"Email {0}":["Contact {0}"],"Send Email":["Message"]});</script><script type="text/javascript" src="/static/70ab8de6e2102d523c43/scripts/resume-view-app.js"></script></body></html>

我感兴趣的是末尾的 window.initialState 的一部分。我应该如何提取它?

附言我目前正在使用 SeleniumChromedriver。使用 requests 抓取信息是不可能的。

最佳答案

decode('unicode-escape') 普通字节字符串上的方法,它将把它们转换为 Unicode 字符串,encode('utf8') 方法从 Unicode 编码到 UTF-8 字节字符串,jsonString[2:-2] 删除字符串的第一个和最后两个字符,json.loads() 方法将字符串转换为 json。

re.compile() 返回一个 regular expression object ,这意味着 h 是一个正则表达式对象。

regex 对象有自己的 match 方法,带有可选的 pos 和 endpos 参数:

regex.match(string[, pos[, endpos]])

from bs4 import BeautifulSoup
import re
import json

html = """ <!DOCTYPE html><html dir="ltr" class="rezemp-ResumeViewLayout-html"><head>
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-WNSB8XG');</script>
<!-- End Google Tag Manager -->
<script src="https://cdn.optimizely.com/js/6377170661.js"></script>
<script>
window.createRecaptchaPromise = function () {
return new Promise(function(resolve) { resolve(''); });
};

window.createRecaptchaChallengePromise = function () {
return new Promise(function(resolve) { resolve(''); });
};
</script>
<title>Cat Sitter - Perkasie, PA | Indeed.com</title><meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no"><link rel="stylesheet" type="text/css" href="/static/a965426693faf68209ad/styles/resume-view-app.css"></head><body class="rezemp-ResumeViewLayout-body">
<!-- Google Tag Manager (noscript) -->
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-WNSB8XG"
height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<!-- End Google Tag Manager (noscript) -->
<div id="content"><noscript>This page requires JavaScript.</noscript></div><script type="text/javascript">var _sift = window._sift = window._sift || []; _sift.push(['_setAccount', 'fb21e9c129']); _sift.push(['_setUserId', 'b90ff823cb1bcec9']); _sift.push(['_setSessionId', '1dcicspl2f8a9800']); _sift.push(['_trackPageview']);
(function() {
function ls() {
var e = document.createElement('script');
e.src = 'https://cdn.siftscience.com/s.js';
document.body.appendChild(e);
}
if (window.attachEvent) {
window.attachEvent('onload', ls);
} else {
window.addEventListener('load', ls, false);
}
})();
</script><script>
window.initialState = JSON.parse('{\x22commonModel\x22:{\x22advertiser\x22:\x22Nyaa Studio\x22,\x22baseAdsUrl\x22:\x22https:\\u002F\\u002Fads.indeed.com\x22,\x22baseAnalyticsUrl\x22:\x22https:\\u002F\\u002Fanalytics.indeed.com\x22,\x22baseBillingUrl\x22:\x22https:\\u002F\\u002Fbilling.indeed.com\x22,\x22baseIndeedEmployerHelpUrl\x22:\x22https:\\u002F\\u002Findeedemployers.zendesk.com\x22,\x22baseIndeedUrl\x22:\x22https:\\u002F\\u002Fwww.indeed.com\x22,\x22baseMyIndeedUrl\x22:\x22https:\\u002F\\u002Fmy.indeed.com\x22,\x22basePieUrl\x22:\x22https:\\u002F\\u002Faccount.indeed.com\x22,\x22baseRozUrl\x22:\x22https:\\u002F\\u002Fresumes.indeed.com\x22,\x22baseSecureUrl\x22:\x22https:\\u002F\\u002Fsecure.indeed.com\x22,\x22billingIssue\x22:\x22CAN_PURCHASE\x22,\x22canSwitchAccount\x22:false,\x22confirmed\x22:true,\x22country\x22:\x22US\x22,\x22csrfParam\x22:\x22indeedcsrftoken\x22,\x22csrfToken\x22:\x22RonYXgzB6OxlClQV4QY9woqaatyPStN8\x22,\x22currentRelativeUrl\x22:\x22\\u002Fresume\\u002Fd53377828e23d884?s\x3dl%3D%26q%3Dcat%2520sitter%26searchFields%3Djt\x22,\x22currentUrl\x22:\x22https:\\u002F\\u002Fresumes.indeed.com\\u002Fresume\\u002Fd53377828e23d884?s\x3dl%3D%26q%3Dcat%2520sitter%26searchFields%3Djt\x22,\x22currentUserAccountKey\x22:\x2286c56776fbc49dff\x22,\x22emailAddress\x22:\x22nyaa.studio.apps@gmail.com\x22,\x22featuredEmployer\x22:false,\x22isMasquerade\x22:false,\x22language\x22:\x22en\x22,\x22locale\x22:\x22en_US\x22,\x22loggedIn\x22:true,\x22masquerade\x22:false,\x22moderated\x22:false,\x22nonMonetizedMarket\x22:false,\x22privileged\x22:false,\x22showLaunchBanner\x22:true,\x22subscriptionInfo\x22:{\x22admin\x22:true,\x22bulkContact\x22:false,\x22contactsRemaining\x22:0,\x22hasUnassignedSubscription\x22:false,\x22hasUnlimitedContacts\x22:false,\x22subscriptionAssigned\x22:false,\x22trial\x22:false},\x22subscriptionsEnabled\x22:true},\x22contactRecord\x22:{\x22allowContact\x22:true,\x22allowRepeatedContact\x22:false,\x22contactedByCoworkerDate\x22:\x22\x22,\x22contactedByCoworkerEmail\x22:\x22\x22,\x22contactedByUserDate\x22:\x22\x22,\x22responseStatus\x22:\x22UNRESPONDED\x22},\x22countryOfEligibility\x22:\x22United States\x22,\x22eligibility\x22:\x22ELG\x22,\x22isSavedResume\x22:false,\x22resumeModel\x22:{\x22accountKey\x22:\x22d53377828e23d884\x22,\x22additionalInformation\x22:\x22Skills\\nWord 2010 and 2013, Excel, Powerpoint, computer and typing skills, interpersonal skills,\\norganizational skills, some ASL knowledge, love of animals, previous animal care experience\x22,\x22assessments\x22:[],\x22awards\x22:[],\x22certifications\x22:[],\x22education\x22:[{\x22dateRange\x22:\x22December 2015 to Present\x22,\x22degree\x22:\x22Liberal Arts degree\x22,\x22field\x22:\x22Liberal Arts\x22,\x22id\x22:\x22EecYz-PgixmaTKmUQsuaQg\x22,\x22location\x22:\x22Newtown, PA\x22,\x22university\x22:\x22Bucks County Community College\x22}],\x22email\x22:\x22\x22,\x22firstName\x22:\x22Cat Sitter\x22,\x22fullName\x22:\x22Cat Sitter\x22,\x22groups\x22:[],\x22headline\x22:\x22Cat Sitter - Local Residence\x22,\x22highlightedWords\x22:[\x22sitters\x22,\x22cat\x22,\x22sitter\x22],\x22id\x22:\x22EecYz-PgixWaTKmUQsuaQg\x22,\x22licenses\x22:[],\x22links\x22:[],\x22location\x22:\x22Perkasie, PA\x22,\x22militaryService\x22:[],\x22patents\x22:[],\x22phoneNumber\x22:\x22\x22,\x22publications\x22:[],\x22skills\x22:[{\x22id\x22:\x22EecYz-PgixqaTKmUQsuaQg\x22,\x22monthsOfExperience\x22:12,\x22skill\x22:\x22Excel\x22},{\x22id\x22:\x22EecYz-PgixuaTKmUQsuaQg\x22,\x22monthsOfExperience\x22:120,\x22skill\x22:\x22organizational skills\x22},{\x22id\x22:\x22EecYz-PgixyaTKmUQsuaQg\x22,\x22monthsOfExperience\x22:12,\x22skill\x22:\x22Powerpoint\x22},{\x22id\x22:\x22EecYz-Pgix2aTKmUQsuaQg\x22,\x22monthsOfExperience\x22:24,\x22skill\x22:\x22typing\x22},{\x22id\x22:\x22EecYz-Pgix6aTKmUQsuaQg\x22,\x22monthsOfExperience\x22:24,\x22skill\x22:\x22Word\x22},{\x22id\x22:\x22Eehfj2sVJeqeoM7c3iCmnw\x22,\x22monthsOfExperience\x22:72,\x22skill\x22:\x22working with animals\x22}],\x22summary\x22:\x22\x22,\x22updatedDate\x22:\x22May 26, 2019\x22,\x22workExperience\x22:[{\x22company\x22:\x22Local Residence\x22,\x22dateRange\x22:\x222015 to Present\x22,\x22description\x22:\x22Feed cats\\n●\\tClean litter boxes\\n●\\tDaily check-ins on cats\x22,\x22id\x22:\x22EecYz-PgixaaTKmUQsuaQg\x22,\x22location\x22:\x22Quakertown, PA\x22,\x22title\x22:\x22Cat Sitter\x22},{\x22company\x22:\x22Local Residence - Dog Walker\x22,\x22dateRange\x22:\x22January 2014 to January 2016\x22,\x22description\x22:\x22Walk dogs\\n●\\tFeed dogs\\n●\\tCheck-ins and play time with dogs\x22,\x22id\x22:\x22EecYz-PgixeaTKmUQsuaQg\x22,\x22location\x22:\x22Quakertown, PA\x22,\x22title\x22:\x22Dog Sitter\x22}]},\x22tk\x22:\x221dckr2vhn3p22800\x22}');</script><script>window.proctorGroups = JSON.parse('[[5,null],[1,null],[1,null],[9,null],[7,null],[1,null],[0,null],[19,null],[-1,null],[0,null],[3,null],[1,null],[19,null],[-1,null],[-1,null],[-1,null],[-1,null],[0,null],[1,null],[-1,null],[-1,\x22${contactName} sent you a message about your resume on Indeed.\x22],[1,null],[1,null],[-1,null],[0,null],[-1,null],[-1,null],[-1,null],[1,null],[1,null],[-1,null],[-1,null],[1,null],[1,null],[-1,null],[1,null],[-1,null],[1,null],[1,{\x22recaptchaThreshold\x22:0.49}],[2,null],[-1,null],[1,null],[1,null],[0,null],[1,null],[-1,null],[1,null],[1,null],[-1,null],[-1,null],[-1,null],[-1,null],[-1,null],[1,null],[-1,null],[-1,null],[1,{\x22accountBlocks\x22:[371495985,371492945,371496796,371495142,371255403,180896675,402708456],\x22ipBlocks\x22:[\x22142.93.160.149\x22,\x22156.213.187.109\x22,\x2254.144.251.118\x22,\x2254.160.231.37\x22,\x2254.161.232.223\x22,\x2254.163.111.234\x22,\x2254.166.201.27\x22,\x2254.167.132.121\x22,\x2254.211.243.158\x22,\x2254.221.65.205\x22,\x2254.234.36.11\x22,\x2254.235.23.71\x22,\x2254.242.123.36\x22,\x2254.242.125.90\x22,\x2254.242.94.44\x22,\x2254.91.29.30\x22,\x2254.81.91.102\x22,\x2218.130.133.224\x22,\x2218.130.98.215\x22,\x223.8.18.212\x22,\x223.8.20.40\x22,\x2234.206.53.38\x22,\x2282.12.238.32\x22,\x22137.135.96.20\x22,\x2213.90.195.83\x22,\x22137.135.96.20\x22,\x22106.51.66.119\x22,\x22116.75.87.250\x22,\x2213.90.195.83\x22,\x22104.131.19.173\x22,\x22106.51.66.119\x22,\x22108.2.166.209\x22,\x2212.133.183.51\x22,\x22163.198.35.32\x22,\x22168.62.165.43\x22,\x2218.203.123.118\x22,\x2223.96.14.105\x22,\x2252.60.89.234\x22,\x2271.14.194.130\x22,\x2273.2.223.45\x22]}],[1,null],[1,null],[1,null],[1,null],[-1,null],[-1,null],[-1,null],[-1,null],[1,null],[1,null],[1,null],[1,null],[-1,null],[2,null],[1,null],[1,null],[-1,null],[3,null]]');</script><script type="text/javascript" src="/static/b9c32234bdbed298be40/scripts/vendor.js"></script><script type="text/javascript" src="/static/f38ebfd/en_US.js"></script><script>!function(n){function r(n){for(var r=a,t=n.length;t;)r=33*r^n.charCodeAt(--t);return r>>>0}var t=this['indeed.i18n.localeData'],e=t['']||{},a=e.salt;if(e.hasOwnProperty('salt'))for(var i in n)t[function(n){var t=r(n);return e.hasOwnProperty('id_length')&&(t=String(t).substring(0,e.id_length)),t}(i)]=n[i];else for(var i in n)t[i]=[null].concat(n[i])}({"Email {0} job seeker":["Contact {0} job seeker","Contact {0} job seekers"],"Email":["Contact"],"Email {0}":["Contact {0}"],"Send Email":["Message"]});</script>
<script type="text/javascript" src="/static/70ab8de6e2102d523c43/scripts/resume-view-app.js"></script></body></html>"""

soup = BeautifulSoup(html, 'lxml')
script = soup.find_all("script")
pattern = re.compile('window.initialState = JSON.parse(.*);')

for i in script:
strObj = i.text
match = pattern.search(strObj)
if match:
jsonString = strObj.split("window.initialState = JSON.parse")[1][:-1].encode('utf8').decode('unicode_escape')
jsonData = json.loads(jsonString[2:-2], strict=False)
print(jsonData)

O/P:

{'commonModel': {'advertiser': 'Nyaa Studio', 'baseAdsUrl': 'https://ads.indeed.com', 'baseAnalyticsUrl': 'https://analytics.indeed.com', 'baseBillingUrl': 'https://billing.indeed.com', 'baseIndeedEmployerHelpUrl': 'https://indeedemployers.zendesk.com', 'baseIndeedUrl': 'https://www.indeed.com', 'baseMyIndeedUrl': 'https://my.indeed.com', 'basePieUrl': 'https://account.indeed.com', 'baseRozUrl': 'https://resumes.indeed.com', 'baseSecureUrl': 'https://secure.indeed.com', 'billingIssue': 'CAN_PURCHASE', 'canSwitchAccount': False, 'confirmed': True, 'country': 'US', 'csrfParam': 'indeedcsrftoken', 'csrfToken': 'RonYXgzB6OxlClQV4QY9woqaatyPStN8', 'currentRelativeUrl': '/resume/d53377828e23d884?s=l%3D%26q%3Dcat%2520sitter%26searchFields%3Djt', 'currentUrl': 'https://resumes.indeed.com/resume/d53377828e23d884?s=l%3D%26q%3Dcat%2520sitter%26searchFields%3Djt', 'currentUserAccountKey': '86c56776fbc49dff', 'emailAddress': 'nyaa.studio.apps@gmail.com', 'featuredEmployer': False, 'isMasquerade': False, 'language': 'en', 'locale': 'en_US', 'loggedIn': True, 'masquerade': False, 'moderated': False, 'nonMonetizedMarket': False, 'privileged': False, 'showLaunchBanner': True, 'subscriptionInfo': {'admin': True, 'bulkContact': False, 'contactsRemaining': 0, 'hasUnassignedSubscription': False, 'hasUnlimitedContacts': False, 'subscriptionAssigned': False, 'trial': False}, 'subscriptionsEnabled': True}, 'contactRecord': {'allowContact': True, 'allowRepeatedContact': False, 'contactedByCoworkerDate': '', 'contactedByCoworkerEmail': '', 'contactedByUserDate': '', 'responseStatus': 'UNRESPONDED'}, 'countryOfEligibility': 'United States', 'eligibility': 'ELG', 'isSavedResume': False, 'resumeModel': {'accountKey': 'd53377828e23d884', 'additionalInformation': 'Skills\nWord 2010 and 2013, Excel, Powerpoint, computer and typing skills, interpersonal skills,\norganizational skills, some ASL knowledge, love of animals, previous animal care experience', 'assessments': [], 'awards': [], 'certifications': [], 'education': [{'dateRange': 'December 2015 to Present', 'degree': 'Liberal Arts degree', 'field': 'Liberal Arts', 'id': 'EecYz-PgixmaTKmUQsuaQg', 'location': 'Newtown, PA', 'university': 'Bucks County Community College'}], 'email': '', 'firstName': 'Cat Sitter', 'fullName': 'Cat Sitter', 'groups': [], 'headline': 'Cat Sitter - Local Residence', 'highlightedWords': ['sitters', 'cat', 'sitter'], 'id': 'EecYz-PgixWaTKmUQsuaQg', 'licenses': [], 'links': [], 'location': 'Perkasie, PA', 'militaryService': [], 'patents': [], 'phoneNumber': '', 'publications': [], 'skills': [{'id': 'EecYz-PgixqaTKmUQsuaQg', 'monthsOfExperience': 12, 'skill': 'Excel'}, {'id': 'EecYz-PgixuaTKmUQsuaQg', 'monthsOfExperience': 120, 'skill': 'organizational skills'}, {'id': 'EecYz-PgixyaTKmUQsuaQg', 'monthsOfExperience': 12, 'skill': 'Powerpoint'}, {'id': 'EecYz-Pgix2aTKmUQsuaQg', 'monthsOfExperience': 24, 'skill': 'typing'}, {'id': 'EecYz-Pgix6aTKmUQsuaQg', 'monthsOfExperience': 24, 'skill': 'Word'}, {'id': 'Eehfj2sVJeqeoM7c3iCmnw', 'monthsOfExperience': 72, 'skill': 'working with animals'}], 'summary': '', 'updatedDate': 'May 26, 2019', 'workExperience': [{'company': 'Local Residence', 'dateRange': '2015 to Present', 'description': 'Feed cats\nâ\x97\x8f\tClean litter boxes\nâ\x97\x8f\tDaily check-ins on cats', 'id': 'EecYz-PgixaaTKmUQsuaQg', 'location': 'Quakertown, PA', 'title': 'Cat Sitter'}, {'company': 'Local Residence - Dog Walker', 'dateRange': 'January 2014 to January 2016', 'description': 'Walk dogs\nâ\x97\x8f\tFeed dogs\nâ\x97\x8f\tCheck-ins and play time with dogs', 'id': 'EecYz-PgixeaTKmUQsuaQg', 'location': 'Quakertown, PA', 'title': 'Dog Sitter'}]}, 'tk': '1dckr2vhn3p22800'}

关于javascript - 如何从网页中抓取 `window.initialState`?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/56486511/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com