gpt4 book ai didi

python - 使用 beautifulsoup 从 <script> 标签中获取数据

转载 作者:行者123 更新时间:2023-12-05 06:02:01 32 4
gpt4 key购买 nike

我使用 Beautifulsoup 从网站获取数据,我需要的数据位于 <script> 标记内。

我收到类似这样的响应,想获取“name”、“thumbnailUrl”、“account”、“Id”的内容:

<script type="text/javascript">
var modelData = {
name: 'somename',
thumbnailUrl: 'https://website.com/blob/bG9uZ2RhbjovL0ZPVVIvbGRwcm9kLWRlL3ljb3B6YTY4N0pnQ2Nfc3JYcVV3VXc9PQ',
account: '5LH7J44IYPAGEZEYA9KIL',
Id: 'someid'
};
store.initOmlib({"ClusterEndpoints":{"ONE":["http://us.site.me"],"TWO":["http://sg.site.me"],"FOUR":["http://de.site.me"],"FIVE":["http://in.site.me"],"SIX":["http://ja.site.me"],"SEVEN":["http://br.site.me"]},"ClusterEndpointsInternal":{"ONE":["http://usi.site.me"],"TWO":["http://sgi.site.me"],"FOUR":["http://dei.site.me"],"FIVE":["http://ini.site.me"],"SIX":["http://jai.site.me"],"SEVEN":["http://bri.site.me"]},"ClusterKeys":{"FIVE":"Cf0Mw0I2/cZf6alwfMhelEEOb6xq23IhPvC9E4eoaXU=","SIX":"bfYXVkWhs/gv+TCJ3EeeEE3oxiZRDpJO0fecUGdq2Qg=","ONE":"xkkzyNJmZ1DmNPxGwrykZ2O91f10KNXQvspa15nKKGs=","FOUR":"xMRCvh1eki9JEceBcV7Bx49uaQYpX8FdD0eZ+LCGqCc=","TWO":"XaG4I7b7wDOZ+lGHSPwbJ2HLkIFf0UGYAWz9c9LkiQk=","SEVEN":"LuSOGA/u5PL7rP8PG3cr6bqgQy7jXEv65iuHUX9ePQY="},"DefaultCluster":"ONE","IdpEndpoints":["http://idp.site.me"],"IdpKey":"MIOC9PS8KIwXOXSHtplBZLSpIqcifns0jzExtkHXw1g=","ReadOnlyEndpoints":["http://site.gg"],"ReadOnlyKey":"QKxHfdLVgbn+VYpnUiCFLMq/hhUpkpx7occEY3Z0Wnk="}, {"Id":"001026a1c1064a1b9305400814783c2385e2a978f13a","Secret":"0110de13b2187fe3078e13d9f6ad4e7567fdc143e915c9cb4df67ca"});

if (store.renderArc) {
store.renderArc(document.getElementById('root'), modelData, translateTable);
} else {
store.renderUser(document.getElementById('root'), modelData, translateTable);
}
</script>

我得到上述响应的代码是这样的:

# Download the page and print the <script> tag that holds the data.
url = 'https://website.com'
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(url) as response:
    soup = BeautifulSoup(response.read(), "html.parser")
# Index 6 selects the seventh <script type="text/javascript"> tag of the page.
results = soup.find_all("script", {"type": "text/javascript"})[6]
print(results)

如何进入 <script> 标签并获取其中的数据?我已经查看了这里的其他几篇帖子,但其中的方法我都没能成功运行。

最佳答案

您可以使用此示例将 javascript 对象转换为 JSON:

import re
import json
from bs4 import BeautifulSoup


# Sample markup used by the example: a single <script> tag whose JavaScript
# assigns an object literal to ``modelData`` (unquoted keys, single-quoted
# string values) and then calls into ``store``.
html_doc = """
<script type="text/javascript">
var modelData = {
name: 'somename',
thumbnailUrl: 'https://website.com/blob/bG9uZ2RhbjovL0ZPVVIvbGRwcm9kLWRlL3ljb3B6YTY4N0pnQ2Nfc3JYcVV3VXc9PQ',
account: '5LH7J44IYPAGEZEYA9KIL',
Id: 'someid'
};
store.initOmlib({"ClusterEndpoints":{"ONE":["http://us.site.me"],"TWO":["http://sg.site.me"],"FOUR":["http://de.site.me"],"FIVE":["http://in.site.me"],"SIX":["http://ja.site.me"],"SEVEN":["http://br.site.me"]},"ClusterEndpointsInternal":{"ONE":["http://usi.site.me"],"TWO":["http://sgi.site.me"],"FOUR":["http://dei.site.me"],"FIVE":["http://ini.site.me"],"SIX":["http://jai.site.me"],"SEVEN":["http://bri.site.me"]},"ClusterKeys":{"FIVE":"Cf0Mw0I2/cZf6alwfMhelEEOb6xq23IhPvC9E4eoaXU=","SIX":"bfYXVkWhs/gv+TCJ3EeeEE3oxiZRDpJO0fecUGdq2Qg=","ONE":"xkkzyNJmZ1DmNPxGwrykZ2O91f10KNXQvspa15nKKGs=","FOUR":"xMRCvh1eki9JEceBcV7Bx49uaQYpX8FdD0eZ+LCGqCc=","TWO":"XaG4I7b7wDOZ+lGHSPwbJ2HLkIFf0UGYAWz9c9LkiQk=","SEVEN":"LuSOGA/u5PL7rP8PG3cr6bqgQy7jXEv65iuHUX9ePQY="},"DefaultCluster":"ONE","IdpEndpoints":["http://idp.site.me"],"IdpKey":"MIOC9PS8KIwXOXSHtplBZLSpIqcifns0jzExtkHXw1g=","ReadOnlyEndpoints":["http://site.gg"],"ReadOnlyKey":"QKxHfdLVgbn+VYpnUiCFLMq/hhUpkpx7occEY3Z0Wnk="}, {"Id":"001026a1c1064a1b9305400814783c2385e2a978f13a","Secret":"0110de13b2187fe3078e13d9f6ad4e7567fdc143e915c9cb4df67ca"});

if (store.renderArc) {
store.renderArc(document.getElementById('root'), modelData, translateTable);
} else {
store.renderUser(document.getElementById('root'), modelData, translateTable);
}
</script>
"""

# Tokens of a JavaScript object literal that need attention when turning it
# into JSON. String literals are consumed whole, so a colon, comma or quote
# character inside a value is never mistaken for syntax.
_JS_TOKEN = re.compile(
    r'"(?:[^"\\]|\\.)*"'            # double-quoted string: kept as is
    r"|'((?:[^'\\]|\\.)*)'"         # single-quoted string: body in group 1
    r"|([A-Za-z_$][\w$]*)(?=\s*:)"  # bare identifier used as a key: group 2
    r"|,(?=\s*[}\]])",              # trailing comma before } or ]: dropped
    flags=re.S,
)


def _fix_quote_escape(match):
    """Adapt one escape sequence or bare double quote to a JSON string body."""
    token = match.group(0)
    if token == "\\'":
        # JSON has no \' escape; the apostrophe needs no escaping at all.
        return "'"
    if token == '"':
        # A bare double quote would terminate the JSON string early.
        return '\\"'
    # Every other escape sequence is passed through untouched.
    return token


def _js_token_to_json(match):
    """Return the JSON spelling of one token matched by ``_JS_TOKEN``."""
    single_quoted, bare_key = match.group(1), match.group(2)
    if single_quoted is not None:
        body = re.sub(r'\\(.)|"', _fix_quote_escape, single_quoted, flags=re.S)
        return '"' + body + '"'
    if bare_key is not None:
        return '"' + bare_key + '"'
    if match.group(0) == ",":
        return ""
    return match.group(0)


soup = BeautifulSoup(html_doc, "html.parser")

# locate the script, get the contents
script_tag = soup.select_one("script")
if script_tag is None or not script_tag.contents:
    raise ValueError("no non-empty <script> tag found in the document")
script_text = script_tag.contents[0]

# get javascript object inside the script
model_match = re.search(r"modelData = ({.*?});", script_text, flags=re.S)
if model_match is None:
    raise ValueError("modelData object not found in the script")
model_data = model_match.group(1)

# "convert" the javascript object to json-valid object: quote the bare keys
# and re-quote the single-quoted strings without touching their contents
model_data = _JS_TOKEN.sub(_js_token_to_json, model_data)

# json decode the object (raises json.JSONDecodeError, a ValueError, when the
# literal uses JavaScript syntax that has no JSON equivalent)
model_data = json.loads(model_data)

# print the data
print(model_data["name"])
print(model_data["thumbnailUrl"])
print(model_data["account"])

打印:

somename
https://website.com/blob/bG9uZ2RhbjovL0ZPVVIvbGRwcm9kLWRlL3ljb3B6YTY4N0pnQ2Nfc3JYcVV3VXc9PQ
5LH7J44IYPAGEZEYA9KIL

或者:用re解析即可(html_doc与前面的例子相同):

def _extract_js_field(field, script):
    """Return the single-quoted value assigned to *field* in *script*.

    Raises:
        ValueError: if no ``field: '...'`` assignment is present.
    """
    # \b keeps a short key such as "name" from matching inside "username".
    found = re.search(r"\b" + re.escape(field) + r": '(.*?)'", script)
    if found is None:
        raise ValueError("field %r not found in the script" % field)
    return found.group(1)


soup = BeautifulSoup(html_doc, "html.parser")
script_text = soup.select_one("script").contents[0]

name = _extract_js_field("name", script_text)
thumbnailUrl = _extract_js_field("thumbnailUrl", script_text)
account = _extract_js_field("account", script_text)

print(name)
print(thumbnailUrl)
print(account)

打印:

somename
https://website.com/blob/bG9uZ2RhbjovL0ZPVVIvbGRwcm9kLWRlL3ljb3B6YTY4N0pnQ2Nfc3JYcVV3VXc9PQ
5LH7J44IYPAGEZEYA9KIL

关于python - 使用 beautifulsoup 从 <script> 标签中获取数据,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/67051131/

32 4 0