- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover and :active? 上具有不同效果的 CSS 动画
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
目标:
使用 lexbor 查找并打印 HTML table 中某个 <td> 标签之间的值。Lexbor 的详细信息和源代码可以在这里找到。
更多详细信息:
表格中有很多 <td> 标签,每个标签都由唯一的 header 属性标识。下面是一个简单的例子,其中我们只对第一列的值 0.7 感兴趣(即带有 header="choose-this-header" 的标签)。
<table>
<tbody>
<tr>
<td header="choose-this-header">0.7</td>
<td header="ignore-this-header">1.3</td>
<td header="ignore-this-header">5.4</td>
</tr>
</tbody>
</table>
因此,找到该值的最佳方法似乎是:
1. 在 HTML 中搜索带有 header="chosen-header" 的元素;
2. 将 HTML 的这一行保存到缓冲区中;
3. 提取 <td>...</td> 标签之间的值。
问题:
下面的 step_one.c 基于 lexbor 的一个示例,它能成功检测到包含所需 header 的 HTML 行,但在终端上只打印出 <td header="choose-this-header">,没有文本值,也没有结束标签 </td>。如果有办法将整行(即 <td header="choose-this-header">0.7</td>)保存到缓冲区中,那么下面同样基于 lexbor 示例的程序 step_two.c 就可以用来提取文本值 0.7。
step_one.c
#include "base.h"
#include <lexbor/dom/dom.h>
/*
 * Pass every element stored in `collection` to serialize_node(), in index
 * order, then call lxb_dom_collection_clean() on the collection.
 */
static void
print_collection_elements(lxb_dom_collection_t *collection)
{
    size_t idx = 0;

    /* The length is queried again on every pass, exactly like an indexed for. */
    while (idx < lxb_dom_collection_length(collection)) {
        lxb_dom_element_t *current = lxb_dom_collection_element(collection,
                                                                 idx);

        serialize_node(lxb_dom_interface_node(current));
        idx++;
    }

    lxb_dom_collection_clean(collection);
}
/*
 * step_one.c entry point: print a small HTML table, parse it, ask
 * lxb_dom_elements_by_attr() for the elements under <body> whose "header"
 * attribute is "choose-this-header", and print whatever was collected.
 */
int
main(int argc, const char *argv[])
{
    /* Attribute to search for; lengths are derived with sizeof below. */
    static const lxb_char_t attr_name[] = "header";
    static const lxb_char_t attr_value[] = "choose-this-header";

    const lxb_char_t html[] = "<table>"
                              "<tbody>"
                              "<tr>"
                              "<td header=\"choose-this-header\">0.7</td>"
                              "<td header=\"ignore-this-header\">1.3</td>"
                              "<td header=\"ignore-this-header\">5.4</td>"
                              "</tr>"
                              "</tbody>"
                              "</table>";
    size_t html_size = sizeof(html) - 1;

    PRINT("HTML:");
    PRINT("%s", (const char *) html);

    lxb_html_document_t *document = parse(html, html_size);
    lxb_dom_element_t *body = lxb_dom_interface_element(document->body);

    lxb_dom_collection_t *collection =
        lxb_dom_collection_make(&document->dom_document, 128);
    if (collection == NULL) {
        FAILED("Failed to create Collection object");
    }

    /* Full match */
    lxb_status_t status = lxb_dom_elements_by_attr(body, collection,
                                                   attr_name,
                                                   sizeof(attr_name) - 1,
                                                   attr_value,
                                                   sizeof(attr_value) - 1,
                                                   true);
    if (status != LXB_STATUS_OK) {
        FAILED("Failed to get elements by name");
    }

    PRINT("\nFull match by 'choose-this-header':");
    print_collection_elements(collection);

    /* Release the collection first, then the document it was made from. */
    lxb_dom_collection_destroy(collection, true);
    lxb_html_document_destroy(document);

    return 0;
}
step_one.c 输出:
HTML:
<table><tbody><tr><td header="choose-this-header">0.7</td><td header="ignore-this-header">1.3</td><td header="ignore-this-header">5.4</td></tr></tbody></table>
Full match by 'choose-this-header':
<td header="choose-this-header"> // no text value or closing tag is printed
step_two.c
#include "lexbor/html/tokenizer.h"
/*
 * Write a printf-style message to stderr, follow it with a newline, and
 * terminate the process with EXIT_FAILURE.
 */
#define FAILED(...)                                                            \
    do {                                                                       \
        fprintf(stderr, __VA_ARGS__);                                          \
        fputc('\n', stderr);                                                   \
        exit(EXIT_FAILURE);                                                    \
    } while (0)
/*
 * Token-done callback: for a #text token, print the bytes between
 * token->text_start and token->text_end to stdout. Every token, text or not,
 * is returned as received.
 */
static lxb_html_token_t *
token_callback(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx)
{
    /* Anything that is not a #text token is passed through silently. */
    if (token->tag_id == LXB_TAG__TEXT) {
        int text_len = (int) (token->text_end - token->text_start);

        printf("%.*s", text_len, token->text_start);
    }

    return token;
}
/*
 * step_two.c entry point: feed a single <td> fragment to the lexbor HTML
 * tokenizer and let token_callback print the text it contains.
 */
int
main(int argc, const char *argv[])
{
    const lxb_char_t data[] = "<td headers=\"choose-this-header\">0.7</td>";
    const size_t data_len = sizeof(data) - 1;

    printf("HTML:\n%s\n\n", (char *) data);
    fputs("Result:\n", stdout);

    lxb_html_tokenizer_t *tkz = lxb_html_tokenizer_create();

    if (lxb_html_tokenizer_init(tkz) != LXB_STATUS_OK) {
        FAILED("Failed to create tokenizer object");
    }

    /* Set callback for token */
    lxb_html_tokenizer_callback_token_done_set(tkz, token_callback, NULL);

    if (lxb_html_tokenizer_begin(tkz) != LXB_STATUS_OK) {
        FAILED("Failed to prepare tokenizer object for parsing");
    }

    /* The whole input is handed over as one chunk. */
    if (lxb_html_tokenizer_chunk(tkz, data, data_len) != LXB_STATUS_OK) {
        FAILED("Failed to parse the html data");
    }

    if (lxb_html_tokenizer_end(tkz) != LXB_STATUS_OK) {
        FAILED("Failed to ending of parsing the html data");
    }

    fputs("\n", stdout);

    lxb_html_tokenizer_destroy(tkz);

    return 0;
}
step_two.c 输出:
HTML:
<td headers="choose-this-header">0.7</td>
Result:
0.7
其他详细信息:
- 使用 lexbor,因为它很快
- 系统:Ubuntu 20.04.1 LTS
- 使用 gcc myprogram.c -llexbor -o myprogram 进行编译
问题总结:
Q1. 如何修改程序 step_one.c,以便将整行(ENTIRE line)保存到缓冲区中?一旦实现了这一点,将两个程序合并成一个就会相对简单:step_two.c 中的变量 data[] 将是使用 step_one.c 中的逻辑找到的整行。
最佳答案
这是一个例子:
#include <lexbor/html/html.h>
#include <lexbor/css/css.h>
#include <lexbor/selectors/selectors.h>
/*
 * Serializer sink: print the `len` bytes at `data` to stdout (printf "%.*s")
 * and report success. `ctx` is not used.
 */
lxb_status_t
callback(const lxb_char_t *data, size_t len, void *ctx)
{
    const char *chunk = (const char *) data;
    int chunk_len = (int) len;

    printf("%.*s", chunk_len, chunk);

    return LXB_STATUS_OK;
}
/*
 * Selector-match callback: prints the matched node three ways, each under its
 * own heading, using `callback` as the output sink. The status of each
 * serialize call is intentionally ignored; the function always returns
 * LXB_STATUS_OK. `spec` and `ctx` are not used.
 *
 * Use lxb_html_serialize_*_str(...) instead of the *_cb variants to serialize
 * into a buffer.
 */
lxb_status_t
find_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec,
              void *ctx)
{
    /* The matched tag itself, without its children. */
    fputs("Tag:\n", stdout);
    (void) lxb_html_serialize_cb(node, callback, NULL);

    /* The matched element together with everything inside it. */
    fputs("\n\nTag with children:\n", stdout);
    (void) lxb_html_serialize_tree_cb(node, callback, NULL);

    /* Only what is inside the matched element. */
    fputs("\n\nChildren:\n", stdout);
    (void) lxb_html_serialize_deep_cb(node, callback, NULL);

    return LXB_STATUS_OK;
}
/*
 * Parse a small HTML table, select the <td> whose header attribute is
 * "choose-this-header" with a CSS selector, and let find_callback print each
 * match.
 *
 * Returns 0 on success and EXIT_FAILURE if any step fails. All failure paths
 * go through one cleanup label, so objects created before the failing step
 * are destroyed instead of being leaked by an early return.
 */
int main(void) {
    int exit_code = EXIT_FAILURE;
    lxb_status_t status;
    lxb_dom_node_t *body;

    /* Start as NULL so the cleanup code can tell what was actually created. */
    lxb_html_document_t *document = NULL;
    lxb_css_parser_t *parser = NULL;
    lxb_selectors_t *selectors = NULL;
    lxb_css_selector_list_t *list = NULL;

    const lxb_char_t html[] = "<table>"
                              "<tbody>"
                              "<tr>"
                              "<td header=\"choose-this-header\">0.7</td>"
                              "<td header=\"ignore-this-header\">1.3</td>"
                              "<td header=\"ignore-this-header\">5.4</td>"
                              "</tr>"
                              "</tbody>"
                              "</table>";
    static const lxb_char_t slctrs[] = "td[header='choose-this-header']";

    document = lxb_html_document_create();
    if (document == NULL) {
        goto cleanup;
    }

    status = lxb_html_document_parse(document, html, sizeof(html) - 1);
    if (status != LXB_STATUS_OK) {
        goto cleanup;
    }

    /* Create CSS parser. */
    parser = lxb_css_parser_create();
    status = lxb_css_parser_init(parser, NULL, NULL);
    if (status != LXB_STATUS_OK) {
        goto cleanup;
    }

    /* Selectors. */
    selectors = lxb_selectors_create();
    status = lxb_selectors_init(selectors);
    if (status != LXB_STATUS_OK) {
        goto cleanup;
    }

    list = lxb_css_selectors_parse(parser, slctrs, sizeof(slctrs) - 1);
    if (parser->status != LXB_STATUS_OK) {
        goto cleanup;
    }

    /* Find DOM/HTML nodes by selectors. */
    body = lxb_dom_interface_node(lxb_html_document_body_element(document));
    if (body == NULL) {
        goto cleanup;
    }

    status = lxb_selectors_find(selectors, body, list, find_callback, NULL);
    if (status != LXB_STATUS_OK) {
        goto cleanup;
    }

    printf("\n");
    exit_code = 0;

cleanup:
    /* Same teardown order as the success path always used. */

    /* Destroy Selectors object. */
    if (selectors != NULL) {
        (void) lxb_selectors_destroy(selectors, true);
    }

    /* Destroy resources for CSS Parser. */
    if (parser != NULL) {
        (void) lxb_css_parser_destroy(parser, true);
    }

    /* Destroy all Selector List memory. */
    if (list != NULL) {
        lxb_css_selector_list_destroy_memory(list);
    }

    /* Destroy HTML Document. */
    if (document != NULL) {
        lxb_html_document_destroy(document);
    }

    return exit_code;
}
输出:
Tag:
<td header="choose-this-header">
Tag with children:
<td header="choose-this-header">0.7</td>
Children:
0.7
关于 html - Lexbor:webscraping an HTML table in C,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/69827153/
我是一名优秀的程序员,十分优秀!