gpt4 book ai didi

c - 如何在c中将相对url转换为绝对url

转载 作者:行者123 更新时间:2023-11-30 16:14:55 25 4
gpt4 key购买 nike

我正在构建一个网络爬虫。我可以下载页面,并使用 gumbo 从网页中提取 href。但提取到的许多 url 都是相对 url。如何使用 C 将相对 url 转换为绝对 url?

以下是一些使用gumbo的代码。

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <strings.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>

#include <gumbo.h>
#include <openssl/ssl.h>
#include <openssl/err.h>

#include "http_parser.h"

#define MAX_HEADER_BYTES 8192

/*
 * Walks the subtree rooted at `node` depth-first and prints the value of the
 * "href" attribute of every <a> element that has one, one per line, in the
 * form "Got url <value>". Non-element nodes are ignored.
 */
static void search_for_links(GumboNode* node) {
    if (node->type == GUMBO_NODE_ELEMENT) {
        if (node->v.element.tag == GUMBO_TAG_A) {
            GumboAttribute* link =
                gumbo_get_attribute(&node->v.element.attributes, "href");
            if (link) {
                printf("Got url %s\n", link->value);
            }
        }

        /* Recurse into every child, in document order. */
        GumboVector* kids = &node->v.element.children;
        unsigned int idx = 0;
        while (idx < kids->length) {
            search_for_links((GumboNode *)kids->data[idx]);
            ++idx;
        }
    }
}

/*
 * Looks up the text of the <title> element inside the <head> child of `root`.
 *
 * root: the document's root element; must be an element node with at least
 *       two children (both are asserted).
 *
 * Returns:
 *   - the title's text (a pointer into the parse tree; the caller must not
 *     free it) when <title> has exactly one text/whitespace child;
 *   - "<empty title>" when <title> does not have exactly one child, or that
 *     child is not a text/whitespace node;
 *   - "<no title found>" when there is no <head> or no <title> in it.
 */
static const char* find_title(const GumboNode* root) {
    assert(root->type == GUMBO_NODE_ELEMENT);
    assert(root->v.element.children.length >= 2);

    /* Locate the <head> element among the root's direct children. */
    const GumboVector* root_children = &root->v.element.children;
    const GumboNode* head = NULL;
    for (unsigned int i = 0; i < root_children->length; ++i) {
        const GumboNode* child = root_children->data[i];
        if (child->type == GUMBO_NODE_ELEMENT &&
            child->v.element.tag == GUMBO_TAG_HEAD) {
            head = child;
            break;
        }
    }
    /*
     * Checked at run time rather than with assert() so that a build with
     * NDEBUG does not dereference a null pointer.
     */
    if (head == NULL) {
        return "<no title found>";
    }

    const GumboVector* head_children = &head->v.element.children;
    for (unsigned int i = 0; i < head_children->length; ++i) {
        const GumboNode* child = head_children->data[i];
        if (child->type != GUMBO_NODE_ELEMENT ||
            child->v.element.tag != GUMBO_TAG_TITLE) {
            continue;
        }
        if (child->v.element.children.length != 1) {
            return "<empty title>";
        }
        const GumboNode* title_text = child->v.element.children.data[0];
        /* Only text-like nodes carry v.text; anything else has no title text. */
        if (title_text->type != GUMBO_NODE_TEXT &&
            title_text->type != GUMBO_NODE_WHITESPACE) {
            return "<empty title>";
        }
        return title_text->v.text.text;
    }
    return "<no title found>";
}

/* Buffer sizes used while handling one URL. */
enum {
    SCHEMA_BYTES = 1024,
    HOST_BYTES = 1024,
    PATH_BYTES = 8192,
    /* Large enough for the request line, the Host header and the fixed text. */
    REQUEST_BYTES = PATH_BYTES + HOST_BYTES + 128
};

/*
 * Returns the value part of an HTTP header line, or NULL when `line` does not
 * start with `name`. `name` must include the trailing ':' and is compared
 * case-insensitively; blanks between the colon and the value are skipped.
 */
static const char *header_value(const char *line, const char *name)
{
    size_t name_len = strlen(name);

    if (strncasecmp(line, name, name_len) != 0) {
        return NULL;
    }
    line += name_len;
    while (*line == ' ' || *line == '\t') {
        line++;
    }
    return line;
}

/*
 * Writes all `len` bytes of `buf` to the peer, through `ssl` when it is
 * non-NULL and through the plain socket `sockfd` otherwise.
 * Returns 0 on success; on failure reports the error on stderr and returns -1.
 */
static int send_all(int sockfd, SSL *ssl, const char *buf, size_t len)
{
    size_t sent = 0;

    while (sent < len) {
        if (ssl != NULL) {
            int n = SSL_write(ssl, buf + sent, (int)(len - sent));
            if (n <= 0) {
                ERR_print_errors_fp(stderr);
                return -1;
            }
            sent += (size_t)n;
        } else {
            ssize_t n = send(sockfd, buf + sent, len - sent, 0);
            if (n <= 0) {
                perror("send");
                return -1;
            }
            sent += (size_t)n;
        }
    }
    return 0;
}

/*
 * Reads at most `cap` bytes into `buf`, through `ssl` when it is non-NULL and
 * through the plain socket `sockfd` otherwise.
 * Returns the number of bytes read, 0 when no more data is available, or -1
 * on a socket error (errno is set by recv). A TLS read that returns <= 0 is
 * reported as 0, i.e. treated as the end of the stream.
 */
static ssize_t recv_some(int sockfd, SSL *ssl, char *buf, size_t cap)
{
    if (ssl != NULL) {
        int n = SSL_read(ssl, buf, (int)cap);
        return n > 0 ? (ssize_t)n : 0;
    }
    return recv(sockfd, buf, cap, 0);
}

/*
 * Processes one URL: sends a HEAD request, and when the answer is a
 * "200 text/html" response with a positive Content-Length, sends a GET on the
 * same connection, prints the response, and prints the page title and every
 * link found in the HTML.
 *
 * ctx:  TLS context used for https URLs.
 * line: the URL, NUL-terminated, without a trailing newline.
 *
 * Returns 0 when the caller should go on with the next URL (including when
 * this URL was skipped because of a per-URL error), or -1 on an error that
 * should stop the whole run (socket creation failure, out of memory).
 */
static int crawl_url(SSL_CTX *ctx, const char *line)
{
    char buffer[MAX_HEADER_BYTES];
    char request[REQUEST_BYTES];
    char schema[SCHEMA_BYTES];
    char host[HOST_BYTES];
    char path[PATH_BYTES];
    char content_type[1024] = "";
    long content_length = 0;
    int return_code = 0;
    int portno;
    int sockfd = -1;
    int written;
    int rc = 0;
    size_t capacity;
    size_t total = 0;
    ssize_t got;
    char *save = NULL;
    char *get = NULL;
    char *body;
    SSL *ssl = NULL;
    GumboOutput *output = NULL;
    struct http_parser_url u;
    struct sockaddr_in serv_addr;
    struct hostent *server;

    /* ---- Split the URL into schema, host and path. ---- */
    if (http_parser_parse_url(line, strlen(line), 0, &u) != 0) {
        fprintf(stderr, "http_parser_parse_url() failed\n");
        return 0;
    }
    if (!(u.field_set & (1 << UF_HOST))) {
        fprintf(stderr, "No host name in line %s\n", line);
        return 0;
    }
    if (u.field_data[UF_HOST].len >= sizeof host) {
        fprintf(stderr, "Host name too long in line %s\n", line);
        return 0;
    }
    memcpy(host, line + u.field_data[UF_HOST].off, u.field_data[UF_HOST].len);
    host[u.field_data[UF_HOST].len] = '\0';

    if ((u.field_set & (1 << UF_SCHEMA)) &&
        u.field_data[UF_SCHEMA].len < sizeof schema) {
        memcpy(schema, line + u.field_data[UF_SCHEMA].off,
               u.field_data[UF_SCHEMA].len);
        schema[u.field_data[UF_SCHEMA].len] = '\0';
    } else {
        strcpy(schema, "http");
    }

    if (u.field_set & (1 << UF_PATH)) {
        /* Everything from the start of the path to the end of the line. */
        written = snprintf(path, sizeof path, "%s",
                           line + u.field_data[UF_PATH].off);
        if (written < 0 || (size_t)written >= sizeof path) {
            fprintf(stderr, "Path too long in line %s\n", line);
            return 0;
        }
    } else {
        strcpy(path, "/");
    }

    if (strcmp(schema, "https") == 0) {
        printf("https detected. Using port no 443\n");
        portno = 443;
    } else {
        printf("http detected. Using port no 80\n");
        portno = 80;
    }

    /* ---- Resolve and connect. ---- */
    server = gethostbyname(host);
    if (server == NULL || server->h_addrtype != AF_INET ||
        (size_t)server->h_length > sizeof serv_addr.sin_addr) {
        fprintf(stderr, "ERROR, no such host as %s\n", host);
        return 0;
    }

    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("ERROR opening socket");
        return -1;
    }

    memset(&serv_addr, 0, sizeof serv_addr);
    serv_addr.sin_family = AF_INET;
    memcpy(&serv_addr.sin_addr, server->h_addr_list[0],
           (size_t)server->h_length);
    serv_addr.sin_port = htons((unsigned short)portno);

    if (connect(sockfd, (struct sockaddr *)&serv_addr, sizeof serv_addr) < 0) {
        perror("ERROR connecting");
        goto done;
    }

    if (portno == 443) {
        ssl = SSL_new(ctx);
        /* SSL_connect reports success only with a return value of 1. */
        if (ssl == NULL || SSL_set_fd(ssl, sockfd) != 1 ||
            SSL_connect(ssl) != 1) {
            ERR_print_errors_fp(stderr);
            goto done;
        }
        printf("Connected with %s encryption\n", SSL_get_cipher(ssl));
    }

    /* ---- HEAD request: learn status, type and length. ---- */
    written = snprintf(request, sizeof request,
                       "HEAD %s HTTP/1.1\r\nHost: %s\r\n\r\n", path, host);
    if (written < 0 || (size_t)written >= sizeof request) {
        fprintf(stderr, "Request too long for line %s\n", line);
        goto done;
    }
    /* A failed send only skips this URL, like a failed connect or recv. */
    if (send_all(sockfd, ssl, request, (size_t)written) != 0) {
        goto done;
    }

    /* Leave room for the terminator that the header parsing relies on. */
    got = recv_some(sockfd, ssl, buffer, sizeof buffer - 1);
    if (got < 0) {
        perror("recv");
        goto done;
    }
    buffer[got] = '\0';
    if (ssl != NULL) {
        printf("Received: \"%s\"\n", buffer);
    }

    for (char *tok = strtok_r(buffer, "\r\n", &save); tok != NULL;
         tok = strtok_r(NULL, "\r\n", &save)) {
        const char *value;

        if ((value = header_value(tok, "Content-Length:")) != NULL) {
            content_length = strtol(value, NULL, 10);
        } else if ((value = header_value(tok, "Content-Type:")) != NULL) {
            /* Keep only the media type: stop at parameters or blanks. */
            size_t type_len = strcspn(value, "; \t");
            if (type_len >= sizeof content_type) {
                type_len = sizeof content_type - 1;
            }
            memcpy(content_type, value, type_len);
            content_type[type_len] = '\0';
        } else if (strncmp(tok, "HTTP/1.1", strlen("HTTP/1.1")) == 0) {
            /* Status line: the code is the token after the first space. */
            const char *space = strchr(tok, ' ');
            if (space != NULL) {
                return_code = atoi(space + 1);
            }
        }
    }

    if (return_code != 200 || strcmp(content_type, "text/html") != 0 ||
        content_length <= 0) {
        goto done;
    }

    /* ---- GET request: fetch headers plus body. ---- */
    capacity = (size_t)content_length + MAX_HEADER_BYTES;
    get = malloc(capacity + 1); /* +1 for the terminating NUL */
    if (get == NULL) {
        fprintf(stdout, "Out of memory\n");
        rc = -1;
        goto done;
    }

    printf("Return code = %d\n", return_code);
    printf("Content-Length = %ld\n", content_length);
    printf("Sending GET request to %s\n", line);
    written = snprintf(request, sizeof request,
                       "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n",
                       path, host);
    if (written < 0 || (size_t)written >= sizeof request) {
        fprintf(stderr, "Request too long for line %s\n", line);
        goto done;
    }
    if (send_all(sockfd, ssl, request, (size_t)written) != 0) {
        goto done;
    }
    printf("%s", request);
    printf("Receiving response from %s\n", line);

    /* Never read past the allocation, whatever the server sends. */
    while (total < capacity) {
        size_t want = capacity - total;
        if (want > MAX_HEADER_BYTES) {
            want = MAX_HEADER_BYTES;
        }
        got = recv_some(sockfd, ssl, get + total, want);
        if (got < 0) {
            perror("recv");
            goto done;
        }
        if (got == 0) {
            break;
        }
        total += (size_t)got;
        printf("Received %zu bytes\n", (size_t)got);
        if (ssl != NULL) {
            printf("Received total of %zu bytes of %ld\n", total,
                   content_length);
        }
    }
    get[total] = '\0';

    printf("%s\n", get);

    /* ---- Split headers from body and parse the HTML. ---- */
    body = strstr(get, "\r\n\r\n");
    if (body == NULL) {
        goto done;
    }
    *body = '\0';
    body += strlen("\r\n\r\n");
    printf("\n\nHTML\n\n%s\n\n", body);

    output = gumbo_parse(body);
    if (output == NULL) {
        printf("gumbo_parse() failed with %s", line);
        goto done;
    }
    printf("%s\n", find_title(output->root));
    search_for_links(output->root);

done:
    if (output != NULL) {
        gumbo_destroy_output(&kGumboDefaultOptions, output);
    }
    free(get);
    if (ssl != NULL) {
        SSL_free(ssl);
    }
    if (sockfd >= 0) {
        close(sockfd);
    }
    return rc;
}

/*
 * Entry point. argv[1] names a text file holding one URL per line; each URL
 * is processed in turn by crawl_url().
 *
 * Returns EXIT_SUCCESS after the whole file has been processed, or
 * EXIT_FAILURE when the argument is missing, the file cannot be opened, or
 * crawl_url() reports an error that stops the run.
 */
int main(int argc, char *argv[])
{
    SSL_CTX *ctx;
    FILE *fp;
    char *line = NULL;
    size_t len = 0;
    ssize_t nread;
    int status = EXIT_SUCCESS;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <url-list-file>\n",
                argc > 0 ? argv[0] : "crawler");
        return EXIT_FAILURE;
    }

    SSL_library_init();
    SSL_load_error_strings();
    OpenSSL_add_all_algorithms();

    ctx = SSL_CTX_new(TLS_client_method());
    if (ctx == NULL) {
        ERR_print_errors_fp(stderr);
        abort();
    }

    fp = fopen(argv[1], "r");
    if (fp == NULL) {
        perror(argv[1]);
        SSL_CTX_free(ctx);
        return EXIT_FAILURE;
    }

    while ((nread = getline(&line, &len, fp)) != -1) {
        /* Strip the line terminator (LF or CRLF). */
        while (nread > 0 &&
               (line[nread - 1] == '\n' || line[nread - 1] == '\r')) {
            line[--nread] = '\0';
        }
        if (crawl_url(ctx, line) != 0) {
            status = EXIT_FAILURE;
            break;
        }
    }

    free(line);
    fclose(fp);
    SSL_CTX_free(ctx);
    return status;
}

上面的代码需要一个文件作为输入,文件中每行包含一个 URL。

我希望能够用几行 C 代码来完成此操作,但如果它有效的话,我可以接受使用一些库。

最佳答案

使用 libwget 库,您可以将相对 URI 转换为绝对 URI,如下所示:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <wget.h>

/*
 * Resolves the relative URL "../x.png" against a fixed base URL with libwget
 * and prints the resulting absolute URL.
 *
 * Returns EXIT_SUCCESS when the URL was resolved and printed, EXIT_FAILURE
 * when the base could not be parsed, the buffer could not be allocated, or
 * the resolution failed.
 */
int main (int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    int status = EXIT_FAILURE;
    wget_iri *base = wget_iri_parse("http://example.com/subdir/y.html?e=2", NULL);
    wget_buffer *buf = wget_buffer_alloc(128);

    if (base != NULL && buf != NULL) {
        const char *relative_url = "../x.png";
        const char *absolute_url =
            wget_iri_relative_to_abs(base, relative_url, strlen(relative_url), buf);

        /* Never hand a null pointer to "%s". */
        if (absolute_url != NULL) {
            printf("%s\n", absolute_url);
            status = EXIT_SUCCESS;
        }
    }

    /* Release only what was actually allocated. */
    if (buf != NULL) {
        wget_buffer_free(&buf);
    }
    if (base != NULL) {
        wget_iri_free(&base);
    }
    return status;
}

关于c - 如何在c中将相对url转换为绝对url,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/57348102/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com