gpt4 book ai didi

C检查x是否在文件开头时如何跳过BOM

转载 作者:太空宇宙 更新时间:2023-11-04 00:34:37 25 4
gpt4 key购买 nike

在 C 数组/字符串中,如果文件可能带有 BOM,我该如何正确检测文件开头是否是某个内容 x?因为 BOM 有时占用 1 个字符,有时占用 3 个字符,还有些时候 BOM 根本不存在,所以 x 的实际位置并不总是从索引 0 开始。

大多数情况下 BOM 是(十六进制)“ef bb bf”,例如:

ef bb bf 23 21 2f 62 69 6e 2f 62 61 73 68 0a 61 20 26 26 20 62 0a 67 20 : ...#!/bin/bash.a && b.g 

会是这样吗?

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/*
 * Result of probing a buffer for a byte order mark (BOM).
 * Filled in by the ifbom / elifbom / elbom macros.
 */
struct BOM {
    int is_BOM;             /* non-zero when a BOM was recognised */
    int length;             /* number of bytes the BOM occupies */
    int type;               /* numeric id chosen by the caller for the encoding */
    char *type_as_string;   /* human-readable name of the encoding */
    char *BOM;              /* the BOM byte sequence (or a placeholder text) */
};  /* the terminating ';' is required: without it the next definition is
       parsed as part of this declaration */

/*
 * Report whether the first length_ bytes of string_ equal a BOM pattern.
 *
 * BOM_    : pattern preceded by a one-byte marker (callers in this file use
 *           '^'); the marker is skipped and the bytes after it are compared.
 * string_ : data to test. Bytes are read up to the first mismatch, so the
 *           buffer must be readable that far (at most length_ bytes).
 * length_ : number of pattern bytes to compare.
 *
 * Returns 1 when all length_ bytes are equal, 0 otherwise. A NULL argument
 * or a non-positive length_ yields 0, because an empty pattern is not a BOM.
 */
int matches(const char *BOM_, const char *string_, int length_)
{
    if (BOM_ == NULL || string_ == NULL || length_ <= 0) {
        return 0;
    }

    const char *pattern = BOM_ + 1; /* skip the leading marker byte */

    for (int i = 0; i < length_; i++) {
        if (string_[i] != pattern[i]) {
            return 0;
        }
    }
    return 1;
}

/*
 * ifbom: opening branch of a BOM-detection chain.
 *
 * Expands to an `if` statement. When matches(BOM_, string_, length_) is
 * non-zero, the remaining arguments are stored into the fields of bom_struct.
 * BOM_ is expected to begin with a one-character '^' marker; the pointer
 * stored in bom_struct.BOM points just past that marker.
 *
 * The expansion is intentionally a bare `if (...) { ... }` rather than a
 * do/while wrapper, so that an `else` may follow it directly.
 * Every argument is parenthesised so expression arguments expand safely.
 */
#define ifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_) \
    if (matches((BOM_), (string_), (length_))) { \
        (bom_struct).is_BOM = (is_BOM_); \
        (bom_struct).length = (length_); \
        (bom_struct).type = (type_); \
        (bom_struct).type_as_string = (type_as_string_); \
        (bom_struct).BOM = (BOM_) + 1; /* remove the ^ at the start */ \
    }

/*
 * elifbom: continuation branch of a BOM-detection chain.
 *
 * Accepts the same seven arguments as ifbom and expands to `else` followed
 * by the ifbom expansion, so it has to be placed immediately after an
 * ifbom or another elifbom.
 */
#define elifbom(out, found, nbytes, kind, label, pattern, input) \
    else ifbom(out, found, nbytes, kind, label, pattern, input)

/*
 * elbom: closing branch of a BOM-detection chain.
 *
 * Expands to an unconditional `else { ... }` that stores the arguments into
 * the fields of bom_struct. Unlike ifbom, BOM_ is stored as given: no
 * leading marker character is skipped.
 * Every argument is parenthesised so expression arguments expand safely.
 */
#define elbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_) \
    else { \
        (bom_struct).is_BOM = (is_BOM_); \
        (bom_struct).length = (length_); \
        (bom_struct).type = (type_); \
        (bom_struct).type_as_string = (type_as_string_); \
        (bom_struct).BOM = (BOM_); \
    }

/*
 * Output modes selected through the global `mode`.
 *
 * These are enum constants rather than object-like macros: a macro named
 * `cat` or `hex` would silently rewrite every identifier with that name.
 */
enum {
    cat = 0,    /* printable text only */
    hex = 1,    /* hexadecimal bytes only */
    both = 2,   /* hexadecimal bytes, padding, ": ", then printable text */
    json = 3    /* handled like cat by __hexdump */
};

/* Currently selected output mode; zero-initialised, which equals cat. */
int mode;

/*
 * Print one row of a dump to stdout, formatted according to the global `mode`.
 *
 * buffer : bytes to print
 * index  : number of bytes in buffer to print
 * width  : nominal row width; in `both` mode one tab is printed for every
 *          byte the row is short of this width
 *
 * hex/both print each byte as two hex digits plus a space; cat/both/json
 * print each byte as a character, with '.' standing in for anything outside
 * the range 32..126. A newline always ends the row.
 */
void __hexdump(unsigned char *buffer, unsigned long index, unsigned long width)
{
    const int want_hex = (mode == hex || mode == both);
    const int want_text = (mode == cat || mode == both || mode == json);
    unsigned long pos;

    if (want_hex) {
        for (pos = 0; pos != index; ++pos) {
            printf("%02x ", buffer[pos]);
        }
    }

    if (mode == both) {
        /* pad short rows so the text column stays aligned */
        unsigned long pad = index;
        while (pad < width) {
            printf("\t");
            ++pad;
        }
        printf(": ");
    }

    if (want_text) {
        for (pos = 0; pos != index; ++pos) {
            const unsigned char byte = buffer[pos];
            const int printable = (byte >= 32 && byte < 127);

            if (printable) {
                printf("%c", byte);
            } else {
                printf(".");
            }
        }
    }

    printf("\n");
}

/*
 * Dump a slice of a NUL-terminated string, `width` bytes per row.
 *
 * infile : NUL-terminated data; scanning ends at the first NUL byte
 * start  : index of the first byte to dump
 * stop   : index of the last byte to dump (inclusive)
 * width  : number of bytes per output row; must be non-zero
 *
 * Each completed row, and a final partial row if any, is handed to
 * __hexdump().
 *
 * Returns 0 on success, -1 when infile is NULL, width is 0, or the row
 * buffer cannot be allocated.
 */
int __hexdump_string(char *infile, unsigned long start, unsigned long stop, unsigned long width)
{
    /* A zero-width row cannot hold a byte; storing one would write past a
     * zero-sized allocation. */
    if (infile == NULL || width == 0) {
        return -1;
    }

    unsigned char *byte_buffer = malloc(width);
    if (byte_buffer == NULL) {
        printf("Could not allocate memory for byte_buffer\n");
        return -1;
    }

    unsigned long bb_index = 0;

    /* Nothing after `stop` is ever dumped, so stop scanning there. */
    for (unsigned long f_index = 0; infile[f_index] != '\0' && f_index <= stop; f_index++) {
        if (f_index < start) {
            continue;
        }
        byte_buffer[bb_index] = (unsigned char)infile[f_index];
        bb_index++;
        if (bb_index >= width) {
            __hexdump(byte_buffer, bb_index, width);
            bb_index = 0;
        }
    }

    /* flush the trailing partial row */
    if (bb_index) {
        __hexdump(byte_buffer, bb_index, width);
    }

    free(byte_buffer);
    return 0;
}

/*
 * builtin__BOM_print: print every field of a struct BOM to stdout.
 *
 * The variable name passed as bom_struct is stringified and used as the
 * label prefix of each line. The BOM bytes are dumped through
 * __hexdump_string() after the global `mode` is switched to `both`
 * (the previous mode is not restored).
 *
 * __hexdump_string() treats its `stop` index as inclusive, so the last index
 * passed is length - 1; a BOM of length 0 prints only a newline.
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement when followed by ';'.
 */
#define builtin__BOM_print(bom_struct) do { \
    printf("%s.is_BOM = %s\n%s.length = %d\n%s.type = %d\n%s.type_as_string = %s\n%s.BOM = ", #bom_struct, (bom_struct).is_BOM?"yes":"no", #bom_struct, (bom_struct).length, #bom_struct, (bom_struct).type, #bom_struct, (bom_struct).type_as_string, #bom_struct); \
    mode = both; \
    if ((bom_struct).length > 0) { \
        __hexdump_string((bom_struct).BOM, 0, (unsigned long)(bom_struct).length - 1, 5); \
    } else { \
        printf("\n"); \
    } \
} while (0)


struct BOM builtin__BOM_get(char * string) {
struct BOM bom;
ifbom(bom, true, 3, 1, "UTF-8", "^\xef\xbb\xbf", string)
elifbom(bom, true, 2, 2, "UTF-16 (BE)", "^\xfe\xff", string)
elifbom(bom, true, 2, 3, "UTF-16 (LE)", "^\xff\xfe", string)
elifbom(bom, true, 4, 4, "UTF-32 (BE)", "^\x00\x00\xfe\xff", string)
elifbom(bom, true, 4, 5, "UTF-32 (LE)", "^\xff\xfe\x00\x00", string)
elifbom(bom, true, 5, 6, "UTF-7", "^\x2b\x2f\x76\x38\x3d", string)
elifbom(bom, true, 4, 7, "UTF-7", "^\x2b\x2f\x76\x38", string)
elifbom(bom, true, 4, 8, "UTF-7", "^\x2b\x2f\x76\x39", string)
elifbom(bom, true, 4, 9, "UTF-7", "^\x2b\x2f\x76\x2b", string)
elifbom(bom, true, 4, 10, "UTF-7", "^\x2b\x2f\x76\x2f", string)
elifbom(bom, true, 3, 11, "UTF-1", "^\xf7\x64\x4c", string)
elifbom(bom, true, 4, 12, "UTF-EBCDIC", "^\xdd\x73\x66\x73", string)
elifbom(bom, true, 3, 13, "SCSU", "^\x0e\xfe\xff", string)
elifbom(bom, true, 3, 14, "BOCU-1", "^\xfb\xee\x28", string)
elifbom(bom, true, 4, 15, "GB-18030", "^\x84\x31\x95\x33", string)
elbom(bom, false, 0, 0, "Not present", "Not present")

return (struct BOM) bom;
}

/*
 * Demo entry point: probe the literal "test" for a BOM and print the
 * resulting struct BOM.
 */
int main(void)
{
    struct BOM t = builtin__BOM_get("test");
    builtin__BOM_print(t);
    return 0;
}

最佳答案

您应该先读取文件开头的几个字节,以判断 BOM 是否存在。

  • 如果前 4 个字符是 FF FE 00 00 : little endian UTF-32
  • 否则如果前 2 个字符是 FF FE : little endian UTF-16
  • 否则如果前 4 个字符是 00 00 FE FF : big endian UTF-32
  • 否则如果前 2 个字符是 FE FF : big endian UTF-16
  • 否则如果 3 个第一个字符是 EF BB BF : UTF-8
  • 等...

根据 BOM 长度,您知道实际文件数据从哪个索引开始。

您可以在维基百科页面上找到更完整的 BOM 列表:https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding

关于C检查x是否在文件开头时如何跳过BOM,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/51518244/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com