gpt4 book ai didi

c - 使用只有一个参数的字符编码函数

转载 作者:行者123 更新时间:2023-11-30 20:22:46 26 4
gpt4 key购买 nike

经过多次搜索,我在这里找到了最符合我需求的函数。

这是代码:

/* Map a Unicode code point to its ISO-8859-15 (Latin-9) byte value.
 *
 * Returns 0..255 when the code point exists in ISO-8859-15, and 256 when
 * it does not.
 *
 * ISO-8859-15 is identical to ISO-8859-1 (whose byte values equal
 * U+0000..U+00FF) except at eight positions.  The characters Latin-9 put
 * at those positions map to the corresponding bytes; the eight Latin-1
 * characters they displaced do not exist in Latin-9 and are reported as
 * unrepresentable.  Without that, U+00A4 (currency sign) and U+20AC (Euro
 * sign) would both come out as byte 0xA4.
 */
static inline unsigned int to_latin9(const unsigned int code)
{
    switch (code) {
    /* Characters that ISO-8859-15 introduced. */
    case 0x20ACU: return 0xA4U; /* Euro sign */
    case 0x0160U: return 0xA6U; /* S with caron */
    case 0x0161U: return 0xA8U; /* s with caron */
    case 0x017DU: return 0xB4U; /* Z with caron */
    case 0x017EU: return 0xB8U; /* z with caron */
    case 0x0152U: return 0xBCU; /* OE ligature */
    case 0x0153U: return 0xBDU; /* oe ligature */
    case 0x0178U: return 0xBEU; /* Y with diaeresis */

    /* Latin-1 characters displaced by the ones above: not in Latin-9. */
    case 0x00A4U: /* currency sign */
    case 0x00A6U: /* broken bar */
    case 0x00A8U: /* diaeresis */
    case 0x00B4U: /* acute accent */
    case 0x00B8U: /* cedilla */
    case 0x00BCU: /* one quarter */
    case 0x00BDU: /* one half */
    case 0x00BEU: /* three quarters */
        return 256U;

    default:
        /* Every other code point below U+0100 has the same value in both. */
        return (code < 256U) ? code : 256U;
    }
}

/* Convert a UTF-8 byte sequence to ISO-8859-15 (Latin-9).
 *
 * output  Destination buffer.  Must have room for (length + 1) bytes,
 *         because a terminating NUL byte is always written.
 * input   UTF-8 bytes to convert; does not need to be NUL-terminated.
 * length  Number of bytes to read from input.
 *
 * Returns the number of bytes stored in output, not counting the NUL.
 *
 * Code points for which to_latin9() returns 256 are dropped.
 * Malformed input is ignored:
 *   - stray continuation bytes (0x80..0xBF) and the bytes 0xFE/0xFF are
 *     skipped one at a time;
 *   - a lead byte that is not followed by the continuation bytes it
 *     announces is skipped on its own, so valid bytes after it are kept;
 *   - overlong encodings (a code point encoded in more bytes than it
 *     needs, e.g. C0 80 for NUL) are dropped;
 *   - a multi-byte sequence cut off by the end of the input ends the
 *     conversion.
 *
 * Note: output == input is allowed, because every output byte is written
 * only after the input bytes it was derived from have been read and the
 * write position never passes the read position.
 * input < output < input + length is not allowed.
 */
size_t utf8_to_latin9(char *const output, const char *const input, const size_t length)
{
    /* min_code[n] is the smallest code point that needs an n-byte sequence;
     * a smaller decoded value means the encoding is overlong. */
    static const unsigned int min_code[7] = {
        0U, 0U, 0x80U, 0x800U, 0x10000U, 0x200000U, 0x4000000U
    };

    unsigned char *out = (unsigned char *)output;
    const unsigned char *in = (const unsigned char *)input;
    const unsigned char *const end = in + length;

    while (in < end) {
        const unsigned int lead = *in;
        unsigned int code;
        size_t need;
        size_t i;

        if (lead < 0x80U) {             /* 0xxxxxxx: plain ASCII */
            *out++ = *in++;
            continue;
        }

        if (lead < 0xC0U) {             /* 10xxxxxx: stray continuation byte */
            in++;
            continue;
        } else if (lead < 0xE0U) {      /* 110xxxxx */
            need = 2;
            code = lead & 0x1FU;
        } else if (lead < 0xF0U) {      /* 1110xxxx */
            need = 3;
            code = lead & 0x0FU;
        } else if (lead < 0xF8U) {      /* 11110xxx */
            need = 4;
            code = lead & 0x07U;
        } else if (lead < 0xFCU) {      /* 111110xx */
            need = 5;
            code = lead & 0x03U;
        } else if (lead < 0xFEU) {      /* 1111110x */
            need = 6;
            code = lead & 0x01U;
        } else {                        /* 0xFE and 0xFF never occur in UTF-8 */
            in++;
            continue;
        }

        /* Compare remaining length instead of forming in + need, which
         * could point beyond one-past-the-end of the buffer. */
        if ((size_t)(end - in) < need) {
            break;                      /* sequence truncated by end of input */
        }

        for (i = 1; i < need; i++) {
            if ((in[i] & 0xC0U) != 0x80U) {
                break;
            }
            code = (code << 6) | (in[i] & 0x3FU);
        }
        if (i < need) {
            /* Bad continuation byte: drop only the lead byte and resume
             * at the next byte so that valid data after it is not lost. */
            in++;
            continue;
        }

        in += need;

        if (code >= min_code[need]) {   /* reject overlong encodings */
            code = to_latin9(code);
            if (code < 256U) {
                *out++ = (unsigned char)code;
            }
        }
    }

    /* Terminate the output string. */
    *out = '\0';

    return (size_t)(out - (unsigned char *)output);
}

这工作完美。

但是这个函数需要一个 buffer_input,并把结果写入第二个缓冲区 buffer_output。为了优化,我希望只使用一个 buffer_input,让函数直接返回修改后的 buffer_input。

不知道可不可以。我没有足够的知识来自己做这件事(我很难理解这段代码)。

很难编辑此代码来实现我想要实现的目标吗?

目前我这样使用:

utf8_to_latin9(buffer_output, buffer_input, length)

编辑后我想这样使用:

utf8_to_latin9(buffer_input, buffer_input, length)

utf8_to_latin9(buffer_input, length)

最佳答案

我没有检查(或更正)您的代码是否存在其他问题,只是将其修改为就地(in-place)转换。请注意我在评论中指出的 utf8_to_latin9() 转换错误。

转换为就地操作非常简单:

切换到一个缓冲区参数,使 out 指向输入缓冲区的开头(而不是单独的输出缓冲区):

< size_t utf8_to_latin9(char *const output, const char *const input, const size_t length)
< {
< unsigned char *out = (unsigned char *)output;
----
> size_t utf8_to_latin9(char *const input, const size_t length)
> {
> unsigned char *out = (unsigned char *)input;

在转换结束时,根据输入缓冲区(而不是单独的输出缓冲区)计算返回值:

< return (size_t)(out - (unsigned char *)output);
----
> return (size_t)(out - (unsigned char *)input);

由于 UTF-8 输入缓冲区的消耗速度不可能慢于 Latin9 输出缓冲区的填充速度,因此这样做是相当安全的。

<hr/>

我的测试main():

/* Demo driver: converts a short UTF-8 sample in place and prints the first
 * nine bytes of the buffer in hexadecimal, separated by spaces. */
int main()
{
    /* UTF-8 for U+00A4 (2 bytes), U+20AC (3 bytes) and one 4-byte sequence
     * (described as "some kanji" in the original note). */
    static const char sample[] = "\xc2\xa4\xe2\x82\xac\xf0\xaf\xa4\xa9";
    char input[256] = {0};
    int idx;

    strcpy(input, sample);
    utf8_to_latin9(input, 9);

    /* The first eight bytes are followed by a space, the ninth by a newline. */
    for (idx = 0; idx < 8; idx++) {
        printf("%hhx ", input[idx]);
    }
    printf("%hhx\n", input[8]);

    return 0;
}

输出:

a4 a4 0 82 ac f0 af a4 a9

这就是我所期望的(展示我评论过的转换问题)。

关于c - 使用只有一个参数的字符编码函数,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/38589590/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com