gpt4 book ai didi

c - 从命令行参数获取 unicode 符号

转载 作者:太空宇宙 更新时间:2023-11-04 04:12:13 30 4
gpt4 key购买 nike

我通过命令行参数将 unicode 符号传递给程序。

$ ./program ●

程序应该输出这个符号的 Unicode 码点(code point)。

#include <stdio.h>  

/*
 * The asker's original attempt: print the numeric code of the symbol passed
 * as the first command-line argument.
 *
 * It does not decode anything: the bytes stored at argv[1] are reinterpreted
 * in place as a wchar_t, so the printed number is built from the raw encoded
 * bytes of the argument rather than from the character's code point.
 *
 * NOTE(review): argc is never checked, so argv[1] may be NULL when no
 * argument is given.
 * NOTE(review): only <stdio.h> is included; wchar_t presumably needs
 * <stddef.h> or <wchar.h> to be declared - confirm.
 */
int main(int argc, char *argv[])
{
wchar_t glyph;

/* Reads sizeof(wchar_t) bytes starting at argv[1], whatever the string's
   actual length is; no multibyte-to-wide conversion takes place. */
glyph = *((wchar_t *) argv[1]);
/* NOTE(review): "%u" expects unsigned int; glyph is a wchar_t. */
printf("%u\n", glyph);
}

● 符号的代码为 9679(十六进制 25cf),但程序返回 9410530。argv[1] 参数的长度是 3 个字节,而不是 4 个字节(unicode 符号是 32 位),它包含 8f 97 e2 \0 字节。如何正确获取符号代码?

最佳答案

使用 mbstowcs() 将 UTF-8 编码字符从多字节字符串转换为宽字符的解决方案。

#include <stdio.h>
#include <stdlib.h>
#include <locale.h>

/*
 * Print the numeric value of the first character of argv[1].
 *
 * The argument is a multibyte string in the encoding of the current locale
 * (taken from the environment); mbstowcs() converts its first character to a
 * wide character, whose value is printed in decimal.
 *
 * Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE when the argument
 * is missing, the locale cannot be set, or the argument is not a valid
 * multibyte sequence in that locale.
 */
int main(int argc, char *argv[])
{
    wchar_t u;

    /* Without this check argv[1] is NULL and mbstowcs() would dereference it. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <character>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "program");
        exit(EXIT_FAILURE);
    }

    /* Set locale according to the environment variables */
    if (setlocale(LC_ALL, "") == NULL) {
        /* setlocale() is not specified to set errno, so perror() would
           print an unrelated message here. */
        fputs("setlocale: cannot set locale from the environment\n", stderr);
        exit(EXIT_FAILURE);
    }

    /* Convert the multibyte character string in argv[1] to a
       wide character (at most one wide character is stored) */
    if (mbstowcs(&u, argv[1], 1) == (size_t) -1) {
        perror("mbstowcs");
        exit(EXIT_FAILURE);
    }

    /* wchar_t has an implementation-defined type; convert explicitly so the
       argument matches the format specifier. */
    printf("%lu\n", (unsigned long) u);
    return EXIT_SUCCESS;
}

另一种解决方案是手动解码 UTF-8 字符。代码是从 st (suckless terminal emulator) 导入的。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

/* Code point stored in the output rune when decoding or validation fails
   (U+FFFD). */
#define UTF_INVALID 0xFFFD
/* Largest sequence length, in bytes, that the decoder accepts. */
#define UTF_SIZ 4

typedef unsigned char uchar;
/* A decoded code point. */
typedef uint_least32_t Rune;

/* Number of elements in array a (must be a real array, not a pointer). */
#define LEN(a) (sizeof(a) / sizeof(a)[0])
/* True when a <= x <= b; note that x is evaluated twice. */
#define BETWEEN(x, a, b) ((a) <= (x) && (x) <= (b))

/*
 * Lookup tables indexed by byte class: index 0 describes a continuation
 * byte, index n (1..UTF_SIZ) the lead byte of an n-byte sequence.
 *
 * A byte b belongs to class k when (b & utfmask[k]) == utfbyte[k]; the bits
 * outside utfmask[k] are the payload. utfmin[k]/utfmax[k] bound the code
 * points accepted for a sequence of length k.
 */
static uchar utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
static Rune utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
static Rune utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

Rune
utf8decodebyte(char c, size_t *i)
{
for (*i = 0; *i < LEN(utfmask); ++(*i))
if (((uchar)c & utfmask[*i]) == utfbyte[*i])
return (uchar)c & ~utfmask[*i];

return 0;
}

/*
 * Check a decoded rune against the limits for its encoded length.
 *
 * u: in/out; replaced by UTF_INVALID when the value lies outside
 *    utfmin[i]..utfmax[i] or inside 0xD800..0xDFFF.
 * i: length in bytes of the sequence the rune was decoded from; used as an
 *    index into utfmin/utfmax.
 *
 * Returns the smallest index k >= 1 for which the (possibly replaced) rune
 * does not exceed utfmax[k].
 */
size_t
utf8validate(Rune *u, size_t i)
{
    const int out_of_range = *u < utfmin[i] || *u > utfmax[i];
    const int surrogate = *u >= 0xD800 && *u <= 0xDFFF;
    size_t need = 1;

    if (out_of_range || surrogate)
        *u = UTF_INVALID;

    while (*u > utfmax[need])
        need++;

    return need;
}

/*
 * Decode one UTF-8 sequence from the start of a buffer.
 *
 * c:    buffer holding the encoded bytes.
 * u:    out-parameter; receives the decoded rune, or UTF_INVALID when the
 *       sequence cannot be decoded or fails validation.
 * clen: number of bytes available in c.
 *
 * Returns:
 *   0        when clen is 0 or the buffer ends before the sequence does;
 *   1        when the first byte is not a valid lead byte;
 *   k        (offset of the offending byte) when byte k is not a
 *            continuation byte;
 *   the sequence length otherwise.
 */
size_t
utf8decode(const char *c, Rune *u, size_t clen)
{
    size_t seqlen, kind, pos;
    Rune acc;

    *u = UTF_INVALID;
    if (clen == 0)
        return 0;

    acc = utf8decodebyte(c[0], &seqlen);
    if (seqlen < 1 || seqlen > UTF_SIZ)
        return 1;

    /* Fold each following byte's 6 payload bits into the accumulator. */
    pos = 1;
    while (pos < clen && pos < seqlen) {
        acc = (acc << 6) | utf8decodebyte(c[pos], &kind);
        if (kind != 0)
            return pos;
        pos++;
    }

    /* Ran out of input before the sequence was complete. */
    if (pos < seqlen)
        return 0;

    *u = acc;
    utf8validate(u, seqlen);

    return seqlen;
}


/*
 * Print, in decimal, the code point of the first UTF-8 character in argv[1].
 *
 * When the argument does not start with a decodable sequence, utf8decode()
 * leaves UTF_INVALID in the rune and that value is printed.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE when no argument was given.
 */
int main(int argc, char *argv[])
{
    Rune u;

    /* Without this check argv[1] is NULL and utf8decode() would read it. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <character>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    /* UTF_SIZ is only an upper bound on the bytes examined: the string's
       terminating NUL is not a continuation byte, so utf8decode() stops
       there and never reads beyond it. */
    utf8decode(argv[1], &u, UTF_SIZ);

    /* Rune is uint_least32_t, which need not be unsigned int; convert
       explicitly so the argument matches the format specifier. */
    printf("%lu\n", (unsigned long) u);
    return EXIT_SUCCESS;
}

关于c - 从命令行参数获取 unicode 符号,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/55870029/

30 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com