gpt4 book ai didi

c++ - Quex:为标识符定义 UTF8 正则表达式

转载 作者:行者123 更新时间:2023-11-30 05:13:34 25 4
gpt4 key购买 nike

我正在把一个以前基于 Quex 0.64.8 的 Ecmascript 引擎升级到 Quex 0.67.5。词法分析器已经可以运行,但它现在似乎只能识别 ANSI token,而不能像以前那样识别 UTF-8 token。

本质上,我所做的是提供 --codec utf8 标志,同时运行 Quex 并使用以下代码来识别标识符:

PATTERN_IDSTART  [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]

PATTERN_IDPART {PATTERN_IDSTART}|{PATTERN_DIGIT}

PATTERN_ID {PATTERN_IDSTART}{PATTERN_IDPART}*

我的想法是:与其逐一指定所有允许的字符,不如反过来定义哪些字符是不允许的,并把这些字符排除在外。新的词法分析器可以正常检测诸如“test1”或“safari”之类的标识符,但似乎对“日本语”和“Örjan”有问题。我只使用 UTF-8,没有使用 ICU 或 Iconv。

感觉好像我在这里误解了什么。任何帮助解决这个问题将不胜感激。

编辑:

了解我使用以下参数运行 Quex 可能会有用:

-i ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx
--analyzer-class ecmascript_lexer
--foreign-token-id-file ${BISON_ECMASCRIPT_PARSER_OUTPUT_HEADER}
--token-id-prefix TOK_
--template-compression
--codec utf8 //--encoding utf8 since Quex 0.67.5
--buffer-element-size 1
--buffer-element-type char
--odir ${CMAKE_CURRENT_BINARY_DIR}/generated
--language c++
--warning-on-outrun

编辑 2:

由于 UTF-8 解析在小示例中是正常工作的,我没能构造出一个小的复现示例。因此我把我的 ecmascript 引擎中词法分析器的部分做成了一个独立版本,希望这样更容易看出问题所在。

我已经不确定我的问题是否真的与解析 UTF-8 token 有关,很可能是我的 .qx 文件中某处出了问题……无论如何,下面是我的 ecmascript 词法分析器的独立版本。

CMakeLists.txt

# Build script for the standalone ecmascript lexer example.
#
# NOTE: find_program(... REQUIRED) / find_path(... REQUIRED) only exist since
# CMake 3.18; with the previous minimum (2.8) the REQUIRED keyword was silently
# consumed as an extra candidate name and missing Quex went undetected.
cmake_minimum_required(VERSION 3.18)

project(ecmascript CXX)

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)

# Candidate names for the Quex generator: a .bat wrapper on a Windows host,
# the plain python entry point elsewhere, with "quex" as common fallback.
set(QUEX_NAMES "quex")
if(CMAKE_HOST_WIN32)
  list(PREPEND QUEX_NAMES "quex.bat")
else()
  list(PREPEND QUEX_NAMES "quex-exe.py")
endif()

find_program(QUEX NAMES ${QUEX_NAMES} REQUIRED
  HINTS ENV QUEX_PATH
  DOC "Path to Quex's executable."
  NO_DEFAULT_PATH)

# Quex's include tree is the directory that contains quex/core.py.
find_path(QUEX_INCLUDE_DIR quex/core.py REQUIRED
  HINTS ENV QUEX_PATH
  DOC "Path to Quex's include directory"
  NO_DEFAULT_PATH)

file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/generated)

set(ECMASCRIPT_LEXER ${CMAKE_CURRENT_BINARY_DIR}/generated/ecmascript_lexer)

# Generate the lexer sources from ecmascript.qx with Quex.
add_custom_command(OUTPUT ${ECMASCRIPT_LEXER}
                          ${ECMASCRIPT_LEXER}.cpp
                          ${ECMASCRIPT_LEXER}-token
                          ${ECMASCRIPT_LEXER}-configuration
                          ${ECMASCRIPT_LEXER}-token_ids
  COMMAND ${QUEX} -i ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx
          --analyzer-class ecmascript_lexer # name of the generated lexer class
          --foreign-token-id-file ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript_yacc.hpp # token ids are generated by bison
          --token-id-prefix TOK_ # custom token prefix (see ecmascript.y for details)
          --template-compression # optimize mode transitions when possible
          --encoding utf8 # base the lexer on the UTF-8 encoding
          --buffer-element-size 1 # buffer elements are one byte wide
          --buffer-element-type uint8_t
          --odir ${CMAKE_CURRENT_BINARY_DIR}/generated # where generated files are placed
          --language c++
  DEPENDS ecmascript.qx VERBATIM
  COMMENT "Generating ecmascript lexer..."
  MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx) # controls the order Quex and Bison run in

add_executable(es_lexer
  ${ECMASCRIPT_LEXER}
  ${ECMASCRIPT_LEXER}.cpp
  _main.cpp
  ecmascript_yacc.hpp)

# Target-scoped usage requirements instead of directory-wide
# include_directories()/add_definitions().
target_include_directories(es_lexer PRIVATE
  ${QUEX_INCLUDE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_BINARY_DIR}/generated)

target_compile_definitions(es_lexer PRIVATE
  QUEX_OPTION_LINE_NUMBER_COUNTING
  QUEX_OPTION_ASSERTS_DISABLED
  QUEX_SETTING_BUFFER_SIZE=1024 # lexer buffer size
  $<$<BOOL:${MSVC}>:_CRT_SECURE_NO_WARNINGS>)

ecmascript.qx

// Pasted verbatim into the generated lexer header: bison token ids plus the
// byte values for the escape sequences expanded by the StringHelper mode.
header {
#include <quex/code_base/extra/accumulator/Accumulator>

#include "ecmascript_yacc.hpp"
#include <cstdlib>
#include <cstdio>

#define BACKSPACE '\x08'
#define TAB '\x09'
#define NEWLINE '\x0A'
#define VERTICALTAB '\x0B'
#define FORMFEED '\x0C'
#define CARRIAGERETURN '\x0D'
#define DOUBLEQUOTE '\x22'
#define SINGLEQUOTE '\x27'
#define DOUBLEBACKSLASH '\x5C'
#define NULLTERM '\x00'
}

footer {
#include <quex/code_base/extra/accumulator/Accumulator.i>
}

// Named regular-expression patterns referenced by the modes below.
define {
PATTERN_NEWLINE [\n\r]

// Decimal number building blocks (no leading zeros in integer literals).
PATTERN_DIGIT [0-9]
PATTERN_NOZDIGIT [1-9]
PATTERN_DECINTLIT "0"|{PATTERN_NOZDIGIT}{PATTERN_DIGIT}*
PATTERN_EXPIND "e"|"E"
PATTERN_SIGNEDINT {PATTERN_DIGIT}+|"+"{PATTERN_DIGIT}+|"-"{PATTERN_DIGIT}+
PATTERN_EXPPART {PATTERN_EXPIND}{PATTERN_SIGNEDINT}

PATTERN_DECNUMBER {PATTERN_DECINTLIT}"."{PATTERN_DIGIT}*{PATTERN_EXPPART}?|"."{PATTERN_DIGIT}+{PATTERN_EXPPART}?|{PATTERN_DECINTLIT}{PATTERN_EXPPART}?

PATTERN_HEXDIGIT [0-9a-fA-F]
PATTERN_HEXNUMBER "0x"{PATTERN_HEXDIGIT}+|"0X"{PATTERN_HEXDIGIT}+

// NOTE(review): PATTERN_UNIESCSEQ and PATTERN_STRING are not referenced by
// any mode below; strings are scanned by the dedicated string modes.
PATTERN_UNIESCSEQ \\u{PATTERN_HEXDIGIT}{4}

PATTERN_STRING "\""(\\"\""|[^"])*"\""

PATTERN_DOUBLE_QUOTE_STRING_DELIMITER "\""

PATTERN_SINGLE_QUOTE_STRING_DELIMITER "'"

PATTERN_SINGLELINE_COMMENT "//"[^\n\r]*

// Identifier start: anything NOT in this set of ASCII delimiters and digits.
// NOTE(review): this is the pattern under discussion -- whether the negated
// set spans the full Unicode range depends on the Quex version and encoding.
PATTERN_IDSTART [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]

PATTERN_IDPART {PATTERN_IDSTART}|{PATTERN_DIGIT}

PATTERN_ID {PATTERN_IDSTART}{PATTERN_IDPART}*
}

// Base mode: shared end-of-stream handling inherited by all concrete modes.
mode EOF : <inheritable: only> {
on_end_of_stream {
self_send(TOK_LINETERM);

self_send(TOK_TERMINATION);
}
}

// After 'return'/'throw'/'break'/'continue': a newline forces an automatic
// semicolon (ECMAScript restricted productions); anything else re-scans in
// Program mode.
mode RestrictedProduction : EOF
<skip: [ \t]>
{
{PATTERN_NEWLINE}{
self_send(';');
self << Program;
}

on_failure {
self.undo();
self << Program;
}
}

// Shared string-literal scanning: accumulates text and expands escape
// sequences; the concrete modes only add their closing delimiter rule.
mode StringHelper : EOF
<inheritable: only>
{
on_entry {
self_send(TOK_QUOTE);
}

on_exit {
// Flush any accumulated literal text before the closing quote token.
if(self.accumulator.text.begin != self.accumulator.text.end)
self_send(TOK_STRLITPART);

self_accumulator_flush(TOK_QUOTE);
}

{PATTERN_NEWLINE} => '\n';

"\\b" { self_accumulator_add_character(BACKSPACE); }

"\\t" { self_accumulator_add_character(TAB); }



"\\n" { self_accumulator_add_character(NEWLINE); }

"\\v" { self_accumulator_add_character(VERTICALTAB); }

"\\f" { self_accumulator_add_character(FORMFEED); }

"\\r" { self_accumulator_add_character(CARRIAGERETURN); }

"\\\"" { self_accumulator_add_character(DOUBLEQUOTE); }

"\\'" { self_accumulator_add_character(SINGLEQUOTE); }

"\\\\" { self_accumulator_add_character(DOUBLEBACKSLASH); }

"\\0" { self_accumulator_add_character(NULLTERM); }

// Hex escape \xNN: parse the two hex digits and append raw bytes.
// NOTE(review): appends two bytes of the unsigned long's representation --
// endianness-dependent; confirm this is intentional.
"\\x"{PATTERN_HEXDIGIT}{2}
{
{
unsigned long ulResult = strtoul(reinterpret_cast<char*>(Lexeme+2),0,16);
uint8_t *const pBuffer = reinterpret_cast<uint8_t*>(&ulResult);
self_accumulator_add(pBuffer,pBuffer+2);
}
}

// Any other character belongs to the string literal verbatim.
on_failure {
self_accumulator_add(Lexeme, LexemeEnd);
}
}

mode SingleQuoteString : StringHelper
{
{PATTERN_SINGLE_QUOTE_STRING_DELIMITER}
{
// Found the end of the string: switch back to Program mode.
self << Program;
}
}

mode DoubleQuoteString : StringHelper
{
{PATTERN_DOUBLE_QUOTE_STRING_DELIMITER}
{
// Found the end of the string: switch back to Program mode.
self << Program;
}
}

// Disambiguates '++'/'--': entered from Operators, rewinds and re-scans to
// decide between a prefix operator and a newline-separated statement.
mode PrefixHelper : EOF
<skip: [ \t]> // Ignore whitespace
{
on_entry {
// NOTE(review): rewinds 3 positions before re-scanning -- presumably the
// two operator characters plus one preceding character; confirm.
self.seek_backward(3);
}

{PATTERN_NEWLINE}
{
if(self.iParaCount == 0)
self_send(';');
}

"++"
{
self_send(TOK_PLUSPLUS);
self << Program;
}

"--"
{
self_send(TOK_MINUSMINUS);
self << Program;
}

on_failure {
(void)Lexeme;
}
}

// Operator patterns; longer operators are listed before their prefixes.
mode Operators : <inheritable: only>
{
"||" => TOK_OR;
"&&" => TOK_AND;
"++" { self << PrefixHelper; }
"--" { self << PrefixHelper; }
"===" => TOK_EQEQEQ;
"==" => TOK_EQEQ;
"!==" => TOK_NEQEQ;
"!=" => TOK_NEQ;
"*=" => TOK_MULTEQ;
"/=" => TOK_DIVEQ;
"%=" => TOK_MODEQ;
"+=" => TOK_PLUSEQ;
"\-=" => TOK_MINUSEQ;
">>>=" => TOK_GTGTGTEQ;
">>>" => TOK_GTGTGT;
"<<=" => TOK_LTLTEQ;
">>=" => TOK_GTGTEQ;
"<<" => TOK_LTLT;
">>" => TOK_GTGT;
"<=" => TOK_LTE;
">=" => TOK_GTE;
"&=" => TOK_AMPEQ;
"^=" => TOK_CIRCEQ;
"|=" => TOK_PIPEEQ;

// Single-character operators; '(' and ')' also maintain the parenthesis
// nesting counter used for newline/semicolon decisions.
['='] => '=';
['!'] => '!';
['('] { self_send('('); ++self.iParaCount; }
['+'] => '+';
['\-'] => '-';
['*'] => '*';
['/'] => '/';
['%'] => '%';
['<'] => '<';
['>'] => '>';
['\['] => '[';
['\]'] => ']';
['.'] => '.';
[','] => ',';
['?'] => '?';
[':'] => ':';
['~'] => '~';
['&'] => '&';
['^'] => '^';
['|'] => '|';
['{'] => '{';
[';'] => ';';
[')'] { self_send(')'); --self.iParaCount; }
['}'] { self_send(TOK_LINETERM); self_send('}'); }
}

// ECMAScript keywords; the restricted-production keywords switch into
// RestrictedProduction after sending their token.
mode Keywords : <inheritable: only>
{
function => TOK_FUNCTION;
return { self_send(TOK_RETURN); self << RestrictedProduction; }
var => TOK_VAR;
null => TOK_NULL;
true => TOK_TRUE;
false => TOK_FALSE;
instanceof => TOK_INSTANCEOF;
in => TOK_IN;
delete => TOK_DELETE;
void => TOK_VOID;
typeof => TOK_TYPEOF;
this => TOK_THIS;
if => TOK_IF;
else => TOK_ELSE;
with => TOK_WITH;
throw { self_send(TOK_THROW); self << RestrictedProduction; }
try => TOK_TRY;
catch => TOK_CATCH;
finally => TOK_FINALLY;
for => TOK_FOR;
break { self_send(TOK_BREAK); self << RestrictedProduction; }
continue { self_send(TOK_CONTINUE); self << RestrictedProduction; }
while => TOK_WHILE;
do => TOK_DO;
switch => TOK_SWITCH;
case => TOK_CASE;
default => TOK_DEFAULT;
new => TOK_NEW;
synchronized => TOK_SYNCHRONIZED;
}

// Numeric and string literal recognition.
mode Values : <inheritable: only>
{
{PATTERN_DECNUMBER} => TOK_DECLIT(Lexeme);

{PATTERN_HEXNUMBER} => TOK_HEXINTLIT(Lexeme);

// A quote delimiter switches into the matching string-scanning mode.
{PATTERN_DOUBLE_QUOTE_STRING_DELIMITER} { self << DoubleQuoteString; }

{PATTERN_SINGLE_QUOTE_STRING_DELIMITER} { self << SingleQuoteString; }
}

mode Identifiers : <inheritable: only>
{
{PATTERN_ID} => TOK_ID(Lexeme);
}

// Entry mode: combines keywords, identifiers, values and operators; skips
// whitespace and block comments.
mode Program : Keywords,
Identifiers,
Values,
Operators,
EOF
<skip: [ \t]>
<skip_range: "/*" "*/">
{
{PATTERN_NEWLINE}
{
// Outside parentheses a newline is a potential statement terminator.
if(self.iParaCount == 0)
self_send(TOK_LINETERM);
}

{PATTERN_SINGLELINE_COMMENT}
{}
}

// Extra members injected into the generated lexer class.
body {

// Remember the current and previous token ids (used for automatic
// semicolon insertion decisions by the driver).
void push_token(const unsigned int uiToken)
{
self.uiLastToken = self.uiCurrentToken;
self.uiCurrentToken = uiToken;
}

// True when the previously pushed token was a line terminator.
bool use_auto_semi() const
{ return uiLastToken == TOK_LINETERM; }

unsigned int uiLastToken,
uiCurrentToken;

// Parenthesis nesting depth; newlines inside parentheses do not
// terminate statements (see Program/PrefixHelper modes).
int iParaCount;

// Last identifier token seen; set by the driver (see _main.cpp).
quex::Token* pLastID;

QUEX_NAME(Accumulator) accumulator;
}

constructor {
self.uiLastToken = 0;
self.uiCurrentToken = 0;
self.iParaCount = 0;
self.pLastID = 0;

if(!QUEX_NAME(Accumulator_construct)(&me->accumulator, me)) {
return false;
}
}

destructor {
QUEX_NAME(Accumulator_destruct)(&me->accumulator);
}

// Initial mode after construction.
start = Program;

ecmascript_yacc.hpp

   #ifndef YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED
# define YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED

/* Bison-generated token ids; Quex imports these via --foreign-token-id-file,
   so this header is quoted for reference and should not be edited by hand. */

/* Token type. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
enum yytokentype
{
TOK_TERMINATION = 0,
TOK_UNINITIALIZED = 1,
TOK_ID = 258,
TOK_NULL = 259,
TOK_TRUE = 260,
TOK_FALSE = 261,
TOK_DECLIT = 262,
TOK_HEXINTLIT = 263,
TOK_OR = 264,
TOK_AND = 265,
TOK_PLUSPLUS = 266,
TOK_MINUSMINUS = 267,
TOK_EQEQ = 268,
TOK_NEQ = 269,
TOK_EQEQEQ = 270,
TOK_NEQEQ = 271,
TOK_LTE = 272,
TOK_GTE = 273,
TOK_INSTANCEOF = 274,
TOK_IN = 275,
TOK_STRLITPART = 276,
TOK_QUOTE = 277,
TOK_VOID = 278,
TOK_TYPEOF = 279,
TOK_DELETE = 280,
TOK_THIS = 281,
TOK_LTLT = 282,
TOK_GTGT = 283,
TOK_GTGTGT = 284,
TOK_MULTEQ = 285,
TOK_DIVEQ = 286,
TOK_MODEQ = 287,
TOK_PLUSEQ = 288,
TOK_MINUSEQ = 289,
TOK_LTLTEQ = 290,
TOK_GTGTEQ = 291,
TOK_GTGTGTEQ = 292,
TOK_AMPEQ = 293,
TOK_CIRCEQ = 294,
TOK_PIPEEQ = 295,
TOK_IF = 296,
TOK_ELSE = 297,
TOK_RETURN = 298,
TOK_VAR = 299,
TOK_WITH = 300,
TOK_THROW = 301,
TOK_TRY = 302,
TOK_CATCH = 303,
TOK_FINALLY = 304,
TOK_FOR = 305,
TOK_BREAK = 306,
TOK_CONTINUE = 307,
TOK_WHILE = 308,
TOK_DO = 309,
TOK_SWITCH = 310,
TOK_CASE = 311,
TOK_DEFAULT = 312,
TOK_NEW = 313,
TOK_FUNCTION = 314,
TOK_SYNCHRONIZED = 315,
TOK_LINETERM = 316
};
#endif

/* Value type. */
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
typedef int YYSTYPE;
# define YYSTYPE_IS_TRIVIAL 1
# define YYSTYPE_IS_DECLARED 1
#endif

#endif /* !YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED */

_main.cpp

#include <iostream>

#include "ecmascript_lexer"

/****************************************************************************************/
// Print one token per line to stdout using its string representation.
void print_token(quex::Token* token)
{
std::cout << token->get_string() << std::endl;
}

/****************************************************************************************/
// Driver: lex the fixed test file "id_test.js" and dump every token until
// TOK_TERMINATION or a lexer error. Token ids are the TOK_* values from the
// bison-generated header.
int main(int argc, char** argv)
{
    (void)argc; (void)argv; // input file name is fixed below

    quex::Token* token = 0;

    // Fix: the original also default-constructed a second, never-used
    // quex::ecmascript_lexer on the stack; removed.
    quex::ecmascript_lexer *lexer = quex::ecmascript_lexer::from_file_name("id_test.js", 0);

    while(lexer->error_code == E_Error_None)
    {
get_token:
        lexer->receive(&token);

        if(!token)
            break;

        print_token(token);

        // Remember current/previous token for automatic-semicolon handling
        // (see push_token in ecmascript.qx).
        lexer->push_token(token->type_id());

        // Synthetic line terminators are bookkeeping only; fetch the next
        // real token without re-checking the error code.
        if(token->type_id() == TOK_LINETERM)
            goto get_token;

        if(token->type_id() == TOK_ID)
            lexer->pLastID = token;

        if(token->type_id() == TOK_TERMINATION)
            break;
    }

    delete lexer;
    return 0;
}

id_test.js//用于测试词法分析器

// Lexer test input: identifiers containing non-ASCII UTF-8 characters.
// (Deliberately not a valid ECMAScript program -- it only feeds the lexer.)
test1 = safari;
myFunc()

function t(){}

if(test1 < 23)
return myFunc(45);

myFunc();

svenskaåäö();

var kalleö = 34;

var _us=kalleö;

_us = 678

日本語 = "Nihongo" // the Japanese language's own name

$myCar = _us

var new1 = kalleö ? t();

"kalleÖ, _us and $myCar should be ignored here"

الفصحى = "Arabiska"

/*
var new1 = kalleÖ ? t();

"kalleÖ, _us and $myCar should be ignored here"
*/

// var new1 = kalleÖ ? t();
대한민국 = 45;
대한민국X45 = "Value of: 대한민국" + 대한민국;

ärta="ärta + 2"

mix帝With대한민국 = "success?";

Örjan;

önes;
cake;

Россия;
РоссияX;
РоссияX
XРоссия;
XРоссия;

始皇帝 = "The First emperor"
始皇帝x2 = "The First emperor, twice?"

最好的问候,

帕特里克·J

最佳答案

我建议您依赖 Unicode 属性 ID_Start 和 ID_Continue,即让您的 .qx 文件包含:

// Identifiers via Unicode properties: Quex samples the UCS database, so no
// individual code points need to be listed.
define {
ID_START \P{ID_Start}
ID_CONTINUE \P{ID_Continue}
ID {ID_START}{ID_CONTINUE}*
}

Quex 然后对 UCS 数据库进行采样,你就不用担心了特定的代码点。

此外,如果您只想支持一个子集,请使用intersection 来剪切出所需的 UCS 范围,如下例所示:

...
ID_START [: intersection([\X900-\X970], \P{ID_Start}) :]
ID_CONTINUE [: intersection([\X900-\X970], \P{ID_Continue}) :]
...

PS,您的解决方案并非完全错误。给定一个文件 example.qx:

// Minimal reproduction: the asker's negated-set identifier pattern.
define {
PATTERN_IDSTART [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]
PATTERN_IDPART {PATTERN_IDSTART}|[0-9]
PATTERN_ID {PATTERN_IDSTART}{PATTERN_IDPART}*
}

token { ID; WS; }

// Single mode: identifiers and whitespace tokens only.
mode X {
{PATTERN_ID} => QUEX_TKN_ID(Lexeme);
[ \n\t] => QUEX_TKN_WS(Lexeme);
}

还有一些用户文件“example.c”:

#include <stdio.h>

#include "EasyLexer.h"

/* Print one token per line using the token's string representation,
   rendered into a fixed-size stack buffer. */
void
print_token(quex_Token* token_p)
{
const size_t BufferSize = 1024;
char buffer[1024];
printf("%s \n", QUEX_NAME_TOKEN(get_string)(token_p, buffer, BufferSize));
}

/* Lex "example.txt" and dump every token until TERMINATION or a lexer
   error is flagged in qlex.error_code. */
int
main(int argc, char** argv)
{
quex_Token* token_p = NULL;
quex_EasyLexer qlex;

quex_EasyLexer_from_file_name(&qlex, "example.txt", NULL);

while( qlex.error_code == E_Error_None ) {
quex_EasyLexer_receive(&qlex, &token_p);
if( ! token_p ) break;

print_token(token_p);
if( token_p->_id == QUEX_TKN_TERMINATION ) break;
}

quex_EasyLexer_destruct(&qlex);
return 0;
}

然后在命令行执行:

> quex -i tmp.qx --encoding utf8 --language C -o EasyLexer
> gcc -I$QUEX_PATH example.c EasyLexer.c -o example
> ./example example.txt

交付

ID 'Örjan' 
WS '\n'
ID '日本語'
WS '\n'

假设文件“example.txt”是 UTF-8 编码的并且包含

Örjan
日本語

我不知道还能说什么。我理解错了吗?

关于c++ - Quex:为标识符定义 UTF8 正则表达式,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43888553/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com