gpt4 book ai didi

javascript - 读取代码点时出现偏移问题

转载 作者:行者123 更新时间:2023-11-30 21:15:38 25 4
gpt4 key购买 nike

简历:我目前正在编写一个将源代码转换为标记的 ActionScript 3 词法分析器。我选择通过代码点来解释输入,一个带有可选代理项对的字符串,该字符串包含在 UString 类中。在后台,我使用 UStringPos 类缓存最后读取的位置。

我测试了它如何扫描标识符 "huehuehue" 与...

'use strict';

import {Lexer} from 'core/Lexer';
import {UString} from 'utils/UString';
import ErrorHandler from 'core/ErrorHandler';

const errorHandler = new ErrorHandler(true);

// Tell the length to the `Lexer` manually.
const lexer = new Lexer(
new UString('huehuehue'), 9, errorHandler);

// Scan first token
lexer.next();

const id = lexer.lookahead.value;

console.log(
id,
id.length
);

它应该记录 "huehuehue", 9,但这是另一个故事...

为什么它缺少最后一个 'e'?与扫描相关的最内层方法是 Lexer#getCommonIdentifier。顺便说一句,我已经测试了我的 UString 部分,它工作正常。

词法分析器相关定义

/*
* Class that turns AS3 code into tokens.
*/
export class Lexer
{
/*
* @param {UString} source
* @param {Number} length
* @param {ErrorHandler} errorHandler
*/
constructor(source, length, errorHandler)
{
this.source = source;
this.length = length;
this.index = 0;
this.lineStart = 0;
this.lineNumber = 1;
this.comments = [];

this.errorHandler = errorHandler;

this.previousToken = null;
this.token = null;
this.lookahead = null;

this._special = [];
}

/*
* Verifies the end of file.
*/
eof()
{
return this.index >= this.length;
}

/*
* Advance the previous, current and lookahead tokens.
* The lexer however does not depend on these tokens.
*/
next()
{
this.previousToken = this.token;
this.token = this.lookahead;
this.lookahead = this.lex();
}

/*
* Consumes the next token and return it.
*/
lex()
{
this.consumeWhiteSpaces();

while (this.consumeComment())
this.consumeWhiteSpaces();

let cp = this.source.codePointAt(this.index);

let pureIdentifier =
Character.isIdentifierStart(cp);

if (pureIdentifier || (cp === 0x5C))
return this.scanIdentifierOrKeyword(!pureIdentifier);

if (this.eof())
{
let loc = [ this.index, this.lineNumber ];
return new Token(TokenType.EOF, loc, loc, '<end>');
}
}

/*
* Scan an identifier, keyword or boolean literal.
*/
scanIdentifierOrKeyword(usingEscape)
{
const start = this.index;
let id;

/* Like Esprima does: only identifiers containing
* escapes need some overheads. */
if (usingEscape)
{
id = this.getEscapedIdentifier(
String.fromCodePoint(this.scanUnicodeEscapeSequence()));
}
else
id = this.getCommonIdentifier();

return new Token(
TokenType.IDENTIFIER,
[ start , this.lineNumber ],
[ this.index, this.lineNumber ],
id
);
}

/*
* Interprets an identifier. If any escape appears, switches to
* getEscapedIdentifier().
*/
getCommonIdentifier()
{
const start = this.source.position.offset;
let cp = 0;

// Jump the starting symbol.
++this.index;

while (!this.eof())
{
cp = this.source.codePointAt(this.index);

if (Character.isIdentifierPart(cp))
++this.index;

// Switches to escape-minded task...
else if (cp === 0x5C)
return this.getUnicodeEscapedIdentifier(
this.source.string.slice(
start, this.source.position.offset
)
);

else break;
}
return this.source.string.slice(
start, this.source.position.offset
);
}

/* ... */
}

utils/UString.js

'use strict';

/*
* String wrapper with methods _based_ on code points.
*/
export class UString
{
/*
* Constructs the {UString}.
*
* @param {String} s String to be wrapped.
*/
constructor(s)
{
/*
* @type {String}
*/
this.string = s;

/*
* Tracks the last accessed position.
*
* @type {UStringPos}
*/
this.position = new UStringPos(0, 0);
}

/*
* Reads a code point at specific index.
*
* @param {Number} index
* @return {Number}
*/
codePointAt(index)
{
this.position.walk(this.string, index);
return this.string.codePointAt(this.position.offset);
}

/*
* Slices the internal string by code point indices.
*
* @param {Number} i
* @param {Number} j
* @return {String}
*/
slice(i, j)
{
this.position.walk(this.string, i);
i = this.position.offset;

this.position.walk(this.string, j);
j = this.position.offset;

return this.string.slice(i, j);
}
};

/*
* Class that tracks the position of a code point on a string.
*/
export class UStringPos
{
/*
* Constructs the {UStringPos}.
*
* @param {Number} index The initial index.
* @param {Number} offset The initial offset.
*/
constructor(index, offset)
{
/*
* @type {Number}
*/
this.index = index;

/*
* @type {Number}
*/
this.offset = offset;
}

/*
* Walks to the given index.
*
* @param {String} s
* @param {Number} index
* @note No backward. Track the previous position instead.
* @return {void}
*/
walk(s, index)
{
for (; this.index < index; ++this.index)
this.offset += (
this._usingSurrogates(
s.charCodeAt(this.offset)
) ? 2 : 1
);
}

/*
* @private
*/
_usingSurrogates(ch)
{
return (ch >= 0xD800) && (ch <= 0xDBFF);
}
};

有什么吗?

最佳答案

好的。所以这是 this.source.position.offset 的问题:当我执行 ++this.index 时,我的 UStringPos 的偏移量没有'更新。问题出在切片上。

    this.source.string.slice(
start, this.source.position.offset
);

这个切片是基于偏移量的,因为我必须跟踪标识符开始的前一个偏移量。

解决方案

我可以使用我自己的 UString 类的切片,并将第一个参数用作偏移量,将最后一个参数用作普通索引。

'use strict';

export class UString
{
// ...

/*
* Slices the internal string by using a pair of
* offset and code point indices.
*
* @param {Number} i Offset
* @param {Number} j
* @return {String}
*/
slice(i, j)
{
this.position.walk(this.string, j);
j = this.position.offset;

return this.string.slice(i, j);
}

};

关于javascript - 读取代码点时出现偏移问题,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/45721257/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com