gpt4 book ai didi

ANTLR 实现 python 之类的缩进依赖语法的最简单方法是什么?

转载 作者:行者123 更新时间:2023-12-03 12:11:56 28 4
gpt4 key购买 nike

我正在尝试实现 python 之类的依赖缩进的语法。

源示例:

ABC QWE
  CDE EFG
  EFG CDE
    ABC
  QWE ZXC

如我所见,我需要的是实现两个标记 INDENT 和 DEDENT,所以我可以写如下:
// Sketch of the desired grammar, not a working one: the two lexer rules at the
// bottom are placeholders.
grammar mygrammar;
// The whole input is one or more identifiers and/or nested blocks.
text: (ID | block)+;
// A block is one or more identifiers and/or nested blocks wrapped in INDENT ... DEDENT.
block: INDENT (ID|block)+ DEDENT;
// Placeholders: how to define these two tokens is the actual question.
INDENT: ????;
DEDENT: ????;

有没有什么简单的方法可以使用 ANTLR 来实现这一点?

(如果可能的话,我更喜欢使用标准的 ANTLR 词法分析器。)

最佳答案

我不知道最简单的处理方法是什么,但以下是一种相对简单的方法。每当您在词法分析器中匹配换行符时,可选择匹配一个或多个空格。如果换行后有空格,则将这些空格的长度与当前的缩进大小进行比较。如果它大于当前缩进大小,则发出 Indent token ,如果它小于当前缩进大小,则发出 Dedent token ,如果相同,则不执行任何操作。

您还需要在文件末尾发出若干个 Dedent token,使每个 Indent token 都有一个与之匹配的 Dedent token。

为了使其正常工作,您 必须在输入源文件中添加前导和尾随换行符!

ANTLR3

快速演示:

// Combined lexer/parser grammar for a small indentation-sensitive demo language.
grammar PyEsque;

options {
// Make the generated parser build an AST.
output=AST;
}

tokens {
// Imaginary token used as the root of each block subtree (see the block rule).
BLOCK;
}

@lexer::members {

// Whitespace width recorded at the most recent Indent or Dedent. Starts at -1,
// so the first NewLine match (width 0 or more) always takes the Indent branch.
private int previousIndents = -1;
// Number of Indent tokens emitted so far minus the number of Dedent tokens.
private int indentLevel = 0;
// Buffer of emitted tokens. It lets a single rule match produce tokens that are
// handed out one at a time by nextToken().
java.util.Queue<Token> tokens = new java.util.LinkedList<Token>();

// Records the token as the current one and also appends it to the buffer.
@Override
public void emit(Token t) {
state.token = t;
tokens.offer(t);
}

// Runs the regular lexing step only for its side effect of filling the buffer
// through emit(); its return value is ignored. Returns the oldest buffered
// token, or Token.EOF_TOKEN when the buffer is empty.
@Override
public Token nextToken() {
super.nextToken();
return tokens.isEmpty() ? Token.EOF_TOKEN : tokens.poll();
}

// Emits one token of the given type and adjusts indentLevel: -1 for Dedent,
// +1 for any other type. The token text records the resulting level.
private void jump(int ttype) {
indentLevel += (ttype == Dedent ? -1 : 1);
emit(new CommonToken(ttype, "level=" + indentLevel));
}
}

// Entry rule: a single block followed by end of input. The rewrite keeps only
// the block subtree.
parse
: block EOF -> block
;

// An Indent ... Dedent pair around block_atoms, rewritten into a subtree rooted
// at the imaginary BLOCK token.
block
: Indent block_atoms Dedent -> ^(BLOCK block_atoms)
;

// One or more identifiers and/or nested blocks.
block_atoms
: (Id | block)+
;

// A line break optionally followed by leading whitespace. The action compares
// the whitespace width n with previousIndents and emits Indent or Dedent tokens.
// NOTE: a decrease emits exactly one Dedent no matter how far n dropped, and the
// end-of-input flush is only reached when n equals previousIndents.
NewLine
: NL SP?
{
// n is the number of whitespace characters after the line break, 0 if there
// are none. Spaces and tabs each count as one character.
int n = $SP.text == null ? 0 : $SP.text.length();
if(n > previousIndents) {
// Deeper than before: open one level.
jump(Indent);
previousIndents = n;
}
else if(n < previousIndents) {
// Shallower than before: close a single level.
jump(Dedent);
previousIndents = n;
}
else if(input.LA(1) == EOF) {
// Same width and nothing left to read: close every level that is still open.
while(indentLevel > 0) {
jump(Dedent);
}
}
else {
// Same width in the middle of the input: the line break carries no meaning.
skip();
}
}
;

// An identifier: one or more ASCII letters.
Id
: ('a'..'z' | 'A'..'Z')+
;

// Whitespace that was not consumed by NewLine is discarded.
SpaceChars
: SP {skip();}
;

// Line break: LF, CR LF, or a lone CR.
fragment NL : '\r'? '\n' | '\r';
// One or more spaces or tabs.
fragment SP : (' ' | '\t')+;
// Empty fragment rules that only declare the Indent and Dedent token types;
// the tokens themselves are created in jump().
fragment Indent : ;
fragment Dedent : ;

您可以使用该类测试解析器:

import org.antlr.runtime.*;
import org.antlr.runtime.tree.*;
import org.antlr.stringtemplate.*;

/**
 * Command-line driver for the PyEsque demo grammar: lexes and parses an input
 * file and prints the resulting AST in Graphviz DOT format.
 */
public class Main {

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "in.txt";

    /**
     * Parses the input file and writes the DOT representation of its AST to
     * standard output.
     *
     * @param args optional; {@code args[0]} is the path of the file to parse,
     *             defaulting to {@code in.txt} when absent
     * @throws Exception if the file cannot be read or parsing fails
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the original hard-coded file so existing usage keeps working.
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;

        PyEsqueLexer lexer = new PyEsqueLexer(new ANTLRFileStream(inputPath));
        PyEsqueParser parser = new PyEsqueParser(new CommonTokenStream(lexer));
        CommonTree tree = (CommonTree)parser.parse().getTree();
        DOTTreeGenerator gen = new DOTTreeGenerator();
        StringTemplate st = gen.toDOT(tree);
        System.out.println(st);
    }
}

如果您现在将以下内容放入名为 in.txt 的文件中:

AAA AAAAA
  BBB BB B
  BB BBBBB BB
    CCCCCC C CC
  BB BBBBBB
    C CCC
      DDD DD D
      DDD D DDD


(Note the leading and trailing line breaks!)

then you'll see output that corresponds to the following AST:

enter image description here

Note that my demo wouldn't produce enough dedents in succession, like dedenting from ccc to aaa (2 dedent tokens are needed):

aaa
  bbb
    ccc
aaa

您需要调整 else if(n < previousIndents) { ... } 中的代码,使其根据 n 与 previousIndents 之间的差值发出可能多于 1 个的 Dedent token。我一时想到的写法大致如下:

 else if(n < previousIndents) {
// Emit one Dedent per indentation step that was closed.
// NOTE: this assumes every indentation step is exactly 2 characters wide.
// Jumping from previousIndents=6 to n=2 will result in emitting 2 `Dedent`
// tokens. The integer division rounds down, so a drop that is not a multiple
// of 2 emits fewer tokens than steps closed.
int numDedents = (previousIndents - n) / 2;
while(numDedents-- > 0) {
jump(Dedent);
}
previousIndents = n;
}

ANTLR4

对于 ANTLR4,请执行以下操作:
// Excerpt of a Python 3 grammar showing only the parts that deal with indentation.
grammar Python3;

// Token types without a lexer rule of their own; the lexer actions create them
// through commonToken().
tokens { INDENT, DEDENT }

@lexer::members {
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
// Declared as LinkedList because nextToken() uses index-based access on it.
private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>();
// The stack that keeps track of the indentation level. ArrayDeque replaces the
// legacy, synchronized java.util.Stack; push/peek/pop/isEmpty keep the same
// last-in-first-out meaning, and every peek/pop in this grammar is guarded by isEmpty().
private java.util.Deque<Integer> indents = new java.util.ArrayDeque<>();
// The amount of opened braces, brackets and parentheses. The NEWLINE rule reads
// it; presumably it is maintained by lexer rules not shown in this excerpt.
private int opened = 0;
// The most recent token on the default channel returned by the underlying
// lexer (updated in nextToken()).
private Token lastToken = null;
// Buffers every emitted token for later delivery by nextToken() and registers
// it as the current token of the lexer.
@Override
public void emit(Token t) {
    tokens.add(t);
    super.setToken(t);
}

// Returns the next token for the parser. Before lexing, if the input is at
// end-of-file while indentation levels are still open, it buffers a closing
// NEWLINE, one DEDENT per open level and a fresh EOF. Tokens are handed out
// from the buffer in the order they were emitted.
@Override
public Token nextToken() {
// Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input.LA(1) == EOF && !this.indents.isEmpty()) {
// Remove every EOF token currently in the buffer (iterating backwards so
// that removals do not shift the indexes still to be visited).
for (int i = tokens.size() - 1; i >= 0; i--) {
if (tokens.get(i).getType() == EOF) {
tokens.remove(i);
}
}

// First emit an extra line break that serves as the end of the statement.
this.emit(commonToken(Python3Parser.NEWLINE, "\n"));

// Now emit as many DEDENT tokens as needed: one per open indentation level.
while (!indents.isEmpty()) {
this.emit(createDedent());
indents.pop();
}

// Put the EOF back on the token stream.
this.emit(commonToken(Python3Parser.EOF, "<EOF>"));
}

// Run the regular lexing step; tokens it emits go through emit() and are buffered.
Token next = super.nextToken();

if (next.getChannel() == Token.DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this.lastToken = next;
}

// Hand out the oldest buffered token; fall back to next when nothing is buffered.
return tokens.isEmpty() ? next : tokens.poll();
}

// Builds a DEDENT token through commonToken() with empty text and stamps it
// with the line number of the last default-channel token.
private Token createDedent() {
    int line = this.lastToken.getLine();
    CommonToken token = commonToken(Python3Parser.DEDENT, "");
    token.setLine(line);
    return token;
}

// Creates a default-channel token of the given type that ends at the character
// just before the current input position. The start index is derived from the
// length of the given text; for empty text it equals the stop index.
private CommonToken commonToken(int type, String text) {
    int stop = this.getCharIndex() - 1;
    int start = stop;
    if (!text.isEmpty()) {
        start = stop - text.length() + 1;
    }
    return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop);
}

// Computes the indentation width of a run of leading whitespace. Every
// character counts as one column, except a tab, which advances the width to
// the next multiple of eight, as the Python reference describes:
//
// "Tabs are replaced (from left to right) by one to eight spaces
// such that the total number of characters up to and including
// the replacement is a multiple of eight [...]"
//
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
static int getIndentationCount(String spaces) {
    int width = 0;
    for (int i = 0; i < spaces.length(); i++) {
        if (spaces.charAt(i) == '\t') {
            // Jump to the next tab stop (multiple of eight).
            width = (width / 8 + 1) * 8;
        } else {
            // Any other character is treated as a single space.
            width++;
        }
    }
    return width;
}

// True while the lexer is still at line 1, column 0, i.e. nothing of the input
// has been consumed on the first line yet.
boolean atStartOfInput() {
    boolean firstColumn = super.getCharPositionInLine() == 0;
    boolean firstLine = super.getLine() == 1;
    return firstColumn && firstLine;
}
}

// Accepts a lone NEWLINE, a simple_stmt, or a compound_stmt followed by NEWLINE.
// simple_stmt and compound_stmt are defined among the parser rules not shown here.
single_input
: NEWLINE
| simple_stmt
| compound_stmt NEWLINE
;

// more parser rules

// Matches either leading whitespace at the very start of the input or a line
// break with optional trailing whitespace. The action turns the whitespace
// width into NEWLINE, INDENT and DEDENT tokens using the indents stack.
NEWLINE
: ( {atStartOfInput()}? SPACES
| ( '\r'? '\n' | '\r' ) SPACES?
)
{
// Split the matched text into its line-break part and its whitespace part.
String newLine = getText().replaceAll("[^\r\n]+", "");
String spaces = getText().replaceAll("[\r\n]+", "");
int next = _input.LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents,
// dedents and line breaks. A line that starts with a comment is treated the same way.
skip();
}
else {
// The NEWLINE token is emitted first so it precedes any INDENT or DEDENT.
emit(commonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
// An empty stack stands for indentation width 0.
int previous = indents.isEmpty() ? 0 : indents.peek();
if (indent == previous) {
// skip indents of the same size as the present indent-size
skip();
}
else if (indent > previous) {
// Deeper than before: remember the new width and open one level.
indents.push(indent);
emit(commonToken(Python3Parser.INDENT, spaces));
}
else {
// Possibly emit more than 1 DEDENT token: one for every recorded
// width that is larger than the new one.
while(!indents.isEmpty() && indents.peek() > indent) {
this.emit(createDedent());
indents.pop();
}
}
}
}
;

// more lexer rules

取自: https://github.com/antlr/grammars-v4/blob/master/python3/Python3.g4

关于ANTLR 实现 python 之类的缩进依赖语法的最简单方法是什么?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/8642154/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com