Translating Java to Python in the antlr4 python3 target

I am using antlr4 to generate a Python target from the Python3.g4 grammar file in the antlr grammars repo. The generated Python3Lexer.py file contained Java code that I had to translate to Python. Here are the two Java segments it emits; you can find them both in the python3 grammar file here.
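For reference, the Python target is typically generated with a command along the lines of java -jar antlr-4.7-complete.jar -Dlanguage=Python3 Python3.g4 (the exact jar name or antlr4 alias depends on the local installation).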

// A queue where extra tokens are pushed on (see the NEWLINE lexer rule). 
private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>(); 

// The stack that keeps track of the indentation level. 
private java.util.Stack<Integer> indents = new java.util.Stack<>(); 

// The amount of opened braces, brackets and parenthesis. 
private int opened = 0; 

// The most recently produced token. 
private Token lastToken = null; 

@Override 
public void emit(Token t) { 
    super.setToken(t); 
    tokens.offer(t); 
} 

@Override 
public Token nextToken() {

    // Check if the end-of-file is ahead and there are still some DEDENTS expected.
    if (_input.LA(1) == EOF && !this.indents.isEmpty()) {

        // Remove any trailing EOF tokens from our buffer.
        for (int i = tokens.size() - 1; i >= 0; i--) {
            if (tokens.get(i).getType() == EOF) {
                tokens.remove(i);
            }
        }

        // First emit an extra line break that serves as the end of the statement.
        this.emit(commonToken(Python3Parser.NEWLINE, "\n"));

        // Now emit as much DEDENT tokens as needed.
        while (!indents.isEmpty()) {
            this.emit(createDedent());
            indents.pop();
        }

        // Put the EOF back on the token stream.
        this.emit(commonToken(Python3Parser.EOF, "<EOF>"));
    }

    Token next = super.nextToken();

    if (next.getChannel() == Token.DEFAULT_CHANNEL) {
        // Keep track of the last token on the default channel.
        this.lastToken = next;
    }

    return tokens.isEmpty() ? next : tokens.poll();
}

private Token createDedent() { 
    CommonToken dedent = commonToken(Python3Parser.DEDENT, ""); 
    dedent.setLine(this.lastToken.getLine()); 
    return dedent; 
} 

private CommonToken commonToken(int type, String text) { 
    int stop = this.getCharIndex() - 1; 
    int start = text.isEmpty() ? stop : stop - text.length() + 1; 
    return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop); 
} 

static int getIndentationCount(String spaces) { 

    int count = 0; 

    for (char ch : spaces.toCharArray()) {
        switch (ch) {
        case '\t':
            count += 8 - (count % 8);
            break;
        default:
            // A normal space char.
            count++;
        }
    }

    return count; 
} 

boolean atStartOfInput() { 
    return super.getCharPositionInLine() == 0 && super.getLine() == 1; 
} 

and

String newLine = getText().replaceAll("[^\r\n\f]+", ""); 
String spaces = getText().replaceAll("[\r\n\f]+", ""); 
int next = _input.LA(1); 

if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
    // If we're inside a list or on a blank line, ignore all indents,
    // dedents and line breaks.
    skip();
}
else {
    emit(commonToken(NEWLINE, newLine));

    int indent = getIndentationCount(spaces);
    int previous = indents.isEmpty() ? 0 : indents.peek();

    if (indent == previous) {
        // skip indents of the same size as the present indent-size
        skip();
    }
    else if (indent > previous) {
        indents.push(indent);
        emit(commonToken(Python3Parser.INDENT, spaces));
    }
    else {
        // Possibly emit more than 1 DEDENT token.
        while (!indents.isEmpty() && indents.peek() > indent) {
            this.emit(createDedent());
            indents.pop();
        }
    }
}

I translated these myself to:

# A queue where extra tokens are pushed on (see the NEWLINE lexer rule). 
tokens = deque() 

# The stack that keeps track of the indentation level. 
# https://docs.python.org/3/tutorial/datastructures.html#using-lists-as-stacks 
indents = [] 

# The amount of opened braces, brackets and parenthesis. 
opened = 0 

# The most recently produced token. 
lastToken = None 

def emit(self, t): 
    self._token = t 
    self.tokens.append(t) 

def nextToken(self):

    # Check if the end-of-file is ahead and there are still some DEDENTS expected.
    if self._input.LA(1) == Token.EOF and self.indents.size() != 0:

        # Remove any trailing EOF tokens from our buffer.
        for i in range(tokens.size() - 1, 0, -1):
            if self.tokens[i].getType() == Token.EOF:
                self.tokens.remove(i)

        # First emit an extra line break that serves as the end of the statement.
        self.emit(commonToken(Python3Parser.NEWLINE, "\n"))

        # Now emit as much DEDENT tokens as needed.
        while self.indents.size() != 0:
            self.emit(createDedent())
            self.indents.pop()

        # Put the EOF back on the token stream.
        self.emit(commonToken(Python3Parser.EOF, "<EOF>"))

    next = self.nextToken()

    if next.getChannel() == Token.DEFAULT_CHANNEL:
        # Keep track of the last token on the default channel.
        self.lastToken = next

    return next if self.tokens.size() == 0 else self.tokens.popleft()

def createDedent(): 
    dedent = commonToken(Python3Parser.DEDENT, "") 
    dedent.setLine(self.lastToken.getLine()) 
    return dedent 

def commonToken(self, type, text): 
    stop = self.getCharIndex() - 1 
    start = stop if text.size() == 0 else stop - text.size() + 1 
    return CommonToken(self._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop) 

def getIndentationCount(spaces): 

    count = 0 

    for ch in spaces:
        if ch == '\t':
            count += 8 - (count % 8)
            break
        else:
            # A normal space char.
            count = count + 1

    return count 

def atStartOfInput(self): 
    return self.getCharPositionInLine() == 0 and self.getLine() == 1 

and

newLine = getText().replaceAll("[^\r\n\f]+", "")
spaces = getText().replaceAll("[\r\n\f]+", "")
next = self._input.LA(1)

if opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
    # If we're inside a list or on a blank line, ignore all indents,
    # dedents and line breaks.
    skip()
else:
    emit(commonToken(NEWLINE, newLine))

    indent = getIndentationCount(spaces)
    previous = 0 if indents.isEmpty() else indents.peek()

    if indent == previous:
        # skip indents of the same size as the present indent-size
        skip()
    elif indent > previous:
        indents.push(indent)
        emit(commonToken(Python3Parser.INDENT, spaces))
    else:
        # Possibly emit more than 1 DEDENT token.
        while not indents.isEmpty() and indents.peek() > indent:
            self.emit(createDedent())
            indents.pop()

And this is my Python script that runs the antlr output with the Python snippets in place of the Java ones. I ran it with the command python main.py test.py:

import sys 
from antlr4 import * 
from Python3Lexer import Python3Lexer 
from Python3Parser import Python3Parser 
from Python3Listener import Python3Listener 

class FuncPrinter(Python3Listener):
    def enterFuncdef(self, ctx):
        print("Oh, a func")

def main(argv): 
    input = FileStream(argv[1]) 
    lexer = Python3Lexer(input) 
    stream = CommonTokenStream(lexer) 
    parser = Python3Parser(stream) 
    tree = parser.funcdef() 

    printer = KeyPrinter() 
    walker = ParseTreeWalker() 
    walker.walk(printer, tree) 

if __name__ == '__main__': 
    main(sys.argv) 

It errors and prints the following trace:

Traceback (most recent call last):
  File "main.py", line 24, in <module>
    main(sys.argv)
  File "main.py", line 17, in main
    tree = parser.parameters()
  File "...\antler-test\Python3Parser.py", line 1297, in parameters
    self.enterRule(localctx, 14, self.RULE_parameters)
  File "...\antler-test\antlr4\Parser.py", line 358, in enterRule
    self._ctx.start = self._input.LT(1)
  File "...\antler-test\antlr4\CommonTokenStream.py", line 61, in LT
    self.lazyInit()
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 186, in lazyInit
    self.setup()
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 189, in setup
    self.sync(0)
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 111, in sync
    fetched = self.fetch(n)
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 123, in fetch
    t = self.tokenSource.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  [Previous line repeated 985 more times]
  File "...\antler-test\Python3Lexer.py", line 680, in nextToken
    if self._input.LA(1) == Token.EOF and self.indents.size() != 0:
  File "...\antler-test\antlr4\InputStream.py", line 49, in LA
    if offset==0:
RecursionError: maximum recursion depth exceeded in comparison

The input file looks like this:

def fun1(): 
    return None 

def fun2(): 
    return None 

I am not sure whether I translated the Python incorrectly or the recursive algorithm is simply too much for Python, but I also cannot figure out how to make the nextToken method iterative, since it is not tail recursive. Maybe someone could figure that out? Or is there some other problem with what I am doing?
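For context: CPython limits recursion to about 1000 stack frames by default, which matches the "[Previous line repeated 985 more times]" in the trace above. The limit can be inspected (or raised) with the standard library:

import sys
print(sys.getrecursionlimit())  # typically 1000 by default
# sys.setrecursionlimit(2000)   # raising it would only delay the error here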

Answer

I have been working on exactly the same topic for a few days.

It is not that simple. The Python runtime does not have exactly the same API as the Java one; it is less used and fairly incomplete.
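Some of those differences show up directly in the snippets: where the Java runtime has getters and setters, the Python runtime exposes plain attributes. A minimal illustration (using the same CommonToken import as in the code below):

from antlr4.Token import CommonToken

token = CommonToken(type=1)
token.line = 42       # instead of token.setLine(42)
print(token.type)     # instead of token.getType()
print(token.channel)  # instead of token.getChannel()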

I had to use a few workarounds, but it seems to work. Here is my code:

tokens { INDENT, DEDENT }

@lexer::members {

    # A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
    self.tokens = []

    # The stack that keeps track of the indentation level.
    self.indents = []

    # The amount of opened braces, brackets and parenthesis.
    self.opened = 0

    # The most recently produced token.
    self.last_token = None

def emitToken(self, t):
    super().emitToken(t)
    self.tokens.append(t)

def nextToken(self):
    if self._input.LA(1) == Token.EOF and len(self.indents) > 0:
        # Remove any trailing EOF tokens from our buffer.
        while len(self.tokens) > 0 and self.tokens[-1].type == Token.EOF:
            del self.tokens[-1]

        # First emit an extra line break that serves as the end of the statement.
        self.emitToken(self.common_token(Python3Lexer.NEWLINE, "\n"))

        # Now emit as much DEDENT tokens as needed.
        while len(self.indents) != 0:
            self.emitToken(self.create_dedent())
            del self.indents[-1]

        # Put the EOF back on the token stream.
        self.emitToken(self.common_token(Token.EOF, "<EOF>"))

    next = super().nextToken()

    if next.channel == Token.DEFAULT_CHANNEL:
        # Keep track of the last token on the default channel.
        self.last_token = next

    if len(self.tokens) == 0:
        return next
    else:
        t = self.tokens[0]
        del self.tokens[0]
        return t

def create_dedent(self):
    from Python3Parser import Python3Parser
    dedent = self.common_token(Python3Parser.DEDENT, "")
    dedent.line = self.last_token.line
    return dedent

def common_token(self, _type, text):
    from antlr4.Token import CommonToken
    stop = self.getCharIndex() - 1
    if len(self.text) == 0:
        start = stop
    else:
        start = stop - len(self.text) + 1
    return CommonToken(self._tokenFactorySourcePair, _type, Lexer.DEFAULT_TOKEN_CHANNEL, start, stop)

## Calculates the indentation of the provided spaces, taking the
## following rules into account:
##
## "Tabs are replaced (from left to right) by one to eight spaces
## such that the total number of characters up to and including
## the replacement is a multiple of eight [...]"
##
## -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
def getIndentationCount(self, spaces):
    count = 0
    for ch in spaces:
        if ch == '\t':
            count += 8 - (count % 8)
        else:
            count += 1
    return count

def atStartOfInput(self):
    return self._interp.column == 0 and self._interp.line == 1

}
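As a side note, the tab rule in getIndentationCount makes the count position dependent. A few values worked through the code above:

# getIndentationCount('\t')      -> 8
# getIndentationCount(' \t')     -> 8   (one space, then the tab pads to the next multiple of 8)
# getIndentationCount('\t ')     -> 9
# getIndentationCount('    \t')  -> 8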

And for the NEWLINE lexer part:

NEWLINE
 : ( {self.atStartOfInput()}? SPACES
   | ('\r'? '\n' | '\r' | '\f') SPACES?
   )
   {
   import re
   from Python3Parser import Python3Parser
   new_line = re.sub(r"[^\r\n\f]+", "", self._interp.getText(self._input))  # .replaceAll("[^\r\n\f]+", "")
   spaces = re.sub(r"[\r\n\f]+", "", self._interp.getText(self._input))  # .replaceAll("[\r\n\f]+", "")
   next = self._input.LA(1)

   if self.opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
       self.skip()
   else:
       self.emitToken(self.common_token(self.NEWLINE, new_line))

       indent = self.getIndentationCount(spaces)
       if len(self.indents) == 0:
           previous = 0
       else:
           previous = self.indents[-1]

       if indent == previous:
           self.skip()
       elif indent > previous:
           self.indents.append(indent)
           self.emitToken(self.common_token(Python3Parser.INDENT, spaces))
       else:
           while len(self.indents) > 0 and self.indents[-1] > indent:
               self.emitToken(self.create_dedent())
               del self.indents[-1]
   };

You also have to replace the lexer id str throughout the file (with string, for example), because str is a built-in name in Python.
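The clash is easy to demonstrate: str is a built-in name rather than a reserved keyword, so Python happily lets generated code rebind it, and later uses of the type then fail:

str = "generated lexer id"  # shadows the built-in type
str(42)                     # TypeError: 'str' object is not callable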

Answer

Your Python code says:

next = self.nextToken() 

But your Java code says:

Token next = super.nextToken(); 

Note that super is not the same as self. You probably meant something like:

next = super().nextToken()
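Applied to the translated method, the fix would look roughly like this (a minimal sketch keeping the question's structure; the len()-based checks follow the first answer, since Python lists and deques have no size() method):

def nextToken(self):
    if self._input.LA(1) == Token.EOF and len(self.indents) > 0:
        ...  # emit the trailing NEWLINE, DEDENT and EOF tokens as above

    next = super().nextToken()  # was: self.nextToken(), which recurses until RecursionError

    if next.channel == Token.DEFAULT_CHANNEL:
        # Keep track of the last token on the default channel.
        self.lastToken = next

    return next if len(self.tokens) == 0 else self.tokens.popleft()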