Heisenbug or the true story of undefined behavior, part 2
22 Dec 2019 - Development Original link
In the previous part, we discussed a problem that showed up in RedBaron
. Today I want to show you an old piece of work: my little investigation into another strange bug (or feature?) in Baron
. Baron
is a part of the RedBaron
library; more precisely, RedBaron
relies on the Full Syntax Tree (FST) that Baron
produces and hands over for further manipulation.
Baron
had a problem with CommentNode’s position in the FST.
This bug broke RedBaron’s node recognition.
CommentNode’s position problem
Below I’ll tell you the story, starting with some examples. Let’s roll up our sleeves and get down to business.
Take a look at the examples:
- First
def main():
print("main")
# cmt
if __name__ == "__main__":
main()
- Second
def main():
print("main")
a = 1
# cmt
if __name__ == "__main__":
main()
I used the dump method (.help()
) provided by RedBaron to get a representation of the internal tree structures:
- Tree for the first case
0 -----------------------------------------------------
DefNode()
# identifiers: def, def_, defnode, funcdef, funcdef_
# default test value: name
name=u'main'
decorators ->
arguments ->
value ->
* PrintNode()
# identifiers: print, print_, printnode
destination ->
None
value ->
* AssociativeParenthesisNode()
# identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
value ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"main"'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
* CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# cmt'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
1 -----------------------------------------------------
IfelseblockNode()
# identifiers: ifelseblock, ifelseblock_, ifelseblocknode
value ->
* IfNode()
# identifiers: if, if_, ifnode
test ->
ComparisonNode()
# identifiers: comparison, comparison_, comparisonnode
first ->
NameNode()
# identifiers: name, name_, namenode
value=u'__name__'
value ->
ComparisonOperatorNode()
# identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
first=u'=='
second=''
second ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"__main__"'
value ->
* AtomtrailersNode()
# identifiers: atomtrailers, atomtrailers_, atomtrailersnode
value ->
* NameNode()
# identifiers: name, name_, namenode
value=u'main'
* CallNode()
# identifiers: call, call_, callnode
value ->
None
def main():
print("main")
# cmt
if __name__ == "__main__":
main()
- Tree for the second case
0 -----------------------------------------------------
DefNode()
# identifiers: def, def_, defnode, funcdef, funcdef_
# default test value: name
name=u'main'
decorators ->
arguments ->
value ->
* PrintNode()
# identifiers: print, print_, printnode
destination ->
None
value ->
* AssociativeParenthesisNode()
# identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
value ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"main"'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
1 -----------------------------------------------------
AssignmentNode()
# identifiers: assign, assignment, assignment_, assignmentnode
operator=''
target ->
NameNode()
# identifiers: name, name_, namenode
value=u'a'
value ->
IntNode()
# identifiers: int, int_, intnode
value=u'1'
2 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
3 -----------------------------------------------------
CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# cmt'
4 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
5 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
6 -----------------------------------------------------
IfelseblockNode()
# identifiers: ifelseblock, ifelseblock_, ifelseblocknode
value ->
* IfNode()
# identifiers: if, if_, ifnode
test ->
ComparisonNode()
# identifiers: comparison, comparison_, comparisonnode
first ->
NameNode()
# identifiers: name, name_, namenode
value=u'__name__'
value ->
ComparisonOperatorNode()
# identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
first=u'=='
second=''
second ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"__main__"'
value ->
* AtomtrailersNode()
# identifiers: atomtrailers, atomtrailers_, atomtrailersnode
value ->
* NameNode()
# identifiers: name, name_, namenode
value=u'main'
* CallNode()
# identifiers: call, call_, callnode
value ->
None
def main():
print("main")
a = 1
# cmt
if __name__ == "__main__":
main()
In the first excerpt, what we see is not what we get: CommentNode
is not in the right position.
The right position in the first example is outside the DefNode
of main()
. Nevertheless, the source has been dumped (.dumps()
) correctly.
Let me show you another example. What if I comment out a = 1
:
0 -----------------------------------------------------
DefNode()
# identifiers: def, def_, defnode, funcdef, funcdef_
# default test value: name
name=u'main'
decorators ->
arguments ->
value ->
* PrintNode()
# identifiers: print, print_, printnode
destination ->
None
value ->
* AssociativeParenthesisNode()
# identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
value ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"main"'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
* CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# a = 1'
* CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# cmt'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
1 -----------------------------------------------------
IfelseblockNode()
# identifiers: ifelseblock, ifelseblock_, ifelseblocknode
value ->
* IfNode()
# identifiers: if, if_, ifnode
test ->
ComparisonNode()
# identifiers: comparison, comparison_, comparisonnode
first ->
NameNode()
# identifiers: name, name_, namenode
value=u'__name__'
value ->
ComparisonOperatorNode()
# identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
first=u'=='
second=''
second ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"__main__"'
value ->
* AtomtrailersNode()
# identifiers: atomtrailers, atomtrailers_, atomtrailersnode
value ->
* NameNode()
# identifiers: name, name_, namenode
value=u'main'
* CallNode()
# identifiers: call, call_, callnode
value ->
None
def main():
print("main")
# a = 1
# cmt
if __name__ == "__main__":
main()
Here we have the same issue as in the first case. Also note that between the two CommentNode
s there is no EndlNode
. It is missing.
Let’s look at Baron’s FST for these cases:
- Baron’s FST for the first case
[{'arguments': [],
'decorators': [],
'fifth_formatting': [],
'first_formatting': [{'type': 'space', 'value': u' '}],
'fourth_formatting': [],
'name': u'main',
'second_formatting': [],
'sixth_formatting': [],
'third_formatting': [],
'type': 'def',
'value': [{'formatting': [],
'indent': u' ',
'type': 'endl',
'value': u'\n'},
{'destination': None,
'destination_formatting': [],
'formatting': [],
'type': 'print',
'value': [{'first_formatting': [],
'fourth_formatting': [],
'second_formatting': [],
'third_formatting': [],
'type': 'associative_parenthesis',
'value': {'first_formatting': [],
'second_formatting': [],
'type': 'string',
'value': u'"main"'}}]},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]},
{'formatting': [], 'type': 'comment', 'value': u'# cmt'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'type': 'ifelseblock',
'value': [{'first_formatting': [{'type': 'space', 'value': u' '}],
'second_formatting': [],
'test': {'first': {'type': 'name', 'value': u'__name__'},
'first_formatting': [{'type': 'space', 'value': u' '}],
'second': {'first_formatting': [],
'second_formatting': [],
'type': 'string',
'value': u'"__main__"'},
'second_formatting': [{'type': 'space', 'value': u' '}],
'type': 'comparison',
'value': {'first': u'==',
'formatting': [],
'second': '',
'type': 'comparison_operator'}},
'third_formatting': [],
'type': 'if',
'value': [{'formatting': [],
'indent': u' ',
'type': 'endl',
'value': u'\n'},
{'type': 'atomtrailers',
'value': [{'type': 'name', 'value': u'main'},
{'first_formatting': [],
'fourth_formatting': [],
'second_formatting': [],
'third_formatting': [],
'type': 'call',
'value': []}]},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]}]}]
- Baron’s FST for the second case
[{'arguments': [],
'decorators': [],
'fifth_formatting': [],
'first_formatting': [{'type': 'space', 'value': u' '}],
'fourth_formatting': [],
'name': u'main',
'second_formatting': [],
'sixth_formatting': [],
'third_formatting': [],
'type': 'def',
'value': [{'formatting': [],
'indent': u' ',
'type': 'endl',
'value': u'\n'},
{'destination': None,
'destination_formatting': [],
'formatting': [],
'type': 'print',
'value': [{'first_formatting': [],
'fourth_formatting': [],
'second_formatting': [],
'third_formatting': [],
'type': 'associative_parenthesis',
'value': {'first_formatting': [],
'second_formatting': [],
'type': 'string',
'value': u'"main"'}}]},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]},
{'first_formatting': [{'type': 'space', 'value': u' '}],
'operator': '',
'second_formatting': [{'type': 'space', 'value': u' '}],
'target': {'type': 'name', 'value': u'a'},
'type': 'assignment',
'value': {'section': 'number', 'type': 'int', 'value': u'1'}},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'formatting': [], 'type': 'comment', 'value': u'# cmt'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'type': 'ifelseblock',
'value': [{'first_formatting': [{'type': 'space', 'value': u' '}],
'second_formatting': [],
'test': {'first': {'type': 'name', 'value': u'__name__'},
'first_formatting': [{'type': 'space', 'value': u' '}],
'second': {'first_formatting': [],
'second_formatting': [],
'type': 'string',
'value': u'"__main__"'},
'second_formatting': [{'type': 'space', 'value': u' '}],
'type': 'comparison',
'value': {'first': u'==',
'formatting': [],
'second': '',
'type': 'comparison_operator'}},
'third_formatting': [],
'type': 'if',
'value': [{'formatting': [],
'indent': u' ',
'type': 'endl',
'value': u'\n'},
{'type': 'atomtrailers',
'value': [{'type': 'name', 'value': u'main'},
{'first_formatting': [],
'fourth_formatting': [],
'second_formatting': [],
'third_formatting': [],
'type': 'call',
'value': []}]},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]}]}]
- Baron’s FST for the second case (where I have commented out the assignment node):
[{'arguments': [],
'decorators': [],
'fifth_formatting': [],
'first_formatting': [{'type': 'space', 'value': u' '}],
'fourth_formatting': [],
'name': u'main',
'second_formatting': [],
'sixth_formatting': [],
'third_formatting': [],
'type': 'def',
'value': [{'formatting': [],
'indent': u' ',
'type': 'endl',
'value': u'\n'},
{'destination': None,
'destination_formatting': [],
'formatting': [],
'type': 'print',
'value': [{'first_formatting': [],
'fourth_formatting': [],
'second_formatting': [],
'third_formatting': [],
'type': 'associative_parenthesis',
'value': {'first_formatting': [],
'second_formatting': [],
'type': 'string',
'value': u'"main"'}}]},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]},
{'formatting': [], 'type': 'comment', 'value': u'#a = 1'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'formatting': [], 'type': 'comment', 'value': u'# cmt'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
{'type': 'ifelseblock',
'value': [{'first_formatting': [{'type': 'space', 'value': u' '}],
'second_formatting': [],
'test': {'first': {'type': 'name', 'value': u'__name__'},
'first_formatting': [{'type': 'space', 'value': u' '}],
'second': {'first_formatting': [],
'second_formatting': [],
'type': 'string',
'value': u'"__main__"'},
'second_formatting': [{'type': 'space', 'value': u' '}],
'type': 'comparison',
'value': {'first': u'==',
'formatting': [],
'second': '',
'type': 'comparison_operator'}},
'third_formatting': [],
'type': 'if',
'value': [{'formatting': [],
'indent': u' ',
'type': 'endl',
'value': u'\n'},
{'type': 'atomtrailers',
'value': [{'type': 'name', 'value': u'main'},
{'first_formatting': [],
'fourth_formatting': [],
'second_formatting': [],
'third_formatting': [],
'type': 'call',
'value': []}]},
{'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]}]}]
I was a little surprised, because everything seemed to be fine. But the problem lay deeper.
While debugging, I realized that RedBaron had already received a broken sequence of tokens. Consequently, the problem occurs between the two layers and does not affect the dump function in every case.
Let’s trace back the function calls to the place where (I think) the problem occurs:
__init__, redbaron.py:37 ->
parse, baron.py:49 ->
tokenize, baron.py:70 ->
group, formatting_grouper.py:108 ->
group_generator, formatting_grouper.py:118
The code inside the
tokenize
function had to be refactored so that I could set a breakpoint.
- Before a space_group
function call:
00 = {tuple} <type 'tuple'>: (u'DEF', u'def')
01 = {tuple} <type 'tuple'>: ('SPACE', u' ')
02 = {tuple} <type 'tuple'>: ('NAME', u'main')
03 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
04 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
05 = {tuple} <type 'tuple'>: ('COLON', u':')
06 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
07 = {tuple} <type 'tuple'>: ('SPACE', u' ')
08 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
09 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
10 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
11 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: ('COMMENT', u'# cmt')
15 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
16 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
17 = {tuple} <type 'tuple'>: (u'IF', u'if')
18 = {tuple} <type 'tuple'>: ('SPACE', u' ')
19 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
20 = {tuple} <type 'tuple'>: ('SPACE', u' ')
21 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==')
22 = {tuple} <type 'tuple'>: ('SPACE', u' ')
23 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
24 = {tuple} <type 'tuple'>: ('COLON', u':')
25 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
26 = {tuple} <type 'tuple'>: ('SPACE', u' ')
27 = {tuple} <type 'tuple'>: ('NAME', u'main')
28 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
29 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
30 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
31 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
- After a
space_group
function call
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
06 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
07 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
08 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
09 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
10 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
15 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
16 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
17 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
18 = {tuple} <type 'tuple'>: ('COLON', u':')
19 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
20 = {tuple} <type 'tuple'>: ('NAME', u'main')
21 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
22 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
23 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
24 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
And I repeated it for the second case:
- Before
space_group
function call:
00 = {tuple} <type 'tuple'>: (u'DEF', u'def')
01 = {tuple} <type 'tuple'>: ('SPACE', u' ')
02 = {tuple} <type 'tuple'>: ('NAME', u'main')
03 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
04 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
05 = {tuple} <type 'tuple'>: ('COLON', u':')
06 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
07 = {tuple} <type 'tuple'>: ('SPACE', u' ')
08 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
09 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
10 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
11 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: ('NAME', u'a')
15 = {tuple} <type 'tuple'>: ('SPACE', u' ')
16 = {tuple} <type 'tuple'>: ('EQUAL', u'=')
17 = {tuple} <type 'tuple'>: ('SPACE', u' ')
18 = {tuple} <type 'tuple'>: ('INT', u'1')
19 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
20 = {tuple} <type 'tuple'>: ('COMMENT', u'# cmt')
21 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
22 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
23 = {tuple} <type 'tuple'>: (u'IF', u'if')
24 = {tuple} <type 'tuple'>: ('SPACE', u' ')
25 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
26 = {tuple} <type 'tuple'>: ('SPACE', u' ')
27 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==')
28 = {tuple} <type 'tuple'>: ('SPACE', u' ')
29 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
30 = {tuple} <type 'tuple'>: ('COLON', u':')
31 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
32 = {tuple} <type 'tuple'>: ('SPACE', u' ')
33 = {tuple} <type 'tuple'>: ('NAME', u'main')
34 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
35 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
36 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
37 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
- After
space_group
function call:
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
06 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
07 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
08 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
09 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
10 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
12 = {tuple} <type 'tuple'>: ('NAME', u'a')
13 = {tuple} <type 'tuple'>: ('EQUAL', u'=', [('SPACE', u' ')], [('SPACE', u' ')])
14 = {tuple} <type 'tuple'>: ('INT', u'1')
15 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
16 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
17 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
18 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
19 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
20 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
21 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
22 = {tuple} <type 'tuple'>: ('COLON', u':')
23 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
24 = {tuple} <type 'tuple'>: ('NAME', u'main')
25 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
26 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
27 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
28 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
In both excerpts, the COMMENT
was attached to an ENDL
. (Surprise!)
In the next steps, I got the following results from two calls, inner_group()
and mark_indentation()
, for the second case:
- inner_group()
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
06 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
07 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
08 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
09 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
10 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
12 = {tuple} <type 'tuple'>: ('NAME', u'a')
13 = {tuple} <type 'tuple'>: ('EQUAL', u'=', [('SPACE', u' ')], [('SPACE', u' ')])
14 = {tuple} <type 'tuple'>: ('INT', u'1')
15 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
16 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
17 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
18 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
19 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
20 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
21 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
22 = {tuple} <type 'tuple'>: ('COLON', u':')
23 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
24 = {tuple} <type 'tuple'>: ('NAME', u'main')
25 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
26 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
27 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
28 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
- mark_indentation()
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
06 = {tuple} <type 'tuple'>: ('INDENT', '')
07 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
08 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
09 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
10 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('DEDENT', '')
14 = {tuple} <type 'tuple'>: ('NAME', u'a')
15 = {tuple} <type 'tuple'>: ('EQUAL', u'=', [('SPACE', u' ')], [('SPACE', u' ')])
16 = {tuple} <type 'tuple'>: ('INT', u'1')
17 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
18 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
19 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
20 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
21 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
22 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
23 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
24 = {tuple} <type 'tuple'>: ('COLON', u':')
25 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
26 = {tuple} <type 'tuple'>: ('INDENT', '')
27 = {tuple} <type 'tuple'>: ('NAME', u'main')
28 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
29 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
30 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
31 = {tuple} <type 'tuple'>: ('DEDENT', '')
32 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
Although the COMMENT
was attached to the ENDL
, everything seems fine. OK, but what about the first case:
- inner_group()
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
06 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
07 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
08 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
09 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
10 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
15 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
16 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
17 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
18 = {tuple} <type 'tuple'>: ('COLON', u':')
19 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
20 = {tuple} <type 'tuple'>: ('NAME', u'main')
21 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
22 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
23 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
24 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
- mark_indentation()
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
06 = {tuple} <type 'tuple'>: ('INDENT', '')
07 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
08 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
09 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
10 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
15 = {tuple} <type 'tuple'>: ('DEDENT', '')
16 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
17 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
18 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
19 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
20 = {tuple} <type 'tuple'>: ('COLON', u':')
21 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u' ')])
22 = {tuple} <type 'tuple'>: ('INDENT', '')
23 = {tuple} <type 'tuple'>: ('NAME', u'main')
24 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
25 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
26 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
27 = {tuple} <type 'tuple'>: ('DEDENT', '')
28 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
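I got it!
DEDENT
has not been placed correctly. In the first case it is emitted only after the ENDL tokens that follow the comment (index 15 above), so the comment attached to the ENDL ends up inside the body of main().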
How to fix it?
There were two ways to fix it: fix mark_indentation
or fix space_group
.
I reviewed both options and picked the space_group
function,
because the mark_indentation
function just takes the sequence already modified by space_group
.
Besides, attaching a COMMENT
node to an ENDL
does not seem like a good choice.
From my point of view,
CommentNode
is an independent element of the tree. It is not relevant to the code if we want to implement an interpreter. But for the purposes of analysis, we have to carry all helpful information from the lexer level (into a hidden
layer) up to the parser level and the AST.
My fix is presented below (https://github.com/rojaster/baron/commit/7a2cf9e79862cf0f0a10ef3c74a6fcf704251175):
formatting_grouper.py
def group_generator(sequence):
    ....
    markers = ("SPACE") # ("SPACE","COMMENT") it fixes a CommentNode position
    while True:
        ....
        if current[0] in markers and iterator.show_next() and iterator.show_next()[0] in GROUP_SPACE_BEFORE:
            ....
Code must be as simple as possible, but not so simple as to be stupid. That is what we call experience, skill, art.
And as a result I got the following:
- Samples:
t_buffer = textwrap.dedent(u"""\
def main():
print("main")
# cmt
if __name__ == "__main__":
main()
""")
t_buffer0 = textwrap.dedent(u"""\
def main():
print("main")
a = 1
# cmt
if __name__ == "__main__":
main()
""")
t_buffer1 = textwrap.dedent(u"""\
#!/usr/bin/env python
#! -*- coding: utf-8 -*-
def main():
# comment before
print("main") # inline comment
#comment after
# a = 1
# cmt
if __name__ == "__main__":
main()
""")
rbt_buffer = RedBaron(t_buffer)
rbt_buffer0 = RedBaron(t_buffer0)
rbt_buffer1 = RedBaron(t_buffer1)
print(rbt_buffer.help(deep=True))
print(rbt_buffer.dumps())
print("\n")
print("0"*100)
print(rbt_buffer0.help(deep=True))
print(rbt_buffer0.dumps())
print("\n")
print("1"*100)
print(rbt_buffer1.help(deep=True))
print(rbt_buffer1.dumps())
print("\n")
- Sample trees
0 -----------------------------------------------------
DefNode()
# identifiers: def, def_, defnode, funcdef, funcdef_
# default test value: name
name=u'main'
decorators ->
arguments ->
value ->
* PrintNode()
# identifiers: print, print_, printnode
destination ->
None
value ->
* AssociativeParenthesisNode()
# identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
value ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"main"'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
1 -----------------------------------------------------
CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# cmt'
2 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
3 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
4 -----------------------------------------------------
IfelseblockNode()
# identifiers: ifelseblock, ifelseblock_, ifelseblocknode
value ->
* IfNode()
# identifiers: if, if_, ifnode
test ->
ComparisonNode()
# identifiers: comparison, comparison_, comparisonnode
first ->
NameNode()
# identifiers: name, name_, namenode
value=u'__name__'
value ->
ComparisonOperatorNode()
# identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
first=u'=='
second=''
second ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"__main__"'
value ->
* AtomtrailersNode()
# identifiers: atomtrailers, atomtrailers_, atomtrailersnode
value ->
* NameNode()
# identifiers: name, name_, namenode
value=u'main'
* CallNode()
# identifiers: call, call_, callnode
value ->
None
def main():
print("main")
# cmt
if __name__ == "__main__":
main()
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0 -----------------------------------------------------
DefNode()
# identifiers: def, def_, defnode, funcdef, funcdef_
# default test value: name
name=u'main'
decorators ->
arguments ->
value ->
* PrintNode()
# identifiers: print, print_, printnode
destination ->
None
value ->
* AssociativeParenthesisNode()
# identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
value ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"main"'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
1 -----------------------------------------------------
AssignmentNode()
# identifiers: assign, assignment, assignment_, assignmentnode
operator=''
target ->
NameNode()
# identifiers: name, name_, namenode
value=u'a'
value ->
IntNode()
# identifiers: int, int_, intnode
value=u'1'
2 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
3 -----------------------------------------------------
CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# cmt'
4 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
5 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
6 -----------------------------------------------------
IfelseblockNode()
# identifiers: ifelseblock, ifelseblock_, ifelseblocknode
value ->
* IfNode()
# identifiers: if, if_, ifnode
test ->
ComparisonNode()
# identifiers: comparison, comparison_, comparisonnode
first ->
NameNode()
# identifiers: name, name_, namenode
value=u'__name__'
value ->
ComparisonOperatorNode()
# identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
first=u'=='
second=''
second ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"__main__"'
value ->
* AtomtrailersNode()
# identifiers: atomtrailers, atomtrailers_, atomtrailersnode
value ->
* NameNode()
# identifiers: name, name_, namenode
value=u'main'
* CallNode()
# identifiers: call, call_, callnode
value ->
None
def main():
print("main")
a = 1
# cmt
if __name__ == "__main__":
main()
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
0 -----------------------------------------------------
CommentNode()
# identifiers: comment, comment_, commentnode
value=u'#!/usr/bin/env python'
1 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
2 -----------------------------------------------------
CommentNode()
# identifiers: comment, comment_, commentnode
value=u'#! -*- coding: utf-8 -*-'
3 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
4 -----------------------------------------------------
DefNode()
# identifiers: def, def_, defnode, funcdef, funcdef_
# default test value: name
name=u'main'
decorators ->
arguments ->
value ->
* CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# comment before'
* PrintNode()
# identifiers: print, print_, printnode
destination ->
None
value ->
* AssociativeParenthesisNode()
# identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
value ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"main"'
* CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# inline comment'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=u' '
* CommentNode()
# identifiers: comment, comment_, commentnode
value=u'#comment after'
* EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
5 -----------------------------------------------------
CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# a = 1'
6 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
7 -----------------------------------------------------
CommentNode()
# identifiers: comment, comment_, commentnode
value=u'# cmt'
8 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
9 -----------------------------------------------------
EndlNode()
# identifiers: endl, endl_, endlnode
value=u'\n'
indent=''
10 -----------------------------------------------------
IfelseblockNode()
# identifiers: ifelseblock, ifelseblock_, ifelseblocknode
value ->
* IfNode()
# identifiers: if, if_, ifnode
test ->
ComparisonNode()
# identifiers: comparison, comparison_, comparisonnode
first ->
NameNode()
# identifiers: name, name_, namenode
value=u'__name__'
value ->
ComparisonOperatorNode()
# identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
first=u'=='
second=''
second ->
StringNode()
# identifiers: string, string_, stringnode
value=u'"__main__"'
value ->
* AtomtrailersNode()
# identifiers: atomtrailers, atomtrailers_, atomtrailersnode
value ->
* NameNode()
# identifiers: name, name_, namenode
value=u'main'
* CallNode()
# identifiers: call, call_, callnode
value ->
None
#!/usr/bin/env python
#! -*- coding: utf-8 -*-
def main():
# comment before
print("main") # inline comment
#comment after
# a = 1
# cmt
if __name__ == "__main__":
main()
Process finished with exit code 0
Looks correct. Some questions remain, but this is a kind of hack or patch, and it doesn’t break the existing tests. Let’s call it a solution.<br> Thus the result satisfies my expectations and requirements.<br> Thank you for reading and for your time, dear reader!