Heisenbug or the true story of undefined behavior, part 2

22 Dec 2019 - Development Original link

In the previous part, we discussed a problem that showed up in RedBaron. Today I want to show you some older work of mine: a small investigation into another strange bug (or feature?) in Baron. Baron is a part of the RedBaron library; more precisely, RedBaron relies on the Full Syntax Tree (FST) that Baron produces and uses it for all further manipulations. Baron had a problem with the position of CommentNode in the FST, and this bug broke RedBaron’s node recognition.

CommentNode’s position problem

Below I’ll tell you the story, starting with examples. Let’s roll up our sleeves and get down to business.

Take a look at examples:

  • First
    def main():
        print("main")

    # cmt

    if __name__ == "__main__":
        main()
  • Second
    def main():
        print("main")

    a = 1
    # cmt

    if __name__ == "__main__":
        main()

I used the dump method (.help()) provided by RedBaron to get a representation of the internal tree structures:

  • Tree for the first case
0 -----------------------------------------------------
DefNode()
  # identifiers: def, def_, defnode, funcdef, funcdef_
  # default test value: name
  name=u'main'
  decorators ->
  arguments ->
  value ->
    * PrintNode()
        # identifiers: print, print_, printnode
        destination ->
          None
        value ->
          * AssociativeParenthesisNode()
              # identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
              value ->
                StringNode()
                  # identifiers: string, string_, stringnode
                  value=u'"main"'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=''
    * CommentNode()
        # identifiers: comment, comment_, commentnode
        value=u'# cmt'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=''
1 -----------------------------------------------------
IfelseblockNode()
  # identifiers: ifelseblock, ifelseblock_, ifelseblocknode
  value ->
    * IfNode()
        # identifiers: if, if_, ifnode
        test ->
          ComparisonNode()
            # identifiers: comparison, comparison_, comparisonnode
            first ->
              NameNode()
                # identifiers: name, name_, namenode
                value=u'__name__'
            value ->
              ComparisonOperatorNode()
                # identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
                first=u'=='
                second=''
            second ->
              StringNode()
                # identifiers: string, string_, stringnode
                value=u'"__main__"'
        value ->
          * AtomtrailersNode()
              # identifiers: atomtrailers, atomtrailers_, atomtrailersnode
              value ->
                * NameNode()
                    # identifiers: name, name_, namenode
                    value=u'main'
                * CallNode()
                    # identifiers: call, call_, callnode
                    value ->
None
def main():
    print("main")

# cmt

if __name__ == "__main__":
    main()

  • Tree for the second case
0 -----------------------------------------------------
DefNode()
  # identifiers: def, def_, defnode, funcdef, funcdef_
  # default test value: name
  name=u'main'
  decorators ->
  arguments ->
  value ->
    * PrintNode()
        # identifiers: print, print_, printnode
        destination ->
          None
        value ->
          * AssociativeParenthesisNode()
              # identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
              value ->
                StringNode()
                  # identifiers: string, string_, stringnode
                  value=u'"main"'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=''
1 -----------------------------------------------------
AssignmentNode()
  # identifiers: assign, assignment, assignment_, assignmentnode
  operator=''
  target ->
    NameNode()
      # identifiers: name, name_, namenode
      value=u'a'
  value ->
    IntNode()
      # identifiers: int, int_, intnode
      value=u'1'
2 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
3 -----------------------------------------------------
CommentNode()
  # identifiers: comment, comment_, commentnode
  value=u'# cmt'
4 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
5 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
6 -----------------------------------------------------
IfelseblockNode()
  # identifiers: ifelseblock, ifelseblock_, ifelseblocknode
  value ->
    * IfNode()
        # identifiers: if, if_, ifnode
        test ->
          ComparisonNode()
            # identifiers: comparison, comparison_, comparisonnode
            first ->
              NameNode()
                # identifiers: name, name_, namenode
                value=u'__name__'
            value ->
              ComparisonOperatorNode()
                # identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
                first=u'=='
                second=''
            second ->
              StringNode()
                # identifiers: string, string_, stringnode
                value=u'"__main__"'
        value ->
          * AtomtrailersNode()
              # identifiers: atomtrailers, atomtrailers_, atomtrailersnode
              value ->
                * NameNode()
                    # identifiers: name, name_, namenode
                    value=u'main'
                * CallNode()
                    # identifiers: call, call_, callnode
                    value ->
None
def main():
    print("main")

a = 1
# cmt

if __name__ == "__main__":
    main()

In the first excerpt, what we see is not what we get: CommentNode is not in the right position. The right position in the first example is outside the DefNode of main(). Nevertheless, the source has been dumped (.dumps()) correctly.

Let me show you another example. What if I comment out a = 1:

0 -----------------------------------------------------
DefNode()
  # identifiers: def, def_, defnode, funcdef, funcdef_
  # default test value: name
  name=u'main'
  decorators ->
  arguments ->
  value ->
    * PrintNode()
        # identifiers: print, print_, printnode
        destination ->
          None
        value ->
          * AssociativeParenthesisNode()
              # identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
              value ->
                StringNode()
                  # identifiers: string, string_, stringnode
                  value=u'"main"'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=''
    * CommentNode()
        # identifiers: comment, comment_, commentnode
        value=u'# a = 1'
    * CommentNode()
        # identifiers: comment, comment_, commentnode
        value=u'# cmt'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=''
1 -----------------------------------------------------
IfelseblockNode()
  # identifiers: ifelseblock, ifelseblock_, ifelseblocknode
  value ->
    * IfNode()
        # identifiers: if, if_, ifnode
        test ->
          ComparisonNode()
            # identifiers: comparison, comparison_, comparisonnode
            first ->
              NameNode()
                # identifiers: name, name_, namenode
                value=u'__name__'
            value ->
              ComparisonOperatorNode()
                # identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
                first=u'=='
                second=''
            second ->
              StringNode()
                # identifiers: string, string_, stringnode
                value=u'"__main__"'
        value ->
          * AtomtrailersNode()
              # identifiers: atomtrailers, atomtrailers_, atomtrailersnode
              value ->
                * NameNode()
                    # identifiers: name, name_, namenode
                    value=u'main'
                * CallNode()
                    # identifiers: call, call_, callnode
                    value ->
None
def main():
    print("main")

# a = 1
# cmt

if __name__ == "__main__":
    main()

Here we have the same issue as in the first case. Also note that there is no EndlNode between the two CommentNodes: it is missing.

Let’s look at Baron’s FST for these cases:

  • Baron’s FST for the first case
[{'arguments': [],
  'decorators': [],
  'fifth_formatting': [],
  'first_formatting': [{'type': 'space', 'value': u' '}],
  'fourth_formatting': [],
  'name': u'main',
  'second_formatting': [],
  'sixth_formatting': [],
  'third_formatting': [],
  'type': 'def',
  'value': [{'formatting': [],
    'indent': u'    ',
    'type': 'endl',
    'value': u'\n'},
   {'destination': None,
    'destination_formatting': [],
    'formatting': [],
    'type': 'print',
    'value': [{'first_formatting': [],
      'fourth_formatting': [],
      'second_formatting': [],
      'third_formatting': [],
      'type': 'associative_parenthesis',
      'value': {'first_formatting': [],
       'second_formatting': [],
       'type': 'string',
       'value': u'"main"'}}]},
   {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
   {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]},
 {'formatting': [], 'type': 'comment', 'value': u'# cmt'},
 {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
 {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
 {'type': 'ifelseblock',
  'value': [{'first_formatting': [{'type': 'space', 'value': u' '}],
    'second_formatting': [],
    'test': {'first': {'type': 'name', 'value': u'__name__'},
     'first_formatting': [{'type': 'space', 'value': u' '}],
     'second': {'first_formatting': [],
      'second_formatting': [],
      'type': 'string',
      'value': u'"__main__"'},
     'second_formatting': [{'type': 'space', 'value': u' '}],
     'type': 'comparison',
     'value': {'first': u'==',
      'formatting': [],
      'second': '',
      'type': 'comparison_operator'}},
    'third_formatting': [],
    'type': 'if',
    'value': [{'formatting': [],
      'indent': u'    ',
      'type': 'endl',
      'value': u'\n'},
     {'type': 'atomtrailers',
      'value': [{'type': 'name', 'value': u'main'},
       {'first_formatting': [],
        'fourth_formatting': [],
        'second_formatting': [],
        'third_formatting': [],
        'type': 'call',
        'value': []}]},
     {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]}]}]

  • Baron’s FST for the second case
[{'arguments': [],
  'decorators': [],
  'fifth_formatting': [],
  'first_formatting': [{'type': 'space', 'value': u' '}],
  'fourth_formatting': [],
  'name': u'main',
  'second_formatting': [],
  'sixth_formatting': [],
  'third_formatting': [],
  'type': 'def',
  'value': [{'formatting': [],
    'indent': u'    ',
    'type': 'endl',
    'value': u'\n'},
   {'destination': None,
    'destination_formatting': [],
    'formatting': [],
    'type': 'print',
    'value': [{'first_formatting': [],
      'fourth_formatting': [],
      'second_formatting': [],
      'third_formatting': [],
      'type': 'associative_parenthesis',
      'value': {'first_formatting': [],
       'second_formatting': [],
       'type': 'string',
       'value': u'"main"'}}]},
   {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
   {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]},
 {'first_formatting': [{'type': 'space', 'value': u' '}],
  'operator': '',
  'second_formatting': [{'type': 'space', 'value': u' '}],
  'target': {'type': 'name', 'value': u'a'},
  'type': 'assignment',
  'value': {'section': 'number', 'type': 'int', 'value': u'1'}},
 {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
 {'formatting': [], 'type': 'comment', 'value': u'# cmt'},
 {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
 {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
 {'type': 'ifelseblock',
  'value': [{'first_formatting': [{'type': 'space', 'value': u' '}],
    'second_formatting': [],
    'test': {'first': {'type': 'name', 'value': u'__name__'},
     'first_formatting': [{'type': 'space', 'value': u' '}],
     'second': {'first_formatting': [],
      'second_formatting': [],
      'type': 'string',
      'value': u'"__main__"'},
     'second_formatting': [{'type': 'space', 'value': u' '}],
     'type': 'comparison',
     'value': {'first': u'==',
      'formatting': [],
      'second': '',
      'type': 'comparison_operator'}},
    'third_formatting': [],
    'type': 'if',
    'value': [{'formatting': [],
      'indent': u'    ',
      'type': 'endl',
      'value': u'\n'},
     {'type': 'atomtrailers',
      'value': [{'type': 'name', 'value': u'main'},
       {'first_formatting': [],
        'fourth_formatting': [],
        'second_formatting': [],
        'third_formatting': [],
        'type': 'call',
        'value': []}]},
     {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]}]}]
  • Baron’s FST for the second case (where I have commented out the assignment node):
[{'arguments': [],
  'decorators': [],
  'fifth_formatting': [],
  'first_formatting': [{'type': 'space', 'value': u' '}],
  'fourth_formatting': [],
  'name': u'main',
  'second_formatting': [],
  'sixth_formatting': [],
  'third_formatting': [],
  'type': 'def',
  'value': [{'formatting': [],
    'indent': u'    ',
    'type': 'endl',
    'value': u'\n'},
   {'destination': None,
    'destination_formatting': [],
    'formatting': [],
    'type': 'print',
    'value': [{'first_formatting': [],
      'fourth_formatting': [],
      'second_formatting': [],
      'third_formatting': [],
      'type': 'associative_parenthesis',
      'value': {'first_formatting': [],
       'second_formatting': [],
       'type': 'string',
       'value': u'"main"'}}]},
   {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
   {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]},
 {'formatting': [], 'type': 'comment', 'value': u'#a = 1'},
 {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
 {'formatting': [], 'type': 'comment', 'value': u'# cmt'},
 {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
 {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'},
 {'type': 'ifelseblock',
  'value': [{'first_formatting': [{'type': 'space', 'value': u' '}],
    'second_formatting': [],
    'test': {'first': {'type': 'name', 'value': u'__name__'},
     'first_formatting': [{'type': 'space', 'value': u' '}],
     'second': {'first_formatting': [],
      'second_formatting': [],
      'type': 'string',
      'value': u'"__main__"'},
     'second_formatting': [{'type': 'space', 'value': u' '}],
     'type': 'comparison',
     'value': {'first': u'==',
      'formatting': [],
      'second': '',
      'type': 'comparison_operator'}},
    'third_formatting': [],
    'type': 'if',
    'value': [{'formatting': [],
      'indent': u'    ',
      'type': 'endl',
      'value': u'\n'},
     {'type': 'atomtrailers',
      'value': [{'type': 'name', 'value': u'main'},
       {'first_formatting': [],
        'fourth_formatting': [],
        'second_formatting': [],
        'third_formatting': [],
        'type': 'call',
        'value': []}]},
     {'formatting': [], 'indent': '', 'type': 'endl', 'value': u'\n'}]}]}]

I was a little surprised, because everything seemed to be fine here. But the problem lay deeper.

During debugging I realized that RedBaron was already receiving a broken sequence of tokens. Consequently, the problem occurs between the two layers, and the dump function is not affected in any of these cases.

Let’s trace back the function calls to the place where (I think) the problem occurs:

__init__, redbaron.py:37 -> 
parse,baron.py:49 -> 
tokenize, baron.py:70 -> 
group, formatting_grouper.py:108 -> 
group_generator, formatting_grouper.py:118

The code inside the tokenize function had to be refactored so that I could set a breakpoint.

Before a space_group function call:

00 = {tuple} <type 'tuple'>: (u'DEF', u'def')
01 = {tuple} <type 'tuple'>: ('SPACE', u' ')
02 = {tuple} <type 'tuple'>: ('NAME', u'main')
03 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
04 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
05 = {tuple} <type 'tuple'>: ('COLON', u':')
06 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
07 = {tuple} <type 'tuple'>: ('SPACE', u'    ')
08 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
09 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
10 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
11 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: ('COMMENT', u'# cmt')
15 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
16 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
17 = {tuple} <type 'tuple'>: (u'IF', u'if')
18 = {tuple} <type 'tuple'>: ('SPACE', u' ')
19 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
20 = {tuple} <type 'tuple'>: ('SPACE', u' ')
21 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==')
22 = {tuple} <type 'tuple'>: ('SPACE', u' ')
23 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
24 = {tuple} <type 'tuple'>: ('COLON', u':')
25 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
26 = {tuple} <type 'tuple'>: ('SPACE', u'    ')
27 = {tuple} <type 'tuple'>: ('NAME', u'main')
28 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
29 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
30 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
31 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
  • After a space_group function call
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
06 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
07 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
08 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
09 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
10 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
15 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
16 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
17 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
18 = {tuple} <type 'tuple'>: ('COLON', u':')
19 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
20 = {tuple} <type 'tuple'>: ('NAME', u'main')
21 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
22 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
23 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
24 = {tuple} <type 'tuple'>: ('ENDMARKER', '')

And I’ve repeated it again for the second case:

  • Before space_group function call:
00 = {tuple} <type 'tuple'>: (u'DEF', u'def')
01 = {tuple} <type 'tuple'>: ('SPACE', u' ')
02 = {tuple} <type 'tuple'>: ('NAME', u'main')
03 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
04 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
05 = {tuple} <type 'tuple'>: ('COLON', u':')
06 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
07 = {tuple} <type 'tuple'>: ('SPACE', u'    ')
08 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
09 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
10 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
11 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: ('NAME', u'a')
15 = {tuple} <type 'tuple'>: ('SPACE', u' ')
16 = {tuple} <type 'tuple'>: ('EQUAL', u'=')
17 = {tuple} <type 'tuple'>: ('SPACE', u' ')
18 = {tuple} <type 'tuple'>: ('INT', u'1')
19 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
20 = {tuple} <type 'tuple'>: ('COMMENT', u'# cmt')
21 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
22 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
23 = {tuple} <type 'tuple'>: (u'IF', u'if')
24 = {tuple} <type 'tuple'>: ('SPACE', u' ')
25 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
26 = {tuple} <type 'tuple'>: ('SPACE', u' ')
27 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==')
28 = {tuple} <type 'tuple'>: ('SPACE', u' ')
29 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
30 = {tuple} <type 'tuple'>: ('COLON', u':')
31 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
32 = {tuple} <type 'tuple'>: ('SPACE', u'    ')
33 = {tuple} <type 'tuple'>: ('NAME', u'main')
34 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
35 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
36 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
37 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
  • After space_group function call:
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
06 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
07 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
08 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
09 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
10 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
12 = {tuple} <type 'tuple'>: ('NAME', u'a')
13 = {tuple} <type 'tuple'>: ('EQUAL', u'=', [('SPACE', u' ')], [('SPACE', u' ')])
14 = {tuple} <type 'tuple'>: ('INT', u'1')
15 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
16 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
17 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
18 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
19 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
20 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
21 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
22 = {tuple} <type 'tuple'>: ('COLON', u':')
23 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
24 = {tuple} <type 'tuple'>: ('NAME', u'main')
25 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
26 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
27 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
28 = {tuple} <type 'tuple'>: ('ENDMARKER', '')

In both excerpts, the COMMENT has been attached to an ENDL (surprise!).

In the next steps, I got the following results from two calls, inner_group() and mark_indentation(), for the second case:

  • inner_group()
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
06 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
07 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
08 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
09 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
10 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
12 = {tuple} <type 'tuple'>: ('NAME', u'a')
13 = {tuple} <type 'tuple'>: ('EQUAL', u'=', [('SPACE', u' ')], [('SPACE', u' ')])
14 = {tuple} <type 'tuple'>: ('INT', u'1')
15 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
16 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
17 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
18 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
19 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
20 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
21 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
22 = {tuple} <type 'tuple'>: ('COLON', u':')
23 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
24 = {tuple} <type 'tuple'>: ('NAME', u'main')
25 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
26 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
27 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
28 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
  • mark_indentation()
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
06 = {tuple} <type 'tuple'>: ('INDENT', '')
07 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
08 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
09 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
10 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('DEDENT', '')
14 = {tuple} <type 'tuple'>: ('NAME', u'a')
15 = {tuple} <type 'tuple'>: ('EQUAL', u'=', [('SPACE', u' ')], [('SPACE', u' ')])
16 = {tuple} <type 'tuple'>: ('INT', u'1')
17 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
18 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
19 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
20 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
21 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
22 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
23 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
24 = {tuple} <type 'tuple'>: ('COLON', u':')
25 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
26 = {tuple} <type 'tuple'>: ('INDENT', '')
27 = {tuple} <type 'tuple'>: ('NAME', u'main')
28 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
29 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
30 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
31 = {tuple} <type 'tuple'>: ('DEDENT', '')
32 = {tuple} <type 'tuple'>: ('ENDMARKER', '')

Although the COMMENT was attached to an ENDL, everything looks fine. OK, but what about the first case:

  • inner_group()
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
06 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
07 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
08 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
09 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
10 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
15 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
16 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
17 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
18 = {tuple} <type 'tuple'>: ('COLON', u':')
19 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
20 = {tuple} <type 'tuple'>: ('NAME', u'main')
21 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
22 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
23 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
24 = {tuple} <type 'tuple'>: ('ENDMARKER', '')
  • mark_indentation()
00 = {tuple} <type 'tuple'>: (u'DEF', u'def', [], [('SPACE', u' ')])
01 = {tuple} <type 'tuple'>: ('NAME', u'main')
02 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
03 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
04 = {tuple} <type 'tuple'>: ('COLON', u':')
05 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
06 = {tuple} <type 'tuple'>: ('INDENT', '')
07 = {tuple} <type 'tuple'>: (u'PRINT', u'print')
08 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
09 = {tuple} <type 'tuple'>: ('STRING', u'"main"')
10 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
11 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
12 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('COMMENT', u'# cmt')])
13 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
14 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
15 = {tuple} <type 'tuple'>: ('DEDENT', '')
16 = {tuple} <type 'tuple'>: (u'IF', u'if', [], [('SPACE', u' ')])
17 = {tuple} <type 'tuple'>: ('NAME', u'__name__')
18 = {tuple} <type 'tuple'>: ('EQUAL_EQUAL', u'==', [('SPACE', u' ')], [('SPACE', u' ')])
19 = {tuple} <type 'tuple'>: ('STRING', u'"__main__"')
20 = {tuple} <type 'tuple'>: ('COLON', u':')
21 = {tuple} <type 'tuple'>: ('ENDL', u'\n', [], [('SPACE', u'    ')])
22 = {tuple} <type 'tuple'>: ('INDENT', '')
23 = {tuple} <type 'tuple'>: ('NAME', u'main')
24 = {tuple} <type 'tuple'>: ('LEFT_PARENTHESIS', u'(')
25 = {tuple} <type 'tuple'>: ('RIGHT_PARENTHESIS', u')')
26 = {tuple} <type 'tuple'>: ('ENDL', u'\n')
27 = {tuple} <type 'tuple'>: ('DEDENT', '')
28 = {tuple} <type 'tuple'>: ('ENDMARKER', '')

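I got it! The DEDENT has not been placed correctly: it is emitted only after the ENDL that carries the COMMENT, so the comment ends up inside the body of main() instead of at the top level.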

How to fix it?

There were two ways to fix it: fix mark_indentation or fix space_group.

I reviewed both options and picked the space_group function, because mark_indentation just takes the sequence that space_group has already modified. Besides, attaching a COMMENT node inside an ENDL does not seem like a good choice.

From my point of view, CommentNode is an independent element of the tree. It is not relevant to the code if we would like to implement an interpreter. But for the purposes of analysis, we have to carry all helpful information from the lexer level (in a hidden layer) up to the parser level and the AST.

My fix is presented below (https://github.com/rojaster/baron/commit/7a2cf9e79862cf0f0a10ef3c74a6fcf704251175):

  • formatting_grouper.py
    def group_generator(sequence):
    ....
      markers = ("SPACE") # ("SPACE","COMMENT") it fixes a CommentNode position
      while True:
    ....
          if current[0] in markers and iterator.show_next() and iterator.show_next()[0] in GROUP_SPACE_BEFORE:
    ....
    

Code must be as simple as possible, but not so simple as to be stupid. That is what we call experience, skill, art.

And as a result I got the following:

  • Samples:
t_buffer = textwrap.dedent(u"""\
    def main():
        print("main")

    # cmt

    if __name__ == "__main__":
        main()
""")

t_buffer0 = textwrap.dedent(u"""\
    def main():
        print("main")

    a = 1
    # cmt

    if __name__ == "__main__":
        main()
""")

t_buffer1 = textwrap.dedent(u"""\
    #!/usr/bin/env python
    #! -*- coding: utf-8 -*-
    def main():
        # comment before
        print("main") # inline comment

        #comment after

    # a = 1
    # cmt

    if __name__ == "__main__":
        main()
""")

rbt_buffer = RedBaron(t_buffer)
rbt_buffer0 = RedBaron(t_buffer0)
rbt_buffer1 = RedBaron(t_buffer1)

print(rbt_buffer.help(deep=True))
print(rbt_buffer.dumps())
print("\n")

print("0"*100)

print(rbt_buffer0.help(deep=True))
print(rbt_buffer0.dumps())
print("\n")

print("1"*100)
print(rbt_buffer1.help(deep=True))
print(rbt_buffer1.dumps())
print("\n")
  • Samples trees
0 -----------------------------------------------------
DefNode()
  # identifiers: def, def_, defnode, funcdef, funcdef_
  # default test value: name
  name=u'main'
  decorators ->
  arguments ->
  value ->
    * PrintNode()
        # identifiers: print, print_, printnode
        destination ->
          None
        value ->
          * AssociativeParenthesisNode()
              # identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
              value ->
                StringNode()
                  # identifiers: string, string_, stringnode
                  value=u'"main"'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=''
1 -----------------------------------------------------
CommentNode()
  # identifiers: comment, comment_, commentnode
  value=u'# cmt'
2 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
3 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
4 -----------------------------------------------------
IfelseblockNode()
  # identifiers: ifelseblock, ifelseblock_, ifelseblocknode
  value ->
    * IfNode()
        # identifiers: if, if_, ifnode
        test ->
          ComparisonNode()
            # identifiers: comparison, comparison_, comparisonnode
            first ->
              NameNode()
                # identifiers: name, name_, namenode
                value=u'__name__'
            value ->
              ComparisonOperatorNode()
                # identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
                first=u'=='
                second=''
            second ->
              StringNode()
                # identifiers: string, string_, stringnode
                value=u'"__main__"'
        value ->
          * AtomtrailersNode()
              # identifiers: atomtrailers, atomtrailers_, atomtrailersnode
              value ->
                * NameNode()
                    # identifiers: name, name_, namenode
                    value=u'main'
                * CallNode()
                    # identifiers: call, call_, callnode
                    value ->
None
def main():
    print("main")

# cmt

if __name__ == "__main__":
    main()



0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0 -----------------------------------------------------
DefNode()
  # identifiers: def, def_, defnode, funcdef, funcdef_
  # default test value: name
  name=u'main'
  decorators ->
  arguments ->
  value ->
    * PrintNode()
        # identifiers: print, print_, printnode
        destination ->
          None
        value ->
          * AssociativeParenthesisNode()
              # identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
              value ->
                StringNode()
                  # identifiers: string, string_, stringnode
                  value=u'"main"'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=''
1 -----------------------------------------------------
AssignmentNode()
  # identifiers: assign, assignment, assignment_, assignmentnode
  operator=''
  target ->
    NameNode()
      # identifiers: name, name_, namenode
      value=u'a'
  value ->
    IntNode()
      # identifiers: int, int_, intnode
      value=u'1'
2 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
3 -----------------------------------------------------
CommentNode()
  # identifiers: comment, comment_, commentnode
  value=u'# cmt'
4 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
5 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
6 -----------------------------------------------------
IfelseblockNode()
  # identifiers: ifelseblock, ifelseblock_, ifelseblocknode
  value ->
    * IfNode()
        # identifiers: if, if_, ifnode
        test ->
          ComparisonNode()
            # identifiers: comparison, comparison_, comparisonnode
            first ->
              NameNode()
                # identifiers: name, name_, namenode
                value=u'__name__'
            value ->
              ComparisonOperatorNode()
                # identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
                first=u'=='
                second=''
            second ->
              StringNode()
                # identifiers: string, string_, stringnode
                value=u'"__main__"'
        value ->
          * AtomtrailersNode()
              # identifiers: atomtrailers, atomtrailers_, atomtrailersnode
              value ->
                * NameNode()
                    # identifiers: name, name_, namenode
                    value=u'main'
                * CallNode()
                    # identifiers: call, call_, callnode
                    value ->
None
def main():
    print("main")

a = 1
# cmt

if __name__ == "__main__":
    main()



1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
0 -----------------------------------------------------
CommentNode()
  # identifiers: comment, comment_, commentnode
  value=u'#!/usr/bin/env python'
1 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
2 -----------------------------------------------------
CommentNode()
  # identifiers: comment, comment_, commentnode
  value=u'#! -*- coding: utf-8 -*-'
3 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
4 -----------------------------------------------------
DefNode()
  # identifiers: def, def_, defnode, funcdef, funcdef_
  # default test value: name
  name=u'main'
  decorators ->
  arguments ->
  value ->
    * CommentNode()
        # identifiers: comment, comment_, commentnode
        value=u'# comment before'
    * PrintNode()
        # identifiers: print, print_, printnode
        destination ->
          None
        value ->
          * AssociativeParenthesisNode()
              # identifiers: associative_parenthesis, associative_parenthesis_, associativeparenthesis, associativeparenthesisnode
              value ->
                StringNode()
                  # identifiers: string, string_, stringnode
                  value=u'"main"'
    * CommentNode()
        # identifiers: comment, comment_, commentnode
        value=u'# inline comment'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=u'    '
    * CommentNode()
        # identifiers: comment, comment_, commentnode
        value=u'#comment after'
    * EndlNode()
        # identifiers: endl, endl_, endlnode
        value=u'\n'
        indent=''
5 -----------------------------------------------------
CommentNode()
  # identifiers: comment, comment_, commentnode
  value=u'# a = 1'
6 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
7 -----------------------------------------------------
CommentNode()
  # identifiers: comment, comment_, commentnode
  value=u'# cmt'
8 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
9 -----------------------------------------------------
EndlNode()
  # identifiers: endl, endl_, endlnode
  value=u'\n'
  indent=''
10 -----------------------------------------------------
IfelseblockNode()
  # identifiers: ifelseblock, ifelseblock_, ifelseblocknode
  value ->
    * IfNode()
        # identifiers: if, if_, ifnode
        test ->
          ComparisonNode()
            # identifiers: comparison, comparison_, comparisonnode
            first ->
              NameNode()
                # identifiers: name, name_, namenode
                value=u'__name__'
            value ->
              ComparisonOperatorNode()
                # identifiers: comparison_operator, comparison_operator_, comparisonoperator, comparisonoperatornode
                first=u'=='
                second=''
            second ->
              StringNode()
                # identifiers: string, string_, stringnode
                value=u'"__main__"'
        value ->
          * AtomtrailersNode()
              # identifiers: atomtrailers, atomtrailers_, atomtrailersnode
              value ->
                * NameNode()
                    # identifiers: name, name_, namenode
                    value=u'main'
                * CallNode()
                    # identifiers: call, call_, callnode
                    value ->
None
#!/usr/bin/env python
#! -*- coding: utf-8 -*-
def main():
    # comment before
    print("main") # inline comment

    #comment after

# a = 1
# cmt

if __name__ == "__main__":
    main()




Process finished with exit code 0

Looks correct. Some questions still remain, but it’s a kind of hack or patch, and it doesn’t break the existing tests. Let’s call it a solution.<br/> Thus the result satisfies my expectations and requirements.<br/> Thank you for your time and for reading, dear reader!