antlr · teverett · Jul 7, 2020 · Jul 7, 2020 · Jul 7, 2020 · Jul 7, 2020
diff --git a/...r4-formatter-standalone/src/main/java/com/khubla/antlr4formatter/StandaloneFormatter.java b/...r4-formatter-standalone/src/main/java/com/khubla/antlr4formatter/StandaloneFormatter.java
@@ -66,6 +66,5 @@ private static void showHelpAndExit(Options options) {
       System.exit(1);
    }
 
-   private StandaloneFormatter() {
-   }
+   private StandaloneFormatter() {}
 }
diff --git a/antlr4-formatter/src/main/antlr4/ANTLRv4Lexer.g4 b/antlr4-formatter/src/main/antlr4/ANTLRv4Lexer.g4
@@ -40,13 +40,19 @@ lexer grammar ANTLRv4Lexer;
 options { superClass = LexerAdaptor; }
 import LexBasic;
 // Standard set of fragments
+
 tokens { TOKEN_REF , RULE_REF , LEXER_CHAR_SET }
 channels { OFF_CHANNEL , COMMENT }
 // ======================================================
+
 // Lexer specification
+
 //
+
 // -------------------------
+
 // Comments
+
 DOC_COMMENT
    : DocComment
    ;
@@ -59,20 +65,28 @@ LINE_COMMENT
    : LineComment -> channel (COMMENT)
    ;
    // -------------------------
+
    // Integer
+
    //
-
+   
 INT
    : DecimalNumeral
    ;
    // -------------------------
+
    // Literal string
+
    //
+
    // ANTLR makes no distinction between a single character literal and a
+
    // multi-character string. All literals are single quote delimited and
+
    // may contain unicode escape sequences of the form \uxxxx, where x
+
    // is a valid hexadecimal number (per Unicode standard).
-
+   
 STRING_LITERAL
    : SQuoteLiteral
    ;
@@ -81,29 +95,40 @@ UNTERMINATED_STRING_LITERAL
    : USQuoteLiteral
    ;
    // -------------------------
+
    // Arguments
+
    //
+
    // Certain argument lists, such as those specifying call parameters
+
    // to a rule invocation, or input parameters to a rule specification
+
    // are contained within square brackets.
-
+   
 BEGIN_ARGUMENT
    : LBrack
    { handleBeginArgument(); }
    ;
    // -------------------------
+
    // Actions
-
+   
 BEGIN_ACTION
    : LBrace -> pushMode (Action)
    ;
    // -------------------------
+
    // Keywords
+
    //
+
    // Keywords may not be used as labels for rules or in any other context where
+
    // they would be ambiguous with the keyword vs some other identifier.  OPTIONS,
+
    // TOKENS, & CHANNELS blocks are handled idiomatically in dedicated lexical modes.
-
+   
 OPTIONS
    : 'options' -> pushMode (Options)
    ;
@@ -172,8 +197,9 @@ MODE
    : 'mode'
    ;
    // -------------------------
+
    // Punctuation
-
+   
 COLON
    : Colon
    ;
@@ -266,40 +292,57 @@ NOT
    : Tilde
    ;
    // -------------------------
+
    // Identifiers - allows unicode rule/token names
-
+   
 ID
    : Id
    ;
    // -------------------------
+
    // Whitespace
-
+   
 WS
    : Ws+ -> channel (OFF_CHANNEL)
    ;
    // -------------------------
+
    // Illegal Characters
+
    //
+
    // This is an illegal character trap which is always the last rule in the
+
    // lexer specification. It matches a single character of any value and being
+
    // the last rule in the file will match when no other rule knows what to do
+
    // about the character. It is reported as an error but is not passed on to the
+
    // parser. This means that the parser has to deal with the grammar file anyway
+
    // but we will not try to analyse or code generate from a file with lexical
+
    // errors.
+
    //
+
    // Comment this rule out to allow the error to be propagated to the parser
-
+   
 ERRCHAR
    : . -> channel (HIDDEN)
    ;
    // ======================================================
+
    // Lexer modes
+
    // -------------------------
+
    // Arguments
-
+   
 mode Argument;
 // E.g., [int x, List<String> a[]]
+
 NESTED_ARGUMENT
    : LBrack -> type (ARGUMENT_CONTENT) , pushMode (Argument)
    ;
@@ -321,7 +364,7 @@ END_ARGUMENT
    { handleEndArgument(); }
    ;
    // added this to return non-EOF token type here. EOF does something weird
-
+   
 UNTERMINATED_ARGUMENT
    : EOF -> popMode
    ;
@@ -330,15 +373,23 @@ ARGUMENT_CONTENT
    : .
    ;
    // -------------------------
+
    // Actions
+
    //
+
    // Many language targets use {} as block delimiters and so we
+
    // must recursively match {} delimited blocks to balance the
+
    // braces. Additionally, we must make some assumptions about
+
    // literal string representation in the target language. We assume
+
    // that they are delimited by ' or " and so consume these
+
    // in their own alts so as not to inadvertently match {}.
-
+   
 mode Action;
 NESTED_ACTION
    : LBrace -> type (ACTION_CONTENT) , pushMode (Action)
@@ -381,7 +432,7 @@ ACTION_CONTENT
    : .
    ;
    // -------------------------
-
+   
 mode Options;
 OPT_DOC_COMMENT
    : DocComment -> type (DOC_COMMENT) , channel (COMMENT)
@@ -435,7 +486,7 @@ OPT_WS
    : Ws+ -> type (WS) , channel (OFF_CHANNEL)
    ;
    // -------------------------
-
+   
 mode Tokens;
 TOK_DOC_COMMENT
    : DocComment -> type (DOC_COMMENT) , channel (COMMENT)
@@ -473,9 +524,10 @@ TOK_WS
    : Ws+ -> type (WS) , channel (OFF_CHANNEL)
    ;
    // -------------------------
-
+   
 mode Channels;
 // currently same as Tokens mode; distinguished by keyword
+
 CHN_DOC_COMMENT
    : DocComment -> type (DOC_COMMENT) , channel (COMMENT)
    ;
@@ -512,7 +564,7 @@ CHN_WS
    : Ws+ -> type (WS) , channel (OFF_CHANNEL)
    ;
    // -------------------------
-
+   
 mode LexerCharSet;
 LEXER_CHAR_SET_BODY
    : (~ [\]\\] | EscAny)+ -> more
@@ -526,8 +578,9 @@ UNTERMINATED_CHAR_SET
    : EOF -> popMode
    ;
    // ------------------------------------------------------------------------------
+
    // Grammar specific Keywords, Punctuation, etc.
-
+   
 fragment Id
    : NameStartChar NameChar*
    ;