From 6074e38a0769cfdd5051aec41e7f8001787dd2e8 Mon Sep 17 00:00:00 2001
From: Abdelrahman Aly Abounegm <abounegm.abdelrahman@gmail.com>
Date: Sun, 14 Jul 2024 04:18:23 +0300
Subject: [PATCH 01/14] Add indentation-aware TokenBuilder and Lexer

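Add IndentationAwareTokenBuilder, which emits synthetic indent/dedent
tokens based on the leading whitespace of each line, and
IndentationAwareLexer, which resets the builder's indentation stack after
each tokenization and appends any dedent tokens still pending at the end
of the input.

This is useful for languages where indentation is used as a block
delimiter, such as Python.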
---
 packages/langium/src/parser/lexer.ts         |  40 +++
 packages/langium/src/parser/token-builder.ts | 259 +++++++++++++++++++
 2 files changed, 299 insertions(+)

diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index eec23ea12..33296acc3 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -6,6 +6,7 @@
 
 import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain';
 import type { LangiumCoreServices } from '../services.js';
+import { IndentationAwareTokenBuilder } from "./token-builder.js";
 import { Lexer as ChevrotainLexer } from 'chevrotain';
 
 export interface LexerResult {
@@ -66,6 +67,45 @@ export class DefaultLexer implements Lexer {
     }
 }
 
+/**
+ * A lexer that is aware of indentation in the input text.
+ * The only purpose of this lexer is to reset the internal state of the {@link IndentationAwareTokenBuilder}
+ * between the tokenization of different text inputs.
+ *
+ * In your module, you can override the default lexer with this one as such:
+ * ```ts
+ * parser: {
+ *    TokenBuilder: () => new IndentationAwareTokenBuilder(),
+ *    Lexer: (services) => new IndentationAwareLexer(services),
+ * }
+ * ```
+ */
+export class IndentationAwareLexer extends DefaultLexer {
+    private indentationTokenBuilder?: IndentationAwareTokenBuilder;
+
+    constructor(services: LangiumCoreServices) {
+        super(services);
+        if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) {
+            this.indentationTokenBuilder = services.parser.TokenBuilder;
+        }
+    }
+
+    override tokenize(text: string): LexerResult {
+        const result = super.tokenize(text);
+
+        if (!this.indentationTokenBuilder) {
+            // A token builder other than the expected IndentationAwareTokenBuilder is used
+            return result;
+        }
+
+        // reset the indent stack between processing of different text inputs
+        const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
+        result.tokens.push(...remainingDedents);
+
+        return result;
+    }
+}
+
 /**
  * Returns a check whether the given TokenVocabulary is TokenType array
  */
diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index 4320c5ca6..79d59d8b2 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -118,3 +118,262 @@ export class DefaultTokenBuilder implements TokenBuilder {
         }, []);
     }
 }
+
+
+export interface IndentationTokenBuilderOptions {
+    /**
+     * The name of the token used to denote indentation in the grammar.
+     * A possible definition in the grammar could look like this:
+     * ```langium
+     * terminal INDENT: ':synthetic-indent:';
+     * ```
+     *
+     * @default "INDENT"
+     */
+    indentTokenName: string;
+    /**
+     * The name of the token used to denote deindentation in the grammar.
+     * A possible definition in the grammar could look like this:
+     * ```langium
+     * terminal DEDENT: ':synthetic-dedent:';
+     * ```
+     *
+     * @default "DEDENT"
+     */
+    dedentTokenName: string;
+    /**
+     * The name of the token used to denote whitespace other than indentation and newlines in the grammar.
+     * A possible definition in the grammar could look like this:
+     * ```langium
+     * hidden terminal WS: /[ \t]+/;
+     * ```
+     *
+     * @default "WS"
+     */
+    whitespaceTokenName: string;
+}
+
+const indetationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
+    indentTokenName: 'INDENT',
+    dedentTokenName: 'DEDENT',
+    whitespaceTokenName: 'WS',
+};
+
+/**
+ * A token builder that is sensitive to indentation in the input text.
+ * It will generate tokens for indentation and dedentation based on the indentation level.
+ *
+ * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
+ */
+export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
+    private indentationStack: number[] = [0];
+    private options: IndentationTokenBuilderOptions;
+
+    /**
+     * The token type to be used for indentation tokens
+     */
+    protected indentTokenType: TokenType;
+
+    /**
+     * The token type to be used for dedentation tokens
+     */
+    protected dedentTokenType: TokenType;
+
+    /**
+     * A regular expression to match a series of tabs and/or spaces.
+     * Override this to customize what the indentation is allowed to consist of.
+     */
+    protected whitespaceRegExp = /[ \t]+/y;
+
+    constructor(options: Partial<IndentationTokenBuilderOptions> = indetationBuilderDefaultOptions) {
+        super();
+        this.options = {
+            ...indetationBuilderDefaultOptions,
+            ...options,
+        };
+
+        this.indentTokenType = createToken({
+            name: this.options.indentTokenName,
+            pattern: this.indentMatcher,
+            line_breaks: false,
+        });
+
+        this.dedentTokenType = createToken({
+            name: this.options.dedentTokenName,
+            pattern: this.dedentMatcher,
+            line_breaks: false,
+        });
+    }
+
+    override buildTokens(grammar: GrammarAST.Grammar, options?: TokenBuilderOptions | undefined) {
+        const tokenTypes = super.buildTokens(grammar, options);
+        if (!isTokenTypeArray(tokenTypes)) {
+            throw new Error('Invalid tokens built by default builder');
+        }
+
+        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
+
+        // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
+        // Order should be: dedent, indent, spaces
+        const dedent = tokenTypes.find(tok => tok.name === dedentTokenName);
+        const indent = tokenTypes.find(tok => tok.name === indentTokenName);
+        const ws = tokenTypes.find(tok => tok.name === whitespaceTokenName);
+        if (!dedent || !indent || !ws) {
+            throw new Error('Some indentation/whitespace tokens not found!');
+        }
+
+        const spaceTokens = [dedent, indent, ws];
+        const otherTokens = tokenTypes.filter(tok => !spaceTokens.includes(tok));
+        return [...spaceTokens, ...otherTokens];
+    }
+
+    private isStartOfLine(text: string, offset: number): boolean {
+        return offset === 0 || '\r\n'.includes(text[offset - 1]);
+    }
+
+    private matchWhitespace(text: string, offset: number) {
+        this.whitespaceRegExp.lastIndex = offset;
+        const match = this.whitespaceRegExp.exec(text);
+        return {
+            currIndentLevel: match?.[0].length ?? 0,
+            prevIndentLevel: this.indentationStack.at(-1)!,
+            match,
+        };
+    }
+
+    private createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
+        const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
+        return createTokenInstance(
+            tokenType,
+            image,
+            offset, offset + image.length,
+            lineNumber, lineNumber,
+            0, image.length,
+        );
+    }
+
+    /**
+     * A custom pattern for matching indents
+     *
+     * @param text The full input string.
+     * @param offset The offset at which to attempt a match
+     * @param tokens Previously scanned Tokens
+     * @param groups Token Groups
+     */
+    protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, groups) => {
+        const {indentTokenName} = this.options;
+
+        if (!this.isStartOfLine(text, offset)) {
+            return null;
+        }
+
+        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);
+
+        if (currIndentLevel <= prevIndentLevel) {
+            // shallower indentation (should be matched by dedent)
+            // or same indentation level (should be matched by whitespace and ignored)
+            return null;
+        }
+
+        this.indentationStack.push(currIndentLevel);
+
+        const indentToken = this.createIndentationTokenInstance(
+            this.indentTokenType,
+            text,
+            match?.[0] ?? indentTokenName,
+            offset,
+        );
+        tokens.push(indentToken);
+
+        // Token already added, let the indentation now be consumed as whitespace and ignored
+        return null;
+    };
+
+    /**
+     * A custom pattern for matching dedents
+     *
+     * @param text The full input string.
+     * @param offset The offset at which to attempt a match
+     * @param tokens Previously scanned Tokens
+     * @param groups Token Groups
+     */
+    protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, groups) => {
+        const {dedentTokenName} = this.options;
+
+        if (!this.isStartOfLine(text, offset)) {
+            return null;
+        }
+
+        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);
+
+        if (currIndentLevel >= prevIndentLevel) {
+            // bigger indentation (should be matched by indent)
+            // or same indentation level (should be matched by whitespace and ignored)
+            return null;
+        }
+
+        const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel);
+
+        // Any dedent must match some previous indentation level.
+        if (matchIndentIndex === -1) {
+            console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`);
+            // throwing an error would crash the language server
+            // TODO: find a way to report error diagnostics message
+            return null;
+        }
+
+        const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1;
+
+        for (let i = 0; i < numberOfDedents; i++) {
+            const token = this.createIndentationTokenInstance(
+                this.dedentTokenType,
+                text,
+                match?.[0] ?? dedentTokenName,
+                offset,
+            );
+            tokens.push(token);
+            this.indentationStack.pop();
+        }
+
+        // Token already added, let the dedentation now be consumed as whitespace and ignored
+        return null;
+    };
+
+    protected override buildTerminalToken(terminal: GrammarAST.TerminalRule): TokenType {
+        const tokenType = super.buildTerminalToken(terminal);
+        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
+
+        if (tokenType.name === indentTokenName) {
+            return this.indentTokenType;
+        } else if (tokenType.name === dedentTokenName) {
+            return this.dedentTokenType;
+        } else if (tokenType.name === whitespaceTokenName) {
+            return createToken({
+                name: whitespaceTokenName,
+                pattern: this.whitespaceRegExp,
+                group: Lexer.SKIPPED,
+            });
+        }
+
+        return tokenType;
+    }
+
+    /**
+     * Resets the indentation stack between different runs of the lexer
+     *
+     * @param text Full text that was tokenized
+     * @returns Remaining dedent tokens to match all previous indents at the end of the file
+     */
+    public popRemainingDedents(text: string) {
+        const remainingDedents: IToken[] = [];
+        while (this.indentationStack.length > 1) {
+            remainingDedents.push(
+                this.createIndentationTokenInstance(this.dedentTokenType, text, this.options.dedentTokenName, text.length)
+            );
+            this.indentationStack.pop();
+        }
+
+        this.indentationStack = [0];
+        return remainingDedents;
+    }
+}

From f4943e7449fc8a92f26875be517597c10f873e45 Mon Sep 17 00:00:00 2001
From: Abdelrahman Aly Abounegm <abounegm.abdelrahman@gmail.com>
Date: Sun, 14 Jul 2024 13:52:52 +0300
Subject: [PATCH 02/14] Replace double quotes with single quotes

For consistency with the rest of the codebase
---
 packages/langium/src/parser/lexer.ts         | 2 +-
 packages/langium/src/parser/token-builder.ts | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index 33296acc3..4f3d27a5c 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -6,7 +6,7 @@
 
 import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain';
 import type { LangiumCoreServices } from '../services.js';
-import { IndentationAwareTokenBuilder } from "./token-builder.js";
+import { IndentationAwareTokenBuilder } from './token-builder.js';
 import { Lexer as ChevrotainLexer } from 'chevrotain';
 
 export interface LexerResult {
diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index 79d59d8b2..72b491ce8 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -128,7 +128,7 @@ export interface IndentationTokenBuilderOptions {
      * terminal INDENT: ':synthetic-indent:';
      * ```
      *
-     * @default "INDENT"
+     * @default 'INDENT'
      */
     indentTokenName: string;
     /**
@@ -138,7 +138,7 @@ export interface IndentationTokenBuilderOptions {
      * terminal DEDENT: ':synthetic-dedent:';
      * ```
      *
-     * @default "DEDENT"
+     * @default 'DEDENT'
      */
     dedentTokenName: string;
     /**
@@ -148,7 +148,7 @@ export interface IndentationTokenBuilderOptions {
      * hidden terminal WS: /[ \t]+/;
      * ```
      *
-     * @default "WS"
+     * @default 'WS'
      */
     whitespaceTokenName: string;
 }

From 7d85c475411dbdf1ed11748c4c78766295c6bc81 Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Sun, 14 Jul 2024 14:37:36 +0300
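Subject: [PATCH 03/14] Add missing imports

Import createToken, createTokenInstance, IToken and isTokenTypeArray,
which the indentation-aware token builder uses. Also drop the
`GrammarAST.` qualifier from the Grammar/TerminalRule parameter types,
rename the unused `groups` matcher parameters to `_groups`, and remove a
stray blank line.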
---
 packages/langium/src/parser/token-builder.ts | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index 72b491ce8..ef8ef1e70 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -4,15 +4,16 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain';
+import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary, IToken } from 'chevrotain';
 import type { AbstractRule, Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js';
 import type { Stream } from '../utils/stream.js';
-import { Lexer } from 'chevrotain';
+import { createToken, createTokenInstance, Lexer } from 'chevrotain';
 import { isKeyword, isParserRule, isTerminalRule } from '../languages/generated/ast.js';
 import { streamAllContents } from '../utils/ast-utils.js';
 import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js';
 import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js';
 import { stream } from '../utils/stream.js';
+import { isTokenTypeArray } from './lexer.js';
 
 export interface TokenBuilderOptions {
     caseInsensitive?: boolean
@@ -119,7 +120,6 @@ export class DefaultTokenBuilder implements TokenBuilder {
     }
 }
 
-
 export interface IndentationTokenBuilderOptions {
     /**
      * The name of the token used to denote indentation in the grammar.
@@ -205,7 +205,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
         });
     }
 
-    override buildTokens(grammar: GrammarAST.Grammar, options?: TokenBuilderOptions | undefined) {
+    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) {
         const tokenTypes = super.buildTokens(grammar, options);
         if (!isTokenTypeArray(tokenTypes)) {
             throw new Error('Invalid tokens built by default builder');
@@ -260,7 +260,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
      * @param tokens Previously scanned Tokens
      * @param groups Token Groups
      */
-    protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, groups) => {
+    protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
         const {indentTokenName} = this.options;
 
         if (!this.isStartOfLine(text, offset)) {
@@ -297,7 +297,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
      * @param tokens Previously scanned Tokens
      * @param groups Token Groups
      */
-    protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, groups) => {
+    protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
         const {dedentTokenName} = this.options;
 
         if (!this.isStartOfLine(text, offset)) {
@@ -339,7 +339,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
         return null;
     };
 
-    protected override buildTerminalToken(terminal: GrammarAST.TerminalRule): TokenType {
+    protected override buildTerminalToken(terminal: TerminalRule): TokenType {
         const tokenType = super.buildTerminalToken(terminal);
         const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
 

From e3963fe72c3f3f0b904501cefcfb3a0ae79ff134 Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Sun, 14 Jul 2024 22:29:42 +0300
Subject: [PATCH 04/14] Make all private fields protected and add JSDoc

---
 packages/langium/src/parser/lexer.ts         |  2 +-
 packages/langium/src/parser/token-builder.ts | 37 +++++++++++++++++---
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index 4f3d27a5c..f80df9335 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -81,7 +81,7 @@ export class DefaultLexer implements Lexer {
  * ```
  */
 export class IndentationAwareLexer extends DefaultLexer {
-    private indentationTokenBuilder?: IndentationAwareTokenBuilder;
+    protected readonly indentationTokenBuilder?: IndentationAwareTokenBuilder;
 
     constructor(services: LangiumCoreServices) {
         super(services);
diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index ef8ef1e70..74469662e 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -166,8 +166,12 @@ const indetationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
  * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
  */
 export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
-    private indentationStack: number[] = [0];
-    private options: IndentationTokenBuilderOptions;
+    /**
+     * The stack in which all the previous matched indentation levels are stored
+     * to understand how deep a the next tokens are nested.
+     */
+    protected indentationStack: number[] = [0];
+    protected options: IndentationTokenBuilderOptions;
 
     /**
      * The token type to be used for indentation tokens
@@ -227,11 +231,25 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
         return [...spaceTokens, ...otherTokens];
     }
 
-    private isStartOfLine(text: string, offset: number): boolean {
+    /**
+     * Helper function to check if the current position is the start of a new line.
+     *
+     * @param text The full input string.
+     * @param offset The current position at which to check
+     * @returns Whether the current position is the start of a new line
+     */
+    protected isStartOfLine(text: string, offset: number): boolean {
         return offset === 0 || '\r\n'.includes(text[offset - 1]);
     }
 
-    private matchWhitespace(text: string, offset: number) {
+    /**
+     * A helper function used in matching both indents and dedents.
+     *
+     * @param text The full input string.
+     * @param offset The current position at which to attempt a match
+     * @returns The current and previous indentation levels and the matched whitespace
+     */
+    protected matchWhitespace(text: string, offset: number) {
         this.whitespaceRegExp.lastIndex = offset;
         const match = this.whitespaceRegExp.exec(text);
         return {
@@ -241,7 +259,16 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
         };
     }
 
-    private createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
+    /**
+     * Helper function to create an instance of an indentation token.
+     *
+     * @param tokenType Indent or dedent token type
+     * @param text Full input string, used to calculate the line number
+     * @param image The original image of the token (tabs or spaces)
+     * @param offset Current position in the input string
+     * @returns The indentation token instance
+     */
+    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
         const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
         return createTokenInstance(
             tokenType,

From 2db70a00de472a7cd19781267453d47e1290d105 Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Sun, 14 Jul 2024 22:34:44 +0300
Subject: [PATCH 05/14] Require IndentationAwareTokenBuilder for the lexer

---
 packages/langium/src/parser/lexer.ts | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index f80df9335..0eb0653b9 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -81,23 +81,20 @@ export class DefaultLexer implements Lexer {
  * ```
  */
 export class IndentationAwareLexer extends DefaultLexer {
-    protected readonly indentationTokenBuilder?: IndentationAwareTokenBuilder;
+    protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder;
 
     constructor(services: LangiumCoreServices) {
         super(services);
         if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) {
             this.indentationTokenBuilder = services.parser.TokenBuilder;
+        } else {
+            throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
         }
     }
 
     override tokenize(text: string): LexerResult {
         const result = super.tokenize(text);
 
-        if (!this.indentationTokenBuilder) {
-            // A token builder other than the expected IndentationAwareTokenBuilder is used
-            return result;
-        }
-
         // reset the indent stack between processing of different text inputs
         const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
         result.tokens.push(...remainingDedents);

From b41957fbfa69a08da243261077b6e0dfee53c820 Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Sun, 14 Jul 2024 22:35:55 +0300
Subject: [PATCH 06/14] Export default options and fix typo

---
 packages/langium/src/parser/token-builder.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index 74469662e..eedb5b489 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -153,7 +153,7 @@ export interface IndentationTokenBuilderOptions {
     whitespaceTokenName: string;
 }
 
-const indetationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
+export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
     indentTokenName: 'INDENT',
     dedentTokenName: 'DEDENT',
     whitespaceTokenName: 'WS',
@@ -189,10 +189,10 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
      */
     protected whitespaceRegExp = /[ \t]+/y;
 
-    constructor(options: Partial<IndentationTokenBuilderOptions> = indetationBuilderDefaultOptions) {
+    constructor(options: Partial<IndentationTokenBuilderOptions> = indentationBuilderDefaultOptions) {
         super();
         this.options = {
-            ...indetationBuilderDefaultOptions,
+            ...indentationBuilderDefaultOptions,
             ...options,
         };
 

From 24860caf0eb19e965b2b3499b8e43346eb6ae74a Mon Sep 17 00:00:00 2001
From: Abdelrahman Aly Abounegm <abounegm.abdelrahman@gmail.com>
Date: Sun, 14 Jul 2024 22:39:42 +0300
Subject: [PATCH 07/14] Add explicit return type to the function

Co-authored-by: Mark Sujew <mark.sujew@typefox.io>
---
 packages/langium/src/parser/token-builder.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index eedb5b489..7f3882a0a 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -391,7 +391,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
      * @param text Full text that was tokenized
      * @returns Remaining dedent tokens to match all previous indents at the end of the file
      */
-    public popRemainingDedents(text: string) {
+    popRemainingDedents(text: string): IToken[] {
         const remainingDedents: IToken[] = [];
         while (this.indentationStack.length > 1) {
             remainingDedents.push(

From 01fb746cc96ee45e81015294155f8c465d46f6cb Mon Sep 17 00:00:00 2001
From: Abdelrahman Aly Abounegm <abounegm.abdelrahman@gmail.com>
Date: Sun, 14 Jul 2024 22:40:18 +0300
Subject: [PATCH 08/14] Replace 4 iterations through tokens with 1 loop

Co-authored-by: Mark Sujew <mark.sujew@typefox.io>
---
 packages/langium/src/parser/token-builder.ts | 23 ++++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index 7f3882a0a..e6b734d82 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -219,16 +219,25 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
 
         // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
         // Order should be: dedent, indent, spaces
-        const dedent = tokenTypes.find(tok => tok.name === dedentTokenName);
-        const indent = tokenTypes.find(tok => tok.name === indentTokenName);
-        const ws = tokenTypes.find(tok => tok.name === whitespaceTokenName);
+        let dedent: TokenType | undefined;
+        let indent: TokenType | undefined;
+        let ws: TokenType | undefined;
+        const otherTokens: TokenType[] = [];
+        for (const tokenType of tokenTypes) {
+            if (tokenType.name === dedentTokenName) {
+                dedent = tokenType;
+            } else if (tokenType.name === indentTokenName) {
+                indent = tokenType;
+            } else if (tokenType.name === whitespaceTokenName) {
+                ws = tokenType;
+            } else {
+                otherTokens.push(tokenType);
+            }
+        }
         if (!dedent || !indent || !ws) {
             throw new Error('Some indentation/whitespace tokens not found!');
         }
-
-        const spaceTokens = [dedent, indent, ws];
-        const otherTokens = tokenTypes.filter(tok => !spaceTokens.includes(tok));
-        return [...spaceTokens, ...otherTokens];
+        return [dedent, indent, ws, ...otherTokens];
     }
 
     /**

From 3efc2ba80a3dad1977c42c4809ae056af65bd99a Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Tue, 16 Jul 2024 19:20:46 +0300
Subject: [PATCH 09/14] Move indentation-aware classes to separate module

---
 .../langium/src/parser/indentation-aware.ts   | 344 ++++++++++++++++++
 packages/langium/src/parser/index.ts          |   1 +
 packages/langium/src/parser/lexer.ts          |  37 --
 packages/langium/src/parser/token-builder.ts  | 299 +--------------
 4 files changed, 347 insertions(+), 334 deletions(-)
 create mode 100644 packages/langium/src/parser/indentation-aware.ts

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
new file mode 100644
index 000000000..9c4bb4dec
--- /dev/null
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -0,0 +1,344 @@
+/******************************************************************************
+ * Copyright 2024 TypeFox GmbH
+ * This program and the accompanying materials are made available under the
+ * terms of the MIT License, which is available in the project root.
+ ******************************************************************************/
+
+import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
+import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
+import type { TokenBuilderOptions } from './token-builder.js';
+import type { LexerResult } from './lexer.js';
+import type { LangiumCoreServices } from '../services.js';
+import { createToken, createTokenInstance, Lexer } from 'chevrotain';
+import { DefaultTokenBuilder } from './token-builder.js';
+import { DefaultLexer, isTokenTypeArray } from './lexer.js';
+
+export interface IndentationTokenBuilderOptions {
+    /**
+     * The name of the terminal rule that denotes an increase of the indentation level in the grammar.
+     * Its pattern is only a placeholder, since the token type is replaced by a synthetic one, e.g.:
+     * ```langium
+     * terminal INDENT: ':synthetic-indent:';
+     * ```
+     *
+     * @default 'INDENT'
+     */
+    indentTokenName: string;
+    /**
+     * The name of the terminal rule that denotes a decrease of the indentation level in the grammar.
+     * Its pattern is only a placeholder, since the token type is replaced by a synthetic one, e.g.:
+     * ```langium
+     * terminal DEDENT: ':synthetic-dedent:';
+     * ```
+     *
+     * @default 'DEDENT'
+     */
+    dedentTokenName: string;
+    /**
+     * The name of the terminal rule that denotes whitespace other than indentation and newlines in the grammar.
+     * Its pattern is replaced by the token builder's `whitespaceRegExp`; a possible definition could look like this:
+     * ```langium
+     * hidden terminal WS: /[ \t]+/;
+     * ```
+     *
+     * @default 'WS'
+     */
+    whitespaceTokenName: string;
+}
+
+export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = { // fallback for every option the caller omits
+    indentTokenName: 'INDENT',
+    dedentTokenName: 'DEDENT',
+    whitespaceTokenName: 'WS',
+};
+
+/**
+ * A token builder that is sensitive to indentation in the input text.
+ * It will generate tokens for indentation and dedentation based on the indentation level.
+ *
+ * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
+ */
+export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
+    /**
+     * The stack in which all the previous matched indentation levels are stored
+     * to understand how deep a the next tokens are nested.
+     */
+    protected indentationStack: number[] = [0];
+    protected options: IndentationTokenBuilderOptions;
+
+    /**
+     * The token type to be used for indentation tokens
+     */
+    protected indentTokenType: TokenType;
+
+    /**
+     * The token type to be used for dedentation tokens
+     */
+    protected dedentTokenType: TokenType;
+
+    /**
+     * A sticky regular expression to match a series of tabs and/or spaces at an exact offset (set through `lastIndex`).
+     * Override this to customize what the indentation is allowed to consist of, but keep the `y` flag.
+     */
+    protected whitespaceRegExp = /[ \t]+/y;
+
+    /**
+     * @param options Names of the indentation-related terminals; omitted entries fall back to the defaults
+     */
+    constructor(options: Partial<IndentationTokenBuilderOptions> = indentationBuilderDefaultOptions) {
+        super();
+        this.options = { ...indentationBuilderDefaultOptions, ...options };
+        const { indentTokenName, dedentTokenName } = this.options;
+        // The synthetic token types delegate matching to the custom matcher functions
+        this.indentTokenType = createToken({
+            name: indentTokenName,
+            pattern: this.indentMatcher,
+            line_breaks: false,
+        });
+        this.dedentTokenType = createToken({
+            name: dedentTokenName,
+            pattern: this.dedentMatcher,
+            line_breaks: false,
+        });
+    }
+
+    /**
+     * Builds the token types of the grammar and moves the dedent, indent and
+     * whitespace token types (in that order) in front of all the others.
+     */
+    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) {
+        const builtTokens = super.buildTokens(grammar, options);
+        if (!isTokenTypeArray(builtTokens)) {
+            throw new Error('Invalid tokens built by default builder');
+        }
+
+        // Whitespace (which is ignored) comes first by default and would consume the
+        // indentation as well, hence the indentation tokens have to be tried before it
+        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
+        let dedent: TokenType | undefined;
+        let indent: TokenType | undefined;
+        let ws: TokenType | undefined;
+        const remaining: TokenType[] = [];
+        for (const candidate of builtTokens) {
+            switch (candidate.name) {
+                case dedentTokenName: dedent = candidate; break;
+                case indentTokenName: indent = candidate; break;
+                case whitespaceTokenName: ws = candidate; break;
+                default: remaining.push(candidate);
+            }
+        }
+        if (!(dedent && indent && ws)) {
+            throw new Error('Some indentation/whitespace tokens not found!');
+        }
+        return [dedent, indent, ws, ...remaining];
+    }
+
+    /**
+     * Tells whether the given position is the beginning of a line, i.e. the very
+     * start of the input or the position right after a `\r` or `\n` character.
+     * @param text The full input string.
+     * @param offset The current position at which to check
+     */
+    protected isStartOfLine(text: string, offset: number): boolean {
+        const previousChar = text[offset - 1];
+        return offset === 0 || previousChar === '\r' || previousChar === '\n';
+    }
+
+    /**
+     * Measures the indentation at the given position; shared by the indent and dedent matchers.
+     *
+     * @param text The full input string.
+     * @param offset The current position at which to attempt a match
+     * @returns The width of the whitespace found at `offset` (0 if none), the indentation level
+     *          on top of the stack and the raw match result (`null` if no whitespace was matched)
+     */
+    protected matchWhitespace(text: string, offset: number) {
+        const regexp = this.whitespaceRegExp;
+        regexp.lastIndex = offset;
+        const match = regexp.exec(text);
+        const currIndentLevel = match ? match[0].length : 0;
+        const prevIndentLevel = this.indentationStack.at(-1)!;
+        return { currIndentLevel, prevIndentLevel, match };
+    }
+
+    /**
+     * Helper function to create an instance of an indentation token.
+     * Follows Chevrotain's position conventions: 1-based lines/columns and inclusive end positions.
+     *
+     * @param tokenType Indent or dedent token type
+     * @param text Full input string, used to calculate the line number
+     * @param image The original image of the token (tabs or spaces)
+     * @param offset Current position in the input string
+     * @returns The indentation token instance
+     */
+    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number): IToken {
+        const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
+        return createTokenInstance(
+            tokenType, image,
+            offset, offset + image.length - 1,
+            lineNumber, lineNumber,
+            1, image.length, // the token is reported from the first column of its line
+        );
+    }
+
+    /**
+     * A custom pattern for matching indents.
+     *
+     * It never reports a match of its own: when the line is indented deeper than the
+     * current level, the new level is pushed onto the indentation stack and an indent
+     * token is appended to `tokens` directly.
+     *
+     * @param text The full input string.
+     * @param offset The offset at which to attempt a match
+     * @param tokens Previously scanned Tokens
+     * @param groups Token Groups
+     * @returns Always `null`
+     */
+    protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
+        if (!this.isStartOfLine(text, offset)) {
+            return null;
+        }
+
+        const whitespace = this.matchWhitespace(text, offset);
+        if (whitespace.currIndentLevel <= whitespace.prevIndentLevel) {
+            // shallower indentation (should be matched by dedent)
+            // or same indentation level (should be matched by whitespace and ignored)
+            return null;
+        }
+
+        this.indentationStack.push(whitespace.currIndentLevel);
+        tokens.push(this.createIndentationTokenInstance(
+            this.indentTokenType,
+            text,
+            whitespace.match?.[0] ?? this.options.indentTokenName,
+            offset,
+        ));
+
+        // Token already added, let the indentation now be consumed as whitespace and ignored
+        return null;
+    };
+
+    /**
+     * A custom pattern for matching dedents.
+     *
+     * It never reports a match of its own: when the line is indented less than the
+     * current level, one dedent token per closed indentation level is appended to
+     * `tokens` directly. Dedents to a level that was never opened are only logged.
+     *
+     * @param text The full input string.
+     * @param offset The offset at which to attempt a match
+     * @param tokens Previously scanned Tokens
+     * @param groups Token Groups
+     * @returns Always `null`
+     */
+    protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
+        if (!this.isStartOfLine(text, offset)) {
+            return null;
+        }
+
+        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);
+
+        if (currIndentLevel >= prevIndentLevel) {
+            // bigger indentation (should be matched by indent)
+            // or same indentation level (should be matched by whitespace and ignored)
+            return null;
+        }
+
+        const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel);
+
+        // Any dedent must match some previous indentation level.
+        if (matchIndentIndex === -1) {
+            console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`);
+            // throwing an error would crash the language server
+            // TODO: find a way to report error diagnostics message
+            return null;
+        }
+
+        // A dedent to the first column has no whitespace to use as image. Fall back to an empty
+        // image instead of the token name, which would claim input the token does not cover.
+        const image = match?.[0] ?? '';
+        const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1;
+
+        for (let i = 0; i < numberOfDedents; i++) {
+            tokens.push(this.createIndentationTokenInstance(this.dedentTokenType, text, image, offset));
+            this.indentationStack.pop();
+        }
+
+        // Token already added, let the dedentation now be consumed as whitespace and ignored
+        return null;
+    };
+
+    /**
+     * Swaps the token types built for the indent, dedent and whitespace terminals for the
+     * indentation-aware ones; the token type of any other terminal is returned as built.
+     */
+    protected override buildTerminalToken(terminal: TerminalRule): TokenType {
+        const builtToken = super.buildTerminalToken(terminal);
+        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
+        switch (builtToken.name) {
+            case indentTokenName:
+                return this.indentTokenType;
+            case dedentTokenName:
+                return this.dedentTokenType;
+            case whitespaceTokenName:
+                return createToken({ name: whitespaceTokenName, pattern: this.whitespaceRegExp, group: Lexer.SKIPPED });
+            default:
+                return builtToken;
+        }
+    }
+
+    /**
+     * Resets the indentation stack between different runs of the lexer
+     *
+     * @param text Full text that was tokenized
+     * @returns Remaining dedent tokens to match all previous indents at the end of the file
+     */
+    popRemainingDedents(text: string): IToken[] {
+        const remainingDedents: IToken[] = [];
+        while (this.indentationStack.length > 1) {
+            // These tokens sit past the last character, so they get an empty image rather
+            // than the token name, which would make them span text that does not exist.
+            remainingDedents.push(this.createIndentationTokenInstance(this.dedentTokenType, text, '', text.length));
+            this.indentationStack.pop();
+        }
+
+        this.indentationStack = [0];
+        return remainingDedents;
+    }
+}
+
+/**
+ * A lexer that is aware of indentation in the input text.
+ * The only purpose of this lexer is to reset the internal state of the {@link IndentationAwareTokenBuilder}
+ * between the tokenization of different text inputs.
+ *
+ * In your module, you can override the default lexer with this one as such:
+ * ```ts
+ * parser: {
+ *    TokenBuilder: () => new IndentationAwareTokenBuilder(),
+ *    Lexer: (services) => new IndentationAwareLexer(services),
+ * }
+ * ```
+ */
+export class IndentationAwareLexer extends DefaultLexer {
+    protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder;
+
+    constructor(services: LangiumCoreServices) {
+        super(services);
+        if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) {
+            this.indentationTokenBuilder = services.parser.TokenBuilder;
+        } else {
+            throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
+        }
+    }
+
+    override tokenize(text: string): LexerResult {
+        const result = super.tokenize(text);
+
+        // reset the indent stack between processing of different text inputs
+        const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
+        result.tokens.push(...remainingDedents);
+
+        return result;
+    }
+}
diff --git a/packages/langium/src/parser/index.ts b/packages/langium/src/parser/index.ts
index c0a9e8d7f..17b49b62c 100644
--- a/packages/langium/src/parser/index.ts
+++ b/packages/langium/src/parser/index.ts
@@ -13,3 +13,4 @@ export * from './lexer.js';
 export * from './parser-config.js';
 export * from './token-builder.js';
 export * from './value-converter.js';
+export * from './indentation-aware.js';
diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index 0eb0653b9..eec23ea12 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -6,7 +6,6 @@
 
 import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain';
 import type { LangiumCoreServices } from '../services.js';
-import { IndentationAwareTokenBuilder } from './token-builder.js';
 import { Lexer as ChevrotainLexer } from 'chevrotain';
 
 export interface LexerResult {
@@ -67,42 +66,6 @@ export class DefaultLexer implements Lexer {
     }
 }
 
-/**
- * A lexer that is aware of indentation in the input text.
- * The only purpose of this lexer is to reset the internal state of the {@link IndentationAwareTokenBuilder}
- * between the tokenization of different text inputs.
- *
- * In your module, you can override the default lexer with this one as such:
- * ```ts
- * parser: {
- *    TokenBuilder: () => new IndentationAwareTokenBuilder(),
- *    Lexer: (services) => new IndentationAwareLexer(services),
- * }
- * ```
- */
-export class IndentationAwareLexer extends DefaultLexer {
-    protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder;
-
-    constructor(services: LangiumCoreServices) {
-        super(services);
-        if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) {
-            this.indentationTokenBuilder = services.parser.TokenBuilder;
-        } else {
-            throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
-        }
-    }
-
-    override tokenize(text: string): LexerResult {
-        const result = super.tokenize(text);
-
-        // reset the indent stack between processing of different text inputs
-        const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
-        result.tokens.push(...remainingDedents);
-
-        return result;
-    }
-}
-
 /**
  * Returns a check whether the given TokenVocabulary is TokenType array
  */
diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index e6b734d82..4320c5ca6 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -4,16 +4,15 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary, IToken } from 'chevrotain';
+import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain';
 import type { AbstractRule, Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js';
 import type { Stream } from '../utils/stream.js';
-import { createToken, createTokenInstance, Lexer } from 'chevrotain';
+import { Lexer } from 'chevrotain';
 import { isKeyword, isParserRule, isTerminalRule } from '../languages/generated/ast.js';
 import { streamAllContents } from '../utils/ast-utils.js';
 import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js';
 import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js';
 import { stream } from '../utils/stream.js';
-import { isTokenTypeArray } from './lexer.js';
 
 export interface TokenBuilderOptions {
     caseInsensitive?: boolean
@@ -119,297 +118,3 @@ export class DefaultTokenBuilder implements TokenBuilder {
         }, []);
     }
 }
-
-export interface IndentationTokenBuilderOptions {
-    /**
-     * The name of the token used to denote indentation in the grammar.
-     * A possible definition in the grammar could look like this:
-     * ```langium
-     * terminal INDENT: ':synthetic-indent:';
-     * ```
-     *
-     * @default 'INDENT'
-     */
-    indentTokenName: string;
-    /**
-     * The name of the token used to denote deindentation in the grammar.
-     * A possible definition in the grammar could look like this:
-     * ```langium
-     * terminal DEDENT: ':synthetic-dedent:';
-     * ```
-     *
-     * @default 'DEDENT'
-     */
-    dedentTokenName: string;
-    /**
-     * The name of the token used to denote whitespace other than indentation and newlines in the grammar.
-     * A possible definition in the grammar could look like this:
-     * ```langium
-     * hidden terminal WS: /[ \t]+/;
-     * ```
-     *
-     * @default 'WS'
-     */
-    whitespaceTokenName: string;
-}
-
-export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
-    indentTokenName: 'INDENT',
-    dedentTokenName: 'DEDENT',
-    whitespaceTokenName: 'WS',
-};
-
-/**
- * A token builder that is sensitive to indentation in the input text.
- * It will generate tokens for indentation and dedentation based on the indentation level.
- *
- * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
- */
-export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
-    /**
-     * The stack in which all the previous matched indentation levels are stored
-     * to understand how deep a the next tokens are nested.
-     */
-    protected indentationStack: number[] = [0];
-    protected options: IndentationTokenBuilderOptions;
-
-    /**
-     * The token type to be used for indentation tokens
-     */
-    protected indentTokenType: TokenType;
-
-    /**
-     * The token type to be used for dedentation tokens
-     */
-    protected dedentTokenType: TokenType;
-
-    /**
-     * A regular expression to match a series of tabs and/or spaces.
-     * Override this to customize what the indentation is allowed to consist of.
-     */
-    protected whitespaceRegExp = /[ \t]+/y;
-
-    constructor(options: Partial<IndentationTokenBuilderOptions> = indentationBuilderDefaultOptions) {
-        super();
-        this.options = {
-            ...indentationBuilderDefaultOptions,
-            ...options,
-        };
-
-        this.indentTokenType = createToken({
-            name: this.options.indentTokenName,
-            pattern: this.indentMatcher,
-            line_breaks: false,
-        });
-
-        this.dedentTokenType = createToken({
-            name: this.options.dedentTokenName,
-            pattern: this.dedentMatcher,
-            line_breaks: false,
-        });
-    }
-
-    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) {
-        const tokenTypes = super.buildTokens(grammar, options);
-        if (!isTokenTypeArray(tokenTypes)) {
-            throw new Error('Invalid tokens built by default builder');
-        }
-
-        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
-
-        // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
-        // Order should be: dedent, indent, spaces
-        let dedent: TokenType | undefined;
-        let indent: TokenType | undefined;
-        let ws: TokenType | undefined;
-        const otherTokens: TokenType[] = [];
-        for (const tokenType of tokenTypes) {
-            if (tokenType.name === dedentTokenName) {
-                dedent = tokenType;
-            } else if (tokenType.name === indentTokenName) {
-                indent = tokenType;
-            } else if (tokenType.name === whitespaceTokenName) {
-                ws = tokenType;
-            } else {
-                otherTokens.push(tokenType);
-            }
-        }
-        if (!dedent || !indent || !ws) {
-            throw new Error('Some indentation/whitespace tokens not found!');
-        }
-        return [dedent, indent, ws, ...otherTokens];
-    }
-
-    /**
-     * Helper function to check if the current position is the start of a new line.
-     *
-     * @param text The full input string.
-     * @param offset The current position at which to check
-     * @returns Whether the current position is the start of a new line
-     */
-    protected isStartOfLine(text: string, offset: number): boolean {
-        return offset === 0 || '\r\n'.includes(text[offset - 1]);
-    }
-
-    /**
-     * A helper function used in matching both indents and dedents.
-     *
-     * @param text The full input string.
-     * @param offset The current position at which to attempt a match
-     * @returns The current and previous indentation levels and the matched whitespace
-     */
-    protected matchWhitespace(text: string, offset: number) {
-        this.whitespaceRegExp.lastIndex = offset;
-        const match = this.whitespaceRegExp.exec(text);
-        return {
-            currIndentLevel: match?.[0].length ?? 0,
-            prevIndentLevel: this.indentationStack.at(-1)!,
-            match,
-        };
-    }
-
-    /**
-     * Helper function to create an instance of an indentation token.
-     *
-     * @param tokenType Indent or dedent token type
-     * @param text Full input string, used to calculate the line number
-     * @param image The original image of the token (tabs or spaces)
-     * @param offset Current position in the input string
-     * @returns The indentation token instance
-     */
-    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
-        const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
-        return createTokenInstance(
-            tokenType,
-            image,
-            offset, offset + image.length,
-            lineNumber, lineNumber,
-            0, image.length,
-        );
-    }
-
-    /**
-     * A custom pattern for matching indents
-     *
-     * @param text The full input string.
-     * @param offset The offset at which to attempt a match
-     * @param tokens Previously scanned Tokens
-     * @param groups Token Groups
-     */
-    protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
-        const {indentTokenName} = this.options;
-
-        if (!this.isStartOfLine(text, offset)) {
-            return null;
-        }
-
-        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);
-
-        if (currIndentLevel <= prevIndentLevel) {
-            // shallower indentation (should be matched by dedent)
-            // or same indentation level (should be matched by whitespace and ignored)
-            return null;
-        }
-
-        this.indentationStack.push(currIndentLevel);
-
-        const indentToken = this.createIndentationTokenInstance(
-            this.indentTokenType,
-            text,
-            match?.[0] ?? indentTokenName,
-            offset,
-        );
-        tokens.push(indentToken);
-
-        // Token already added, let the indentation now be consumed as whitespace and ignored
-        return null;
-    };
-
-    /**
-     * A custom pattern for matching dedents
-     *
-     * @param text The full input string.
-     * @param offset The offset at which to attempt a match
-     * @param tokens Previously scanned Tokens
-     * @param groups Token Groups
-     */
-    protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
-        const {dedentTokenName} = this.options;
-
-        if (!this.isStartOfLine(text, offset)) {
-            return null;
-        }
-
-        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);
-
-        if (currIndentLevel >= prevIndentLevel) {
-            // bigger indentation (should be matched by indent)
-            // or same indentation level (should be matched by whitespace and ignored)
-            return null;
-        }
-
-        const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel);
-
-        // Any dedent must match some previous indentation level.
-        if (matchIndentIndex === -1) {
-            console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`);
-            // throwing an error would crash the language server
-            // TODO: find a way to report error diagnostics message
-            return null;
-        }
-
-        const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1;
-
-        for (let i = 0; i < numberOfDedents; i++) {
-            const token = this.createIndentationTokenInstance(
-                this.dedentTokenType,
-                text,
-                match?.[0] ?? dedentTokenName,
-                offset,
-            );
-            tokens.push(token);
-            this.indentationStack.pop();
-        }
-
-        // Token already added, let the dedentation now be consumed as whitespace and ignored
-        return null;
-    };
-
-    protected override buildTerminalToken(terminal: TerminalRule): TokenType {
-        const tokenType = super.buildTerminalToken(terminal);
-        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
-
-        if (tokenType.name === indentTokenName) {
-            return this.indentTokenType;
-        } else if (tokenType.name === dedentTokenName) {
-            return this.dedentTokenType;
-        } else if (tokenType.name === whitespaceTokenName) {
-            return createToken({
-                name: whitespaceTokenName,
-                pattern: this.whitespaceRegExp,
-                group: Lexer.SKIPPED,
-            });
-        }
-
-        return tokenType;
-    }
-
-    /**
-     * Resets the indentation stack between different runs of the lexer
-     *
-     * @param text Full text that was tokenized
-     * @returns Remaining dedent tokens to match all previous indents at the end of the file
-     */
-    popRemainingDedents(text: string): IToken[] {
-        const remainingDedents: IToken[] = [];
-        while (this.indentationStack.length > 1) {
-            remainingDedents.push(
-                this.createIndentationTokenInstance(this.dedentTokenType, text, this.options.dedentTokenName, text.length)
-            );
-            this.indentationStack.pop();
-        }
-
-        this.indentationStack = [0];
-        return remainingDedents;
-    }
-}

From 4f7ecf3b3d617612a031953fbe1400349822831e Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Wed, 17 Jul 2024 12:39:14 +0300
Subject: [PATCH 10/14] Replace thrown error with warning again

It was causing issues with the "super" call in the constructor in tests
---
 packages/langium/src/parser/indentation-aware.ts | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index 9c4bb4dec..f4dc45ad9 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -321,20 +321,24 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
  * ```
  */
 export class IndentationAwareLexer extends DefaultLexer {
-    protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder;
+    protected readonly indentationTokenBuilder?: IndentationAwareTokenBuilder;
 
     constructor(services: LangiumCoreServices) {
         super(services);
         if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) {
             this.indentationTokenBuilder = services.parser.TokenBuilder;
         } else {
-            throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
+            console.warn('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
         }
     }
 
     override tokenize(text: string): LexerResult {
         const result = super.tokenize(text);
 
+        if (this.indentationTokenBuilder === undefined) {
+            return result;
+        }
+
         // reset the indent stack between processing of different text inputs
         const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
         result.tokens.push(...remainingDedents);

From 4d0724ad30a7f7611181656d9a4ad458e12a6320 Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Wed, 17 Jul 2024 13:02:41 +0300
Subject: [PATCH 11/14] Remove indent-dedent pairs with nothing in between

They provide no value, but may throw off the parser
---
 .../langium/src/parser/indentation-aware.ts    | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index f4dc45ad9..0be75de99 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -64,7 +64,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
      * to understand how deep a the next tokens are nested.
      */
     protected indentationStack: number[] = [0];
-    protected options: IndentationTokenBuilderOptions;
+    readonly options: IndentationTokenBuilderOptions;
 
     /**
      * The token type to be used for indentation tokens
@@ -343,6 +343,22 @@ export class IndentationAwareLexer extends DefaultLexer {
         const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
         result.tokens.push(...remainingDedents);
 
+        // remove any "indent-dedent" pair with an empty body as these are typically
+        // added by comments or lines with just whitespace but have no real value.
+        // A dedent cancels the indent kept right before it, so that nested empty
+        // pairs (INDENT INDENT DEDENT DEDENT) collapse completely in a single pass.
+        const { indentTokenName, dedentTokenName } = this.indentationTokenBuilder.options;
+        const cleanTokens: IToken[] = [];
+        for (const token of result.tokens) {
+            const previous = cleanTokens.at(-1);
+            if (token.tokenType.name === dedentTokenName && previous?.tokenType.name === indentTokenName) {
+                cleanTokens.pop();
+            } else {
+                cleanTokens.push(token);
+            }
+        }
+        result.tokens = cleanTokens;
+
         return result;
     }
 }

From 6d69af422fe9d511bc5731223ba9c1249ba58742 Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Wed, 17 Jul 2024 13:11:13 +0300
Subject: [PATCH 12/14] Add unit tests for indentation-aware module

---
 .../test/parser/indentation-aware.test.ts     | 249 ++++++++++++++++++
 1 file changed, 249 insertions(+)
 create mode 100644 packages/langium/test/parser/indentation-aware.test.ts

diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
new file mode 100644
index 000000000..8033c7873
--- /dev/null
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -0,0 +1,249 @@
+/******************************************************************************
+ * Copyright 2024 TypeFox GmbH
+ * This program and the accompanying materials are made available under the
+ * terms of the MIT License, which is available in the project root.
+ ******************************************************************************/
+
+import type { TokenType } from '@chevrotain/types';
+import type { Grammar, LangiumParser, Lexer } from 'langium';
+import { beforeAll, describe, expect, test } from 'vitest';
+import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
+import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
+import { parseHelper } from 'langium/test';
+
+const grammarServices = createLangiumGrammarServices(EmptyFileSystem).grammar;
+const helper = parseHelper<Grammar>(grammarServices);
+
+const tokenBuilder = new IndentationAwareTokenBuilder();
+
+async function getTokens(grammarString: string): Promise<TokenType[]> {
+    const grammar = (await helper(grammarString)).parseResult.value;
+    return tokenBuilder.buildTokens(grammar) as TokenType[];
+}
+
+async function getLexer(grammar: string): Promise<Lexer> {
+    const services = await createServicesForGrammar({ grammar });
+    services.parser.TokenBuilder = tokenBuilder;
+    services.parser.Lexer = new IndentationAwareLexer(services);
+    return services.parser.Lexer;
+}
+
+async function getParser(grammar: string): Promise<LangiumParser> {
+    const services = await createServicesForGrammar({
+        grammar
+    });
+    services.parser.TokenBuilder = tokenBuilder;
+    services.parser.Lexer = new IndentationAwareLexer(services);
+    return services.parser.LangiumParser;
+}
+
+beforeAll(() => {
+    tokenBuilder.popRemainingDedents('');
+});
+
+describe('indentationAwareTokenBuilder', () => {
+
+    const sampleGrammar = `
+    entry Main:
+        INDENT name=ID DEDENT;
+
+    terminal ID: /[a-zA-Z_]\\w*/;
+    hidden terminal NL: /[\\r\\n]+/;
+    hidden terminal WS: /[\\t ]+/;
+    terminal INDENT: 'synthetic:indent';
+    terminal DEDENT: 'synthetic:dedent';
+    `;
+
+    test('Moves indent/dedent token types to the beginning', async () => {
+        const tokenTypes = await getTokens(sampleGrammar);
+
+        expect(tokenTypes).toHaveLength(5);
+
+        const [dedent, indent, ws] = tokenTypes;
+        expect(dedent.name).toBe('DEDENT');
+        expect(indent.name).toBe('INDENT');
+        expect(ws.name).toBe('WS');
+    });
+
+    test('Modifies indent/dedent patterns to be functions', async () => {
+        const tokenTypes = await getTokens(sampleGrammar);
+
+        expect(tokenTypes).toHaveLength(5);
+
+        const [dedent, indent] = tokenTypes;
+        expect(dedent.PATTERN).toBeTypeOf('function');
+        expect(indent.PATTERN).toBeTypeOf('function');
+    });
+
+    test('Rejects grammar without indent', async () => {
+        const indentlessGrammar = `
+        entry Main: name=ID;
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        hidden terminal WS: /\\s+/;
+        terminal DEDENT: 'synthetic:dedent';
+        `;
+
+        await expect(getTokens(indentlessGrammar)).rejects.toThrowError();
+    });
+
+    test('Rejects grammar without dedent', async () => {
+        const dedentlessGrammar = `
+        entry Main: name=ID;
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        hidden terminal WS: /\\s+/;
+        terminal INDENT: 'synthetic:indent';
+        `;
+
+        await expect(getTokens(dedentlessGrammar)).rejects.toThrowError();
+    });
+
+    test('Rejects grammar without whitespace', async () => {
+        const spacelessGrammar = `
+        entry Main: name=ID;
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        `;
+
+        await expect(getTokens(spacelessGrammar)).rejects.toThrowError();
+    });
+
+});
+
+describe('indentationAwareLexer', () => {
+
+    const sampleGrammar = `
+    grammar Test
+
+    entry Block: '{' INDENT names+=ID* DEDENT nested+=Block* '}';
+
+    terminal ID: /[a-zA-Z_]\\w*/;
+    hidden terminal NL: /[\\r\\n]+/;
+    hidden terminal WS: /[\\t ]+/;
+    terminal INDENT: 'synthetic:indent';
+    terminal DEDENT: 'synthetic:dedent';
+    hidden terminal ML_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//;
+    hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
+    `;
+
+    test('should emit indent/dedent tokens around a block', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const {tokens, errors} = lexer.tokenize(`{
+    name
+    anotherName
+}`);
+
+        expect(errors).toHaveLength(0);
+        expect(tokens).toHaveLength(6);
+
+        const [/* L_BRAC */, indent, /* id1 */, /* id2 */, dedent, /* _R_BRAC */] = tokens;
+        expect(indent.tokenType.name).toBe('INDENT');
+        expect(dedent.tokenType.name).toBe('DEDENT');
+    });
+
+    test('should ignore indent tokens before comments', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const {tokens, errors} = lexer.tokenize(`// single-line comment
+            // indented comment when not expecting indentation
+{
+    name
+        // comment with different indentation inside block
+    anotherName
+}`);
+
+        expect(errors).toHaveLength(0);
+        expect(tokens).toHaveLength(6);
+    });
+
+    test('should not dedect indentation without a newline', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const {tokens} = lexer.tokenize(`{    name
+        // indented comment - to be ignored
+}`);
+        expect(tokens).toHaveLength(3);
+        expect(tokens[1]).not.toBe('INDENT');
+    });
+
+    test('should add remaining dedents to the end', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const {tokens} = lexer.tokenize(`// single-line comment
+{
+    name`);
+        expect(tokens).toHaveLength(4);
+
+        const [/* L_BRAC */, indent, /* id */, dedent] = tokens;
+        expect(indent.tokenType.name).toBe('INDENT');
+        expect(dedent.tokenType.name).toBe('DEDENT');
+    });
+
+});
+
+describe('indentationAware parsing', () => {
+
+    const sampleGrammar = `
+    grammar PythonIf
+
+    entry Statement: If | Return;
+
+    If:
+        'if' condition=BOOLEAN ':'
+        INDENT thenBody=Statement DEDENT
+        ('else' ':' INDENT elseBody=Statement DEDENT)?;
+
+    Return: 'return' value=BOOLEAN;
+
+    terminal BOOLEAN: /true|false/;
+    terminal INDENT: 'synthetic:indent';
+    terminal DEDENT: 'synthetic:dedent';
+    hidden terminal NL: /[\\r\\n]+/;
+    hidden terminal WS: /[\\t ]+/;
+    `;
+
+    test('should parse correctly indented code', async () => {
+        const parser = await getParser(sampleGrammar);
+        const { parserErrors } = parser.parse(`
+if true:
+    return false
+else:
+    return true`);
+
+        expect(parserErrors).toHaveLength(0);
+    });
+
+    test('should error on non-matching dedent', async () => {
+        const parser = await getParser(sampleGrammar);
+        const {parserErrors} = parser.parse(`
+if true:
+    return false
+  else:
+    return true`);
+
+        expect(parserErrors.length).toBeGreaterThan(0);
+    });
+
+    test('should throw an error on unexpected indent', async () => {
+        const parser = await getParser(sampleGrammar);
+        const { parserErrors } = parser.parse(`
+        if true:
+            return false`);
+
+        expect(parserErrors.length).toBeGreaterThan(0);
+    });
+
+    test('should correctly parse nested blocks', async () => {
+        const parser = await getParser(sampleGrammar);
+        const {parserErrors} = parser.parse(`
+if true:
+    return true
+else:
+    if false:
+        return false
+        `);
+
+        expect(parserErrors).toHaveLength(0);
+    });
+
+});

From 657b7bc25d8d55fc739ca8e9ceba9decaf26e108 Mon Sep 17 00:00:00 2001
From: Mark Sujew <mark.sujew@typefox.io>
Date: Wed, 17 Jul 2024 16:00:08 +0200
Subject: [PATCH 13/14] Formatting and minor optimizations

---
 .../langium/src/parser/indentation-aware.ts   |  37 ++--
 packages/langium/src/parser/index.ts          |   2 +-
 .../test/parser/indentation-aware.test.ts     | 198 +++++++++++-------
 3 files changed, 142 insertions(+), 95 deletions(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index 0be75de99..194acd198 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -69,12 +69,12 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
     /**
      * The token type to be used for indentation tokens
      */
-    protected indentTokenType: TokenType;
+    readonly indentTokenType: TokenType;
 
     /**
      * The token type to be used for dedentation tokens
      */
-    protected dedentTokenType: TokenType;
+    readonly dedentTokenType: TokenType;
 
     /**
      * A regular expression to match a series of tabs and/or spaces.
@@ -108,7 +108,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
             throw new Error('Invalid tokens built by default builder');
         }
 
-        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
+        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
 
         // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
         // Order should be: dedent, indent, spaces
@@ -190,13 +190,13 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
      * @param groups Token Groups
      */
     protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
-        const {indentTokenName} = this.options;
+        const { indentTokenName } = this.options;
 
         if (!this.isStartOfLine(text, offset)) {
             return null;
         }
 
-        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
 
         if (currIndentLevel <= prevIndentLevel) {
             // shallower indentation (should be matched by dedent)
@@ -227,13 +227,13 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
      * @param groups Token Groups
      */
     protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
-        const {dedentTokenName} = this.options;
+        const { dedentTokenName } = this.options;
 
         if (!this.isStartOfLine(text, offset)) {
             return null;
         }
 
-        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
 
         if (currIndentLevel >= prevIndentLevel) {
             // bigger indentation (should be matched by indent)
@@ -270,7 +270,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
 
     protected override buildTerminalToken(terminal: TerminalRule): TokenType {
         const tokenType = super.buildTerminalToken(terminal);
-        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;
+        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
 
         if (tokenType.name === indentTokenName) {
             return this.indentTokenType;
@@ -321,42 +321,45 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
  * ```
  */
 export class IndentationAwareLexer extends DefaultLexer {
-    protected readonly indentationTokenBuilder?: IndentationAwareTokenBuilder;
+
+    protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder;
 
     constructor(services: LangiumCoreServices) {
         super(services);
         if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) {
             this.indentationTokenBuilder = services.parser.TokenBuilder;
         } else {
-            console.warn('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
+            throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
         }
     }
 
     override tokenize(text: string): LexerResult {
         const result = super.tokenize(text);
 
-        if (this.indentationTokenBuilder === undefined) {
-            return result;
-        }
-
         // reset the indent stack between processing of different text inputs
         const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
         result.tokens.push(...remainingDedents);
 
         // remove any "indent-dedent" pair with an empty body as these are typically
         // added by comments or lines with just whitespace but have no real value
-        const { indentTokenName, dedentTokenName } = this.indentationTokenBuilder.options;
+        const { indentTokenType, dedentTokenType } = this.indentationTokenBuilder;
+        // Use tokenTypeIdx for fast comparison
+        const indentTokenIdx = indentTokenType.tokenTypeIdx;
+        const dedentTokenIdx = dedentTokenType.tokenTypeIdx;
         const cleanTokens: IToken[] = [];
-        for (let i = 0; i < result.tokens.length; i++) {
+        const length = result.tokens.length;
+        for (let i = 0; i < length; i++) {
             const token = result.tokens[i];
             const nextToken = result.tokens[i + 1];
-            if (token.tokenType.name === indentTokenName && nextToken?.tokenType.name === dedentTokenName) {
+            if (token.tokenTypeIdx === indentTokenIdx && nextToken?.tokenTypeIdx === dedentTokenIdx) {
                 i++;
                 continue;
             }
 
             cleanTokens.push(token);
         }
+        // No separate push for the last token: the loop visits every index (nextToken is
+        // undefined past the end), so empty input and a trailing indent/dedent pair are safe
         result.tokens = cleanTokens;
 
         return result;
diff --git a/packages/langium/src/parser/index.ts b/packages/langium/src/parser/index.ts
index 17b49b62c..fab284a9c 100644
--- a/packages/langium/src/parser/index.ts
+++ b/packages/langium/src/parser/index.ts
@@ -7,10 +7,10 @@
 export * from './async-parser.js';
 export * from './completion-parser-builder.js';
 export * from './cst-node-builder.js';
+export * from './indentation-aware.js';
 export * from './langium-parser-builder.js';
 export * from './langium-parser.js';
 export * from './lexer.js';
 export * from './parser-config.js';
 export * from './token-builder.js';
 export * from './value-converter.js';
-export * from './indentation-aware.js';
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index 8033c7873..1e376c285 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -5,10 +5,12 @@
  ******************************************************************************/
 
 import type { TokenType } from '@chevrotain/types';
-import type { Grammar, LangiumParser, Lexer } from 'langium';
-import { beforeAll, describe, expect, test } from 'vitest';
+import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
+import { beforeEach, describe, expect, test } from 'vitest';
 import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
 import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
+import type { LangiumServices, PartialLangiumServices } from 'langium/lsp';
+import { expandToString } from 'langium/generate';
 import { parseHelper } from 'langium/test';
 
 const grammarServices = createLangiumGrammarServices(EmptyFileSystem).grammar;
@@ -22,36 +24,43 @@ async function getTokens(grammarString: string): Promise<TokenType[]> {
 }
 
 async function getLexer(grammar: string): Promise<Lexer> {
-    const services = await createServicesForGrammar({ grammar });
-    services.parser.TokenBuilder = tokenBuilder;
-    services.parser.Lexer = new IndentationAwareLexer(services);
+    const services = await createIndentationAwareServices(grammar);
     return services.parser.Lexer;
 }
 
 async function getParser(grammar: string): Promise<LangiumParser> {
+    const services = await createIndentationAwareServices(grammar);
+    return services.parser.LangiumParser;
+}
+
+async function createIndentationAwareServices(grammar: string): Promise<LangiumServices> {
     const services = await createServicesForGrammar({
-        grammar
+        grammar,
+        module: {
+            parser: {
+                TokenBuilder: () => new IndentationAwareTokenBuilder(),
+                Lexer: services => new IndentationAwareLexer(services)
+            }
+        } satisfies Module<LangiumServices, PartialLangiumServices>
     });
-    services.parser.TokenBuilder = tokenBuilder;
-    services.parser.Lexer = new IndentationAwareLexer(services);
-    return services.parser.LangiumParser;
+    return services;
 }
 
-beforeAll(() => {
+beforeEach(() => {
     tokenBuilder.popRemainingDedents('');
 });
 
-describe('indentationAwareTokenBuilder', () => {
+describe('IndentationAwareTokenBuilder', () => {
 
     const sampleGrammar = `
-    entry Main:
-        INDENT name=ID DEDENT;
-
-    terminal ID: /[a-zA-Z_]\\w*/;
-    hidden terminal NL: /[\\r\\n]+/;
-    hidden terminal WS: /[\\t ]+/;
-    terminal INDENT: 'synthetic:indent';
-    terminal DEDENT: 'synthetic:dedent';
+        entry Main:
+            INDENT name=ID DEDENT;
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
     `;
 
     test('Moves indent/dedent token types to the beginning', async () => {
@@ -113,28 +122,29 @@ describe('indentationAwareTokenBuilder', () => {
 
 });
 
-describe('indentationAwareLexer', () => {
+describe('IndentationAwareLexer', () => {
 
     const sampleGrammar = `
-    grammar Test
+        grammar Test
 
-    entry Block: '{' INDENT names+=ID* DEDENT nested+=Block* '}';
+        entry Block: '{' INDENT names+=ID* DEDENT nested+=Block* '}';
 
-    terminal ID: /[a-zA-Z_]\\w*/;
-    hidden terminal NL: /[\\r\\n]+/;
-    hidden terminal WS: /[\\t ]+/;
-    terminal INDENT: 'synthetic:indent';
-    terminal DEDENT: 'synthetic:dedent';
-    hidden terminal ML_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//;
-    hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
+        terminal ID: /[a-zA-Z_]\\w*/;
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        hidden terminal ML_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//;
+        hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
     `;
 
     test('should emit indent/dedent tokens around a block', async () => {
         const lexer = await getLexer(sampleGrammar);
-        const {tokens, errors} = lexer.tokenize(`{
-    name
-    anotherName
-}`);
+        const { tokens, errors } = lexer.tokenize(expandToString`
+        {
+            name
+            anotherName
+        }`);
 
         expect(errors).toHaveLength(0);
         expect(tokens).toHaveLength(6);
@@ -146,13 +156,14 @@ describe('indentationAwareLexer', () => {
 
     test('should ignore indent tokens before comments', async () => {
         const lexer = await getLexer(sampleGrammar);
-        const {tokens, errors} = lexer.tokenize(`// single-line comment
+        const { tokens, errors } = lexer.tokenize(expandToString`
+        // single-line comment
             // indented comment when not expecting indentation
-{
-    name
-        // comment with different indentation inside block
-    anotherName
-}`);
+        {
+            name
+                // comment with different indentation inside block
+            anotherName
+        }`);
 
         expect(errors).toHaveLength(0);
         expect(tokens).toHaveLength(6);
@@ -160,18 +171,20 @@ describe('indentationAwareLexer', () => {
 
     test('should not dedect indentation without a newline', async () => {
         const lexer = await getLexer(sampleGrammar);
-        const {tokens} = lexer.tokenize(`{    name
-        // indented comment - to be ignored
-}`);
+        const { tokens } = lexer.tokenize(expandToString`
+        { name
+            // indented comment - to be ignored
+        }`);
         expect(tokens).toHaveLength(3);
-        expect(tokens[1]).not.toBe('INDENT');
+        expect(tokens[1].tokenType.name).not.toBe('INDENT');
     });
 
     test('should add remaining dedents to the end', async () => {
         const lexer = await getLexer(sampleGrammar);
-        const {tokens} = lexer.tokenize(`// single-line comment
-{
-    name`);
+        const { tokens } = lexer.tokenize(expandToString`
+        // single-line comment
+        {
+            name`);
         expect(tokens).toHaveLength(4);
 
         const [/* L_BRAC */, indent, /* id */, dedent] = tokens;
@@ -181,69 +194,100 @@ describe('indentationAwareLexer', () => {
 
 });
 
-describe('indentationAware parsing', () => {
+describe('IndentationAware parsing', () => {
 
     const sampleGrammar = `
-    grammar PythonIf
+        grammar PythonIf
 
-    entry Statement: If | Return;
+        entry Statement: If | Return;
 
-    If:
-        'if' condition=BOOLEAN ':'
-        INDENT thenBody=Statement DEDENT
-        ('else' ':' INDENT elseBody=Statement DEDENT)?;
+        If:
+            'if' condition=BOOLEAN ':'
+            INDENT thenBlock+=Statement+ DEDENT
+            ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;
 
-    Return: 'return' value=BOOLEAN;
+        Return: 'return' value=BOOLEAN;
 
-    terminal BOOLEAN: /true|false/;
-    terminal INDENT: 'synthetic:indent';
-    terminal DEDENT: 'synthetic:dedent';
-    hidden terminal NL: /[\\r\\n]+/;
-    hidden terminal WS: /[\\t ]+/;
+        terminal BOOLEAN returns boolean: /true|false/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
     `;
 
     test('should parse correctly indented code', async () => {
         const parser = await getParser(sampleGrammar);
-        const { parserErrors } = parser.parse(`
-if true:
-    return false
-else:
-    return true`);
+        const { parserErrors } = parser.parse(expandToString`
+        if true:
+            return false
+        else:
+            return true
+        `);
 
         expect(parserErrors).toHaveLength(0);
     });
 
     test('should error on non-matching dedent', async () => {
         const parser = await getParser(sampleGrammar);
-        const {parserErrors} = parser.parse(`
-if true:
-    return false
-  else:
-    return true`);
+        const { parserErrors } = parser.parse(expandToString`
+        if true:
+            return false
+          else:
+            return true
+        `);
 
         expect(parserErrors.length).toBeGreaterThan(0);
     });
 
     test('should throw an error on unexpected indent', async () => {
         const parser = await getParser(sampleGrammar);
-        const { parserErrors } = parser.parse(`
-        if true:
-            return false`);
+        const { parserErrors } = parser.parse(expandToString`
+        // Parsing starts here
+                if true:
+                    return false
+        `);
 
         expect(parserErrors.length).toBeGreaterThan(0);
     });
 
     test('should correctly parse nested blocks', async () => {
         const parser = await getParser(sampleGrammar);
-        const {parserErrors} = parser.parse(`
-if true:
-    return true
-else:
-    if false:
-        return false
+        const { parserErrors, value } = parser.parse(expandToString`
+        if true:
+            return true
+        else:
+            if false:
+                return true
+                return false
+            return true
         `);
 
         expect(parserErrors).toHaveLength(0);
+        const ifValue = value as If;
+        expect(ifValue.thenBlock).toHaveLength(1);
+        expect(ifValue.elseBlock).toHaveLength(2);
+        const elseBlock = ifValue.elseBlock![0] as If;
+        expect(elseBlock.thenBlock).toHaveLength(2);
+        const nestedReturn1 = elseBlock.thenBlock[0] as Return;
+        expect(nestedReturn1.value).toBe(true);
+        const nestedReturn2 = elseBlock.thenBlock[1] as Return;
+        expect(nestedReturn2.value).toBe(false);
+        const return2 = ifValue.elseBlock![1] as Return;
+        expect(return2.value).toBe(true);
     });
 
 });
+
+type Statement = If | Return;
+
+interface If extends AstNode {
+    $type: 'If';
+    condition: boolean;
+    thenBlock: Statement[];
+    elseBlock?: Statement[];
+}
+
+interface Return extends AstNode {
+    $type: 'Return';
+    value: boolean;
+}

From a609a701d7bc5dd9d9bce04be4ac0a08457ef90d Mon Sep 17 00:00:00 2001
From: Mark Sujew <mark.sujew@typefox.io>
Date: Wed, 17 Jul 2024 16:42:54 +0200
Subject: [PATCH 14/14] Fix nullability

---
 packages/langium/test/parser/indentation-aware.test.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index 1e376c285..269da3c2a 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -266,13 +266,13 @@ describe('IndentationAware parsing', () => {
         const ifValue = value as If;
         expect(ifValue.thenBlock).toHaveLength(1);
         expect(ifValue.elseBlock).toHaveLength(2);
-        const elseBlock = ifValue.elseBlock![0] as If;
+        const elseBlock = ifValue.elseBlock[0] as If;
         expect(elseBlock.thenBlock).toHaveLength(2);
         const nestedReturn1 = elseBlock.thenBlock[0] as Return;
         expect(nestedReturn1.value).toBe(true);
         const nestedReturn2 = elseBlock.thenBlock[1] as Return;
         expect(nestedReturn2.value).toBe(false);
-        const return2 = ifValue.elseBlock![1] as Return;
+        const return2 = ifValue.elseBlock[1] as Return;
         expect(return2.value).toBe(true);
     });
 
@@ -284,7 +284,7 @@ interface If extends AstNode {
     $type: 'If';
     condition: boolean;
     thenBlock: Statement[];
-    elseBlock?: Statement[];
+    elseBlock: Statement[];
 }
 
 interface Return extends AstNode {