From 6074e38a0769cfdd5051aec41e7f8001787dd2e8 Mon Sep 17 00:00:00 2001 From: Abdelrahman Aly Abounegm Date: Sun, 14 Jul 2024 04:18:23 +0300 Subject: [PATCH 01/14] Add indentation-aware TokenBuilder and Lexer This is useful for languages where indentation is used as a block delimiter, such as Python --- packages/langium/src/parser/lexer.ts | 40 +++ packages/langium/src/parser/token-builder.ts | 259 +++++++++++++++++++ 2 files changed, 299 insertions(+) diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts index eec23ea12..33296acc3 100644 --- a/packages/langium/src/parser/lexer.ts +++ b/packages/langium/src/parser/lexer.ts @@ -6,6 +6,7 @@ import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain'; import type { LangiumCoreServices } from '../services.js'; +import { IndentationAwareTokenBuilder } from "./token-builder.js"; import { Lexer as ChevrotainLexer } from 'chevrotain'; export interface LexerResult { @@ -66,6 +67,45 @@ export class DefaultLexer implements Lexer { } } +/** + * A lexer that is aware of indentation in the input text. + * The only purpose of this lexer is to reset the internal state of the {@link IndentationAwareTokenBuilder} + * between the tokenization of different text inputs. 
+ * + * In your module, you can override the default lexer with this one as such: + * ```ts + * parser: { + * TokenBuilder: () => new IndentationAwareTokenBuilder(), + * Lexer: (services) => new IndentationAwareLexer(services), + * } + * ``` + */ +export class IndentationAwareLexer extends DefaultLexer { + private indentationTokenBuilder?: IndentationAwareTokenBuilder; + + constructor(services: LangiumCoreServices) { + super(services); + if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) { + this.indentationTokenBuilder = services.parser.TokenBuilder; + } + } + + override tokenize(text: string): LexerResult { + const result = super.tokenize(text); + + if (!this.indentationTokenBuilder) { + // A token builder other than the expected IndentationAwareTokenBuilder is used + return result; + } + + // reset the indent stack between processing of different text inputs + const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text); + result.tokens.push(...remainingDedents); + + return result; + } +} + /** * Returns a check whether the given TokenVocabulary is TokenType array */ diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index 4320c5ca6..79d59d8b2 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -118,3 +118,262 @@ export class DefaultTokenBuilder implements TokenBuilder { }, []); } } + + +export interface IndentationTokenBuilderOptions { + /** + * The name of the token used to denote indentation in the grammar. + * A possible definition in the grammar could look like this: + * ```langium + * terminal INDENT: ':synthetic-indent:'; + * ``` + * + * @default "INDENT" + */ + indentTokenName: string; + /** + * The name of the token used to denote deindentation in the grammar. 
+ * A possible definition in the grammar could look like this: + * ```langium + * terminal DEDENT: ':synthetic-dedent:'; + * ``` + * + * @default "DEDENT" + */ + dedentTokenName: string; + /** + * The name of the token used to denote whitespace other than indentation and newlines in the grammar. + * A possible definition in the grammar could look like this: + * ```langium + * hidden terminal WS: /[ \t]+/; + * ``` + * + * @default "WS" + */ + whitespaceTokenName: string; +} + +const indetationBuilderDefaultOptions: IndentationTokenBuilderOptions = { + indentTokenName: 'INDENT', + dedentTokenName: 'DEDENT', + whitespaceTokenName: 'WS', +}; + +/** + * A token builder that is sensitive to indentation in the input text. + * It will generate tokens for indentation and dedentation based on the indentation level. + * + * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js + */ +export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { + private indentationStack: number[] = [0]; + private options: IndentationTokenBuilderOptions; + + /** + * The token type to be used for indentation tokens + */ + protected indentTokenType: TokenType; + + /** + * The token type to be used for dedentation tokens + */ + protected dedentTokenType: TokenType; + + /** + * A regular expression to match a series of tabs and/or spaces. + * Override this to customize what the indentation is allowed to consist of. 
+ */ + protected whitespaceRegExp = /[ \t]+/y; + + constructor(options: Partial<IndentationTokenBuilderOptions> = indetationBuilderDefaultOptions) { + super(); + this.options = { + ...indetationBuilderDefaultOptions, + ...options, + }; + + this.indentTokenType = createToken({ + name: this.options.indentTokenName, + pattern: this.indentMatcher, + line_breaks: false, + }); + + this.dedentTokenType = createToken({ + name: this.options.dedentTokenName, + pattern: this.dedentMatcher, + line_breaks: false, + }); + } + + override buildTokens(grammar: GrammarAST.Grammar, options?: TokenBuilderOptions | undefined) { + const tokenTypes = super.buildTokens(grammar, options); + if (!isTokenTypeArray(tokenTypes)) { + throw new Error('Invalid tokens built by default builder'); + } + + const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; + + // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well + // Order should be: dedent, indent, spaces + const dedent = tokenTypes.find(tok => tok.name === dedentTokenName); + const indent = tokenTypes.find(tok => tok.name === indentTokenName); + const ws = tokenTypes.find(tok => tok.name === whitespaceTokenName); + if (!dedent || !indent || !ws) { + throw new Error('Some indentation/whitespace tokens not found!'); + } + + const spaceTokens = [dedent, indent, ws]; + const otherTokens = tokenTypes.filter(tok => !spaceTokens.includes(tok)); + return [...spaceTokens, ...otherTokens]; + } + + private isStartOfLine(text: string, offset: number): boolean { + return offset === 0 || '\r\n'.includes(text[offset - 1]); + } + + private matchWhitespace(text: string, offset: number) { + this.whitespaceRegExp.lastIndex = offset; + const match = this.whitespaceRegExp.exec(text); + return { + currIndentLevel: match?.[0].length ??
0, + prevIndentLevel: this.indentationStack.at(-1)!, + match, + }; + } + + private createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) { + const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length; + return createTokenInstance( + tokenType, + image, + offset, offset + image.length, + lineNumber, lineNumber, + 0, image.length, + ); + } + + /** + * A custom pattern for matching indents + * + * @param text The full input string. + * @param offset The offset at which to attempt a match + * @param tokens Previously scanned Tokens + * @param groups Token Groups + */ + protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, groups) => { + const {indentTokenName} = this.options; + + if (!this.isStartOfLine(text, offset)) { + return null; + } + + const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset); + + if (currIndentLevel <= prevIndentLevel) { + // shallower indentation (should be matched by dedent) + // or same indentation level (should be matched by whitespace and ignored) + return null; + } + + this.indentationStack.push(currIndentLevel); + + const indentToken = this.createIndentationTokenInstance( + this.indentTokenType, + text, + match?.[0] ?? indentTokenName, + offset, + ); + tokens.push(indentToken); + + // Token already added, let the indentation now be consumed as whitespace and ignored + return null; + }; + + /** + * A custom pattern for matching dedents + * + * @param text The full input string. 
+ * @param offset The offset at which to attempt a match + * @param tokens Previously scanned Tokens + * @param groups Token Groups + */ + protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, groups) => { + const {dedentTokenName} = this.options; + + if (!this.isStartOfLine(text, offset)) { + return null; + } + + const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset); + + if (currIndentLevel >= prevIndentLevel) { + // bigger indentation (should be matched by indent) + // or same indentation level (should be matched by whitespace and ignored) + return null; + } + + const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel); + + // Any dedent must match some previous indentation level. + if (matchIndentIndex === -1) { + console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`); + // throwing an error would crash the language server + // TODO: find a way to report error diagnostics message + return null; + } + + const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1; + + for (let i = 0; i < numberOfDedents; i++) { + const token = this.createIndentationTokenInstance( + this.dedentTokenType, + text, + match?.[0] ?? 
dedentTokenName, + offset, + ); + tokens.push(token); + this.indentationStack.pop(); + } + + // Token already added, let the dedentation now be consumed as whitespace and ignored + return null; + }; + + protected override buildTerminalToken(terminal: GrammarAST.TerminalRule): TokenType { + const tokenType = super.buildTerminalToken(terminal); + const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; + + if (tokenType.name === indentTokenName) { + return this.indentTokenType; + } else if (tokenType.name === dedentTokenName) { + return this.dedentTokenType; + } else if (tokenType.name === whitespaceTokenName) { + return createToken({ + name: whitespaceTokenName, + pattern: this.whitespaceRegExp, + group: Lexer.SKIPPED, + }); + } + + return tokenType; + } + + /** + * Resets the indentation stack between different runs of the lexer + * + * @param text Full text that was tokenized + * @returns Remaining dedent tokens to match all previous indents at the end of the file + */ + public popRemainingDedents(text: string) { + const remainingDedents: IToken[] = []; + while (this.indentationStack.length > 1) { + remainingDedents.push( + this.createIndentationTokenInstance(this.dedentTokenType, text, this.options.dedentTokenName, text.length) + ); + this.indentationStack.pop(); + } + + this.indentationStack = [0]; + return remainingDedents; + } +} From f4943e7449fc8a92f26875be517597c10f873e45 Mon Sep 17 00:00:00 2001 From: Abdelrahman Aly Abounegm Date: Sun, 14 Jul 2024 13:52:52 +0300 Subject: [PATCH 02/14] Replace double quotes with single quotes For consistency with the rest of the codebase --- packages/langium/src/parser/lexer.ts | 2 +- packages/langium/src/parser/token-builder.ts | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts index 33296acc3..4f3d27a5c 100644 --- a/packages/langium/src/parser/lexer.ts +++ b/packages/langium/src/parser/lexer.ts @@ -6,7 
+6,7 @@ import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain'; import type { LangiumCoreServices } from '../services.js'; -import { IndentationAwareTokenBuilder } from "./token-builder.js"; +import { IndentationAwareTokenBuilder } from './token-builder.js'; import { Lexer as ChevrotainLexer } from 'chevrotain'; export interface LexerResult { diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index 79d59d8b2..72b491ce8 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -128,7 +128,7 @@ export interface IndentationTokenBuilderOptions { * terminal INDENT: ':synthetic-indent:'; * ``` * - * @default "INDENT" + * @default 'INDENT' */ indentTokenName: string; /** @@ -138,7 +138,7 @@ export interface IndentationTokenBuilderOptions { * terminal DEDENT: ':synthetic-dedent:'; * ``` * - * @default "DEDENT" + * @default 'DEDENT' */ dedentTokenName: string; /** @@ -148,7 +148,7 @@ export interface IndentationTokenBuilderOptions { * hidden terminal WS: /[ \t]+/; * ``` * - * @default "WS" + * @default 'WS' */ whitespaceTokenName: string; } From 7d85c475411dbdf1ed11748c4c78766295c6bc81 Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Sun, 14 Jul 2024 14:37:36 +0300 Subject: [PATCH 03/14] Add missing imports --- packages/langium/src/parser/token-builder.ts | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index 72b491ce8..ef8ef1e70 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -4,15 +4,16 @@ * terms of the MIT License, which is available in the project root. 
******************************************************************************/ -import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain'; +import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary, IToken } from 'chevrotain'; import type { AbstractRule, Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js'; import type { Stream } from '../utils/stream.js'; -import { Lexer } from 'chevrotain'; +import { createToken, createTokenInstance, Lexer } from 'chevrotain'; import { isKeyword, isParserRule, isTerminalRule } from '../languages/generated/ast.js'; import { streamAllContents } from '../utils/ast-utils.js'; import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js'; import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js'; import { stream } from '../utils/stream.js'; +import { isTokenTypeArray } from './lexer.js'; export interface TokenBuilderOptions { caseInsensitive?: boolean @@ -119,7 +120,6 @@ export class DefaultTokenBuilder implements TokenBuilder { } } - export interface IndentationTokenBuilderOptions { /** * The name of the token used to denote indentation in the grammar. 
@@ -205,7 +205,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { }); } - override buildTokens(grammar: GrammarAST.Grammar, options?: TokenBuilderOptions | undefined) { + override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) { const tokenTypes = super.buildTokens(grammar, options); if (!isTokenTypeArray(tokenTypes)) { throw new Error('Invalid tokens built by default builder'); @@ -260,7 +260,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { * @param tokens Previously scanned Tokens * @param groups Token Groups */ - protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, groups) => { + protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => { const {indentTokenName} = this.options; if (!this.isStartOfLine(text, offset)) { @@ -297,7 +297,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { * @param tokens Previously scanned Tokens * @param groups Token Groups */ - protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, groups) => { + protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => { const {dedentTokenName} = this.options; if (!this.isStartOfLine(text, offset)) { @@ -339,7 +339,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { return null; }; - protected override buildTerminalToken(terminal: GrammarAST.TerminalRule): TokenType { + protected override buildTerminalToken(terminal: TerminalRule): TokenType { const tokenType = super.buildTerminalToken(terminal); const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; From e3963fe72c3f3f0b904501cefcfb3a0ae79ff134 Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Sun, 14 Jul 2024 22:29:42 +0300 Subject: [PATCH 04/14] Make all private fields protected and add JSDoc --- packages/langium/src/parser/lexer.ts | 2 +- packages/langium/src/parser/token-builder.ts | 37 
+++++++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts index 4f3d27a5c..f80df9335 100644 --- a/packages/langium/src/parser/lexer.ts +++ b/packages/langium/src/parser/lexer.ts @@ -81,7 +81,7 @@ export class DefaultLexer implements Lexer { * ``` */ export class IndentationAwareLexer extends DefaultLexer { - private indentationTokenBuilder?: IndentationAwareTokenBuilder; + protected readonly indentationTokenBuilder?: IndentationAwareTokenBuilder; constructor(services: LangiumCoreServices) { super(services); diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index ef8ef1e70..74469662e 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -166,8 +166,12 @@ const indetationBuilderDefaultOptions: IndentationTokenBuilderOptions = { * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js */ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { - private indentationStack: number[] = [0]; - private options: IndentationTokenBuilderOptions; + /** + * The stack in which all the previous matched indentation levels are stored + * to understand how deep a the next tokens are nested. + */ + protected indentationStack: number[] = [0]; + protected options: IndentationTokenBuilderOptions; /** * The token type to be used for indentation tokens @@ -227,11 +231,25 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { return [...spaceTokens, ...otherTokens]; } - private isStartOfLine(text: string, offset: number): boolean { + /** + * Helper function to check if the current position is the start of a new line. + * + * @param text The full input string. 
+ * @param offset The current position at which to check + * @returns Whether the current position is the start of a new line + */ + protected isStartOfLine(text: string, offset: number): boolean { return offset === 0 || '\r\n'.includes(text[offset - 1]); } - private matchWhitespace(text: string, offset: number) { + /** + * A helper function used in matching both indents and dedents. + * + * @param text The full input string. + * @param offset The current position at which to attempt a match + * @returns The current and previous indentation levels and the matched whitespace + */ + protected matchWhitespace(text: string, offset: number) { this.whitespaceRegExp.lastIndex = offset; const match = this.whitespaceRegExp.exec(text); return { @@ -241,7 +259,16 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { }; } - private createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) { + /** + * Helper function to create an instance of an indentation token. 
+ * + * @param tokenType Indent or dedent token type + * @param text Full input string, used to calculate the line number + * @param image The original image of the token (tabs or spaces) + * @param offset Current position in the input string + * @returns The indentation token instance + */ + protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) { const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length; return createTokenInstance( tokenType, From 2db70a00de472a7cd19781267453d47e1290d105 Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Sun, 14 Jul 2024 22:34:44 +0300 Subject: [PATCH 05/14] Require IndentationAwareTokenBuilder for the lexer --- packages/langium/src/parser/lexer.ts | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts index f80df9335..0eb0653b9 100644 --- a/packages/langium/src/parser/lexer.ts +++ b/packages/langium/src/parser/lexer.ts @@ -81,23 +81,20 @@ export class DefaultLexer implements Lexer { * ``` */ export class IndentationAwareLexer extends DefaultLexer { - protected readonly indentationTokenBuilder?: IndentationAwareTokenBuilder; + protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder; constructor(services: LangiumCoreServices) { super(services); if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) { this.indentationTokenBuilder = services.parser.TokenBuilder; + } else { + throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder'); } } override tokenize(text: string): LexerResult { const result = super.tokenize(text); - if (!this.indentationTokenBuilder) { - // A token builder other than the expected IndentationAwareTokenBuilder is used - return result; - } - // reset the indent stack between processing of different text inputs const remainingDedents = 
this.indentationTokenBuilder.popRemainingDedents(text); result.tokens.push(...remainingDedents); From b41957fbfa69a08da243261077b6e0dfee53c820 Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Sun, 14 Jul 2024 22:35:55 +0300 Subject: [PATCH 06/14] Export default options and fix typo --- packages/langium/src/parser/token-builder.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index 74469662e..eedb5b489 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -153,7 +153,7 @@ export interface IndentationTokenBuilderOptions { whitespaceTokenName: string; } -const indetationBuilderDefaultOptions: IndentationTokenBuilderOptions = { +export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = { indentTokenName: 'INDENT', dedentTokenName: 'DEDENT', whitespaceTokenName: 'WS', @@ -189,10 +189,10 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { */ protected whitespaceRegExp = /[ \t]+/y; - constructor(options: Partial<IndentationTokenBuilderOptions> = indetationBuilderDefaultOptions) { + constructor(options: Partial<IndentationTokenBuilderOptions> = indentationBuilderDefaultOptions) { super(); this.options = { - ...indetationBuilderDefaultOptions, + ...indentationBuilderDefaultOptions, ...options, }; From 24860caf0eb19e965b2b3499b8e43346eb6ae74a Mon Sep 17 00:00:00 2001 From: Abdelrahman Aly Abounegm Date: Sun, 14 Jul 2024 22:39:42 +0300 Subject: [PATCH 07/14] Add explicit return type to the function Co-authored-by: Mark Sujew --- packages/langium/src/parser/token-builder.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index eedb5b489..7f3882a0a 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -391,7 +391,7 @@ export class
IndentationAwareTokenBuilder extends DefaultTokenBuilder { * @param text Full text that was tokenized * @returns Remaining dedent tokens to match all previous indents at the end of the file */ - public popRemainingDedents(text: string) { + popRemainingDedents(text: string): IToken[] { const remainingDedents: IToken[] = []; while (this.indentationStack.length > 1) { remainingDedents.push( From 01fb746cc96ee45e81015294155f8c465d46f6cb Mon Sep 17 00:00:00 2001 From: Abdelrahman Aly Abounegm Date: Sun, 14 Jul 2024 22:40:18 +0300 Subject: [PATCH 08/14] Replace 4 iterations through tokens with 1 loop Co-authored-by: Mark Sujew --- packages/langium/src/parser/token-builder.ts | 23 ++++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index 7f3882a0a..e6b734d82 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -219,16 +219,25 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well // Order should be: dedent, indent, spaces - const dedent = tokenTypes.find(tok => tok.name === dedentTokenName); - const indent = tokenTypes.find(tok => tok.name === indentTokenName); - const ws = tokenTypes.find(tok => tok.name === whitespaceTokenName); + let dedent: TokenType | undefined; + let indent: TokenType | undefined; + let ws: TokenType | undefined; + const otherTokens: TokenType[] = []; + for (const tokenType of tokenTypes) { + if (tokenType.name === dedentTokenName) { + dedent = tokenType; + } else if (tokenType.name === indentTokenName) { + indent = tokenType; + } else if (tokenType.name === whitespaceTokenName) { + ws = tokenType; + } else { + otherTokens.push(tokenType); + } + } if (!dedent || !indent || !ws) { throw new Error('Some indentation/whitespace tokens 
not found!'); } - - const spaceTokens = [dedent, indent, ws]; - const otherTokens = tokenTypes.filter(tok => !spaceTokens.includes(tok)); - return [...spaceTokens, ...otherTokens]; + return [dedent, indent, ws, ...otherTokens]; } /** From 3efc2ba80a3dad1977c42c4809ae056af65bd99a Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Tue, 16 Jul 2024 19:20:46 +0300 Subject: [PATCH 09/14] Move indentation-aware classes to separate module --- .../langium/src/parser/indentation-aware.ts | 344 ++++++++++++++++++ packages/langium/src/parser/index.ts | 1 + packages/langium/src/parser/lexer.ts | 37 -- packages/langium/src/parser/token-builder.ts | 299 +-------------- 4 files changed, 347 insertions(+), 334 deletions(-) create mode 100644 packages/langium/src/parser/indentation-aware.ts diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts new file mode 100644 index 000000000..9c4bb4dec --- /dev/null +++ b/packages/langium/src/parser/indentation-aware.ts @@ -0,0 +1,344 @@ +/****************************************************************************** + * Copyright 2024 TypeFox GmbH + * This program and the accompanying materials are made available under the + * terms of the MIT License, which is available in the project root. 
+ ******************************************************************************/ + +import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain'; +import type { Grammar, TerminalRule } from '../languages/generated/ast.js'; +import type { TokenBuilderOptions } from './token-builder.js'; +import type { LexerResult } from './lexer.js'; +import type { LangiumCoreServices } from '../services.js'; +import { createToken, createTokenInstance, Lexer } from 'chevrotain'; +import { DefaultTokenBuilder } from './token-builder.js'; +import { DefaultLexer, isTokenTypeArray } from './lexer.js'; + +export interface IndentationTokenBuilderOptions { + /** + * The name of the token used to denote indentation in the grammar. + * A possible definition in the grammar could look like this: + * ```langium + * terminal INDENT: ':synthetic-indent:'; + * ``` + * + * @default 'INDENT' + */ + indentTokenName: string; + /** + * The name of the token used to denote deindentation in the grammar. + * A possible definition in the grammar could look like this: + * ```langium + * terminal DEDENT: ':synthetic-dedent:'; + * ``` + * + * @default 'DEDENT' + */ + dedentTokenName: string; + /** + * The name of the token used to denote whitespace other than indentation and newlines in the grammar. + * A possible definition in the grammar could look like this: + * ```langium + * hidden terminal WS: /[ \t]+/; + * ``` + * + * @default 'WS' + */ + whitespaceTokenName: string; +} + +export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = { + indentTokenName: 'INDENT', + dedentTokenName: 'DEDENT', + whitespaceTokenName: 'WS', +}; + +/** + * A token builder that is sensitive to indentation in the input text. + * It will generate tokens for indentation and dedentation based on the indentation level. 
+ * + * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js + */ +export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { + /** + * The stack in which all the previous matched indentation levels are stored + * to understand how deep a the next tokens are nested. + */ + protected indentationStack: number[] = [0]; + protected options: IndentationTokenBuilderOptions; + + /** + * The token type to be used for indentation tokens + */ + protected indentTokenType: TokenType; + + /** + * The token type to be used for dedentation tokens + */ + protected dedentTokenType: TokenType; + + /** + * A regular expression to match a series of tabs and/or spaces. + * Override this to customize what the indentation is allowed to consist of. + */ + protected whitespaceRegExp = /[ \t]+/y; + + constructor(options: Partial<IndentationTokenBuilderOptions> = indentationBuilderDefaultOptions) { + super(); + this.options = { + ...indentationBuilderDefaultOptions, + ...options, + }; + + this.indentTokenType = createToken({ + name: this.options.indentTokenName, + pattern: this.indentMatcher, + line_breaks: false, + }); + + this.dedentTokenType = createToken({ + name: this.options.dedentTokenName, + pattern: this.dedentMatcher, + line_breaks: false, + }); + } + + override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) { + const tokenTypes = super.buildTokens(grammar, options); + if (!isTokenTypeArray(tokenTypes)) { + throw new Error('Invalid tokens built by default builder'); + } + + const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; + + // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well + // Order should be: dedent, indent, spaces + let dedent: TokenType | undefined; + let indent: TokenType | undefined; + let ws: TokenType | undefined; + const otherTokens: TokenType[] = []; + for (const tokenType of tokenTypes) { + if
(tokenType.name === dedentTokenName) { + dedent = tokenType; + } else if (tokenType.name === indentTokenName) { + indent = tokenType; + } else if (tokenType.name === whitespaceTokenName) { + ws = tokenType; + } else { + otherTokens.push(tokenType); + } + } + if (!dedent || !indent || !ws) { + throw new Error('Some indentation/whitespace tokens not found!'); + } + return [dedent, indent, ws, ...otherTokens]; + } + + /** + * Helper function to check if the current position is the start of a new line. + * + * @param text The full input string. + * @param offset The current position at which to check + * @returns Whether the current position is the start of a new line + */ + protected isStartOfLine(text: string, offset: number): boolean { + return offset === 0 || '\r\n'.includes(text[offset - 1]); + } + + /** + * A helper function used in matching both indents and dedents. + * + * @param text The full input string. + * @param offset The current position at which to attempt a match + * @returns The current and previous indentation levels and the matched whitespace + */ + protected matchWhitespace(text: string, offset: number) { + this.whitespaceRegExp.lastIndex = offset; + const match = this.whitespaceRegExp.exec(text); + return { + currIndentLevel: match?.[0].length ?? 0, + prevIndentLevel: this.indentationStack.at(-1)!, + match, + }; + } + + /** + * Helper function to create an instance of an indentation token. 
+ * + * @param tokenType Indent or dedent token type + * @param text Full input string, used to calculate the line number + * @param image The original image of the token (tabs or spaces) + * @param offset Current position in the input string + * @returns The indentation token instance + */ + protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) { + const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length; + return createTokenInstance( + tokenType, + image, + offset, offset + image.length, + lineNumber, lineNumber, + 0, image.length, + ); + } + + /** + * A custom pattern for matching indents + * + * @param text The full input string. + * @param offset The offset at which to attempt a match + * @param tokens Previously scanned Tokens + * @param groups Token Groups + */ + protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => { + const {indentTokenName} = this.options; + + if (!this.isStartOfLine(text, offset)) { + return null; + } + + const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset); + + if (currIndentLevel <= prevIndentLevel) { + // shallower indentation (should be matched by dedent) + // or same indentation level (should be matched by whitespace and ignored) + return null; + } + + this.indentationStack.push(currIndentLevel); + + const indentToken = this.createIndentationTokenInstance( + this.indentTokenType, + text, + match?.[0] ?? indentTokenName, + offset, + ); + tokens.push(indentToken); + + // Token already added, let the indentation now be consumed as whitespace and ignored + return null; + }; + + /** + * A custom pattern for matching dedents + * + * @param text The full input string. 
+ * @param offset The offset at which to attempt a match + * @param tokens Previously scanned Tokens + * @param groups Token Groups + */ + protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => { + const {dedentTokenName} = this.options; + + if (!this.isStartOfLine(text, offset)) { + return null; + } + + const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset); + + if (currIndentLevel >= prevIndentLevel) { + // bigger indentation (should be matched by indent) + // or same indentation level (should be matched by whitespace and ignored) + return null; + } + + const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel); + + // Any dedent must match some previous indentation level. + if (matchIndentIndex === -1) { + console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`); + // throwing an error would crash the language server + // TODO: find a way to report error diagnostics message + return null; + } + + const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1; + + for (let i = 0; i < numberOfDedents; i++) { + const token = this.createIndentationTokenInstance( + this.dedentTokenType, + text, + match?.[0] ?? 
dedentTokenName, + offset, + ); + tokens.push(token); + this.indentationStack.pop(); + } + + // Token already added, let the dedentation now be consumed as whitespace and ignored + return null; + }; + + protected override buildTerminalToken(terminal: TerminalRule): TokenType { + const tokenType = super.buildTerminalToken(terminal); + const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; + + if (tokenType.name === indentTokenName) { + return this.indentTokenType; + } else if (tokenType.name === dedentTokenName) { + return this.dedentTokenType; + } else if (tokenType.name === whitespaceTokenName) { + return createToken({ + name: whitespaceTokenName, + pattern: this.whitespaceRegExp, + group: Lexer.SKIPPED, + }); + } + + return tokenType; + } + + /** + * Resets the indentation stack between different runs of the lexer + * + * @param text Full text that was tokenized + * @returns Remaining dedent tokens to match all previous indents at the end of the file + */ + popRemainingDedents(text: string): IToken[] { + const remainingDedents: IToken[] = []; + while (this.indentationStack.length > 1) { + remainingDedents.push( + this.createIndentationTokenInstance(this.dedentTokenType, text, this.options.dedentTokenName, text.length) + ); + this.indentationStack.pop(); + } + + this.indentationStack = [0]; + return remainingDedents; + } +} + +/** + * A lexer that is aware of indentation in the input text. + * The only purpose of this lexer is to reset the internal state of the {@link IndentationAwareTokenBuilder} + * between the tokenization of different text inputs. 
+ * + * In your module, you can override the default lexer with this one as such: + * ```ts + * parser: { + * TokenBuilder: () => new IndentationAwareTokenBuilder(), + * Lexer: (services) => new IndentationAwareLexer(services), + * } + * ``` + */ +export class IndentationAwareLexer extends DefaultLexer { + protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder; + + constructor(services: LangiumCoreServices) { + super(services); + if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) { + this.indentationTokenBuilder = services.parser.TokenBuilder; + } else { + throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder'); + } + } + + override tokenize(text: string): LexerResult { + const result = super.tokenize(text); + + // reset the indent stack between processing of different text inputs + const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text); + result.tokens.push(...remainingDedents); + + return result; + } +} diff --git a/packages/langium/src/parser/index.ts b/packages/langium/src/parser/index.ts index c0a9e8d7f..17b49b62c 100644 --- a/packages/langium/src/parser/index.ts +++ b/packages/langium/src/parser/index.ts @@ -13,3 +13,4 @@ export * from './lexer.js'; export * from './parser-config.js'; export * from './token-builder.js'; export * from './value-converter.js'; +export * from './indentation-aware.js'; diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts index 0eb0653b9..eec23ea12 100644 --- a/packages/langium/src/parser/lexer.ts +++ b/packages/langium/src/parser/lexer.ts @@ -6,7 +6,6 @@ import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain'; import type { LangiumCoreServices } from '../services.js'; -import { IndentationAwareTokenBuilder } from './token-builder.js'; import { Lexer as ChevrotainLexer } from 'chevrotain'; export interface LexerResult { 
@@ -67,42 +66,6 @@ export class DefaultLexer implements Lexer { } } -/** - * A lexer that is aware of indentation in the input text. - * The only purpose of this lexer is to reset the internal state of the {@link IndentationAwareTokenBuilder} - * between the tokenization of different text inputs. - * - * In your module, you can override the default lexer with this one as such: - * ```ts - * parser: { - * TokenBuilder: () => new IndentationAwareTokenBuilder(), - * Lexer: (services) => new IndentationAwareLexer(services), - * } - * ``` - */ -export class IndentationAwareLexer extends DefaultLexer { - protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder; - - constructor(services: LangiumCoreServices) { - super(services); - if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) { - this.indentationTokenBuilder = services.parser.TokenBuilder; - } else { - throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder'); - } - } - - override tokenize(text: string): LexerResult { - const result = super.tokenize(text); - - // reset the indent stack between processing of different text inputs - const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text); - result.tokens.push(...remainingDedents); - - return result; - } -} - /** * Returns a check whether the given TokenVocabulary is TokenType array */ diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index e6b734d82..4320c5ca6 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -4,16 +4,15 @@ * terms of the MIT License, which is available in the project root. 
******************************************************************************/ -import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary, IToken } from 'chevrotain'; +import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain'; import type { AbstractRule, Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js'; import type { Stream } from '../utils/stream.js'; -import { createToken, createTokenInstance, Lexer } from 'chevrotain'; +import { Lexer } from 'chevrotain'; import { isKeyword, isParserRule, isTerminalRule } from '../languages/generated/ast.js'; import { streamAllContents } from '../utils/ast-utils.js'; import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js'; import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js'; import { stream } from '../utils/stream.js'; -import { isTokenTypeArray } from './lexer.js'; export interface TokenBuilderOptions { caseInsensitive?: boolean @@ -119,297 +118,3 @@ export class DefaultTokenBuilder implements TokenBuilder { }, []); } } - -export interface IndentationTokenBuilderOptions { - /** - * The name of the token used to denote indentation in the grammar. - * A possible definition in the grammar could look like this: - * ```langium - * terminal INDENT: ':synthetic-indent:'; - * ``` - * - * @default 'INDENT' - */ - indentTokenName: string; - /** - * The name of the token used to denote deindentation in the grammar. - * A possible definition in the grammar could look like this: - * ```langium - * terminal DEDENT: ':synthetic-dedent:'; - * ``` - * - * @default 'DEDENT' - */ - dedentTokenName: string; - /** - * The name of the token used to denote whitespace other than indentation and newlines in the grammar. 
- * A possible definition in the grammar could look like this: - * ```langium - * hidden terminal WS: /[ \t]+/; - * ``` - * - * @default 'WS' - */ - whitespaceTokenName: string; -} - -export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = { - indentTokenName: 'INDENT', - dedentTokenName: 'DEDENT', - whitespaceTokenName: 'WS', -}; - -/** - * A token builder that is sensitive to indentation in the input text. - * It will generate tokens for indentation and dedentation based on the indentation level. - * - * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js - */ -export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { - /** - * The stack in which all the previous matched indentation levels are stored - * to understand how deep a the next tokens are nested. - */ - protected indentationStack: number[] = [0]; - protected options: IndentationTokenBuilderOptions; - - /** - * The token type to be used for indentation tokens - */ - protected indentTokenType: TokenType; - - /** - * The token type to be used for dedentation tokens - */ - protected dedentTokenType: TokenType; - - /** - * A regular expression to match a series of tabs and/or spaces. - * Override this to customize what the indentation is allowed to consist of. 
- */ - protected whitespaceRegExp = /[ \t]+/y; - - constructor(options: Partial = indentationBuilderDefaultOptions) { - super(); - this.options = { - ...indentationBuilderDefaultOptions, - ...options, - }; - - this.indentTokenType = createToken({ - name: this.options.indentTokenName, - pattern: this.indentMatcher, - line_breaks: false, - }); - - this.dedentTokenType = createToken({ - name: this.options.dedentTokenName, - pattern: this.dedentMatcher, - line_breaks: false, - }); - } - - override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) { - const tokenTypes = super.buildTokens(grammar, options); - if (!isTokenTypeArray(tokenTypes)) { - throw new Error('Invalid tokens built by default builder'); - } - - const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; - - // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well - // Order should be: dedent, indent, spaces - let dedent: TokenType | undefined; - let indent: TokenType | undefined; - let ws: TokenType | undefined; - const otherTokens: TokenType[] = []; - for (const tokenType of tokenTypes) { - if (tokenType.name === dedentTokenName) { - dedent = tokenType; - } else if (tokenType.name === indentTokenName) { - indent = tokenType; - } else if (tokenType.name === whitespaceTokenName) { - ws = tokenType; - } else { - otherTokens.push(tokenType); - } - } - if (!dedent || !indent || !ws) { - throw new Error('Some indentation/whitespace tokens not found!'); - } - return [dedent, indent, ws, ...otherTokens]; - } - - /** - * Helper function to check if the current position is the start of a new line. - * - * @param text The full input string. 
- * @param offset The current position at which to check - * @returns Whether the current position is the start of a new line - */ - protected isStartOfLine(text: string, offset: number): boolean { - return offset === 0 || '\r\n'.includes(text[offset - 1]); - } - - /** - * A helper function used in matching both indents and dedents. - * - * @param text The full input string. - * @param offset The current position at which to attempt a match - * @returns The current and previous indentation levels and the matched whitespace - */ - protected matchWhitespace(text: string, offset: number) { - this.whitespaceRegExp.lastIndex = offset; - const match = this.whitespaceRegExp.exec(text); - return { - currIndentLevel: match?.[0].length ?? 0, - prevIndentLevel: this.indentationStack.at(-1)!, - match, - }; - } - - /** - * Helper function to create an instance of an indentation token. - * - * @param tokenType Indent or dedent token type - * @param text Full input string, used to calculate the line number - * @param image The original image of the token (tabs or spaces) - * @param offset Current position in the input string - * @returns The indentation token instance - */ - protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) { - const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length; - return createTokenInstance( - tokenType, - image, - offset, offset + image.length, - lineNumber, lineNumber, - 0, image.length, - ); - } - - /** - * A custom pattern for matching indents - * - * @param text The full input string. 
- * @param offset The offset at which to attempt a match - * @param tokens Previously scanned Tokens - * @param groups Token Groups - */ - protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => { - const {indentTokenName} = this.options; - - if (!this.isStartOfLine(text, offset)) { - return null; - } - - const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset); - - if (currIndentLevel <= prevIndentLevel) { - // shallower indentation (should be matched by dedent) - // or same indentation level (should be matched by whitespace and ignored) - return null; - } - - this.indentationStack.push(currIndentLevel); - - const indentToken = this.createIndentationTokenInstance( - this.indentTokenType, - text, - match?.[0] ?? indentTokenName, - offset, - ); - tokens.push(indentToken); - - // Token already added, let the indentation now be consumed as whitespace and ignored - return null; - }; - - /** - * A custom pattern for matching dedents - * - * @param text The full input string. - * @param offset The offset at which to attempt a match - * @param tokens Previously scanned Tokens - * @param groups Token Groups - */ - protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => { - const {dedentTokenName} = this.options; - - if (!this.isStartOfLine(text, offset)) { - return null; - } - - const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset); - - if (currIndentLevel >= prevIndentLevel) { - // bigger indentation (should be matched by indent) - // or same indentation level (should be matched by whitespace and ignored) - return null; - } - - const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel); - - // Any dedent must match some previous indentation level. - if (matchIndentIndex === -1) { - console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. 
Current indetation stack: ${this.indentationStack}`); - // throwing an error would crash the language server - // TODO: find a way to report error diagnostics message - return null; - } - - const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1; - - for (let i = 0; i < numberOfDedents; i++) { - const token = this.createIndentationTokenInstance( - this.dedentTokenType, - text, - match?.[0] ?? dedentTokenName, - offset, - ); - tokens.push(token); - this.indentationStack.pop(); - } - - // Token already added, let the dedentation now be consumed as whitespace and ignored - return null; - }; - - protected override buildTerminalToken(terminal: TerminalRule): TokenType { - const tokenType = super.buildTerminalToken(terminal); - const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; - - if (tokenType.name === indentTokenName) { - return this.indentTokenType; - } else if (tokenType.name === dedentTokenName) { - return this.dedentTokenType; - } else if (tokenType.name === whitespaceTokenName) { - return createToken({ - name: whitespaceTokenName, - pattern: this.whitespaceRegExp, - group: Lexer.SKIPPED, - }); - } - - return tokenType; - } - - /** - * Resets the indentation stack between different runs of the lexer - * - * @param text Full text that was tokenized - * @returns Remaining dedent tokens to match all previous indents at the end of the file - */ - popRemainingDedents(text: string): IToken[] { - const remainingDedents: IToken[] = []; - while (this.indentationStack.length > 1) { - remainingDedents.push( - this.createIndentationTokenInstance(this.dedentTokenType, text, this.options.dedentTokenName, text.length) - ); - this.indentationStack.pop(); - } - - this.indentationStack = [0]; - return remainingDedents; - } -} From 4f7ecf3b3d617612a031953fbe1400349822831e Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Wed, 17 Jul 2024 12:39:14 +0300 Subject: [PATCH 10/14] Replace thrown error with warning again It was causing 
issues with the "super" call in the constructor in tests --- packages/langium/src/parser/indentation-aware.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index 9c4bb4dec..f4dc45ad9 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -321,20 +321,24 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { * ``` */ export class IndentationAwareLexer extends DefaultLexer { - protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder; + protected readonly indentationTokenBuilder?: IndentationAwareTokenBuilder; constructor(services: LangiumCoreServices) { super(services); if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) { this.indentationTokenBuilder = services.parser.TokenBuilder; } else { - throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder'); + console.warn('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder'); } } override tokenize(text: string): LexerResult { const result = super.tokenize(text); + if (this.indentationTokenBuilder === undefined) { + return result; + } + // reset the indent stack between processing of different text inputs const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text); result.tokens.push(...remainingDedents); From 4d0724ad30a7f7611181656d9a4ad458e12a6320 Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Wed, 17 Jul 2024 13:02:41 +0300 Subject: [PATCH 11/14] Remove indent-dedent pairs with nothing in between They provide no value, but may throw off the parser --- .../langium/src/parser/indentation-aware.ts | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index 
f4dc45ad9..0be75de99 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -64,7 +64,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { * to understand how deep a the next tokens are nested. */ protected indentationStack: number[] = [0]; - protected options: IndentationTokenBuilderOptions; + readonly options: IndentationTokenBuilderOptions; /** * The token type to be used for indentation tokens @@ -343,6 +343,22 @@ export class IndentationAwareLexer extends DefaultLexer { const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text); result.tokens.push(...remainingDedents); + // remove any "indent-dedent" pair with an empty body as these are typically + // added by comments or lines with just whitespace but have no real value + const { indentTokenName, dedentTokenName } = this.indentationTokenBuilder.options; + const cleanTokens: IToken[] = []; + for (let i = 0; i < result.tokens.length; i++) { + const token = result.tokens[i]; + const nextToken = result.tokens[i + 1]; + if (token.tokenType.name === indentTokenName && nextToken?.tokenType.name === dedentTokenName) { + i++; + continue; + } + + cleanTokens.push(token); + } + result.tokens = cleanTokens; + return result; } } From 6d69af422fe9d511bc5731223ba9c1249ba58742 Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Wed, 17 Jul 2024 13:11:13 +0300 Subject: [PATCH 12/14] Add unit tests for indentation-aware module --- .../test/parser/indentation-aware.test.ts | 249 ++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 packages/langium/test/parser/indentation-aware.test.ts diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts new file mode 100644 index 000000000..8033c7873 --- /dev/null +++ b/packages/langium/test/parser/indentation-aware.test.ts @@ -0,0 +1,249 @@ 
+/****************************************************************************** + * Copyright 2024 TypeFox GmbH + * This program and the accompanying materials are made available under the + * terms of the MIT License, which is available in the project root. + ******************************************************************************/ + +import type { TokenType } from '@chevrotain/types'; +import type { Grammar, LangiumParser, Lexer } from 'langium'; +import { beforeAll, describe, expect, test } from 'vitest'; +import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium'; +import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar'; +import { parseHelper } from 'langium/test'; + +const grammarServices = createLangiumGrammarServices(EmptyFileSystem).grammar; +const helper = parseHelper(grammarServices); + +const tokenBuilder = new IndentationAwareTokenBuilder(); + +async function getTokens(grammarString: string): Promise { + const grammar = (await helper(grammarString)).parseResult.value; + return tokenBuilder.buildTokens(grammar) as TokenType[]; +} + +async function getLexer(grammar: string): Promise { + const services = await createServicesForGrammar({ grammar }); + services.parser.TokenBuilder = tokenBuilder; + services.parser.Lexer = new IndentationAwareLexer(services); + return services.parser.Lexer; +} + +async function getParser(grammar: string): Promise { + const services = await createServicesForGrammar({ + grammar + }); + services.parser.TokenBuilder = tokenBuilder; + services.parser.Lexer = new IndentationAwareLexer(services); + return services.parser.LangiumParser; +} + +beforeAll(() => { + tokenBuilder.popRemainingDedents(''); +}); + +describe('indentationAwareTokenBuilder', () => { + + const sampleGrammar = ` + entry Main: + INDENT name=ID DEDENT; + + terminal ID: /[a-zA-Z_]\\w*/; + hidden terminal NL: /[\\r\\n]+/; + hidden terminal WS: /[\\t ]+/; + terminal INDENT: 'synthetic:indent'; 
+ terminal DEDENT: 'synthetic:dedent'; + `; + + test('Moves indent/dedent token types to the beginning', async () => { + const tokenTypes = await getTokens(sampleGrammar); + + expect(tokenTypes).toHaveLength(5); + + const [dedent, indent, ws] = tokenTypes; + expect(dedent.name).toBe('DEDENT'); + expect(indent.name).toBe('INDENT'); + expect(ws.name).toBe('WS'); + }); + + test('Modifies indent/dedent patterns to be functions', async () => { + const tokenTypes = await getTokens(sampleGrammar); + + expect(tokenTypes).toHaveLength(5); + + const [dedent, indent] = tokenTypes; + expect(dedent.PATTERN).toBeTypeOf('function'); + expect(indent.PATTERN).toBeTypeOf('function'); + }); + + test('Rejects grammar without indent', async () => { + const indentlessGrammar = ` + entry Main: name=ID; + + terminal ID: /[a-zA-Z_]\\w*/; + hidden terminal WS: /\\s+/; + terminal DEDENT: 'synthetic:dedent'; + `; + + await expect(getTokens(indentlessGrammar)).rejects.toThrowError(); + }); + + test('Rejects grammar without dedent', async () => { + const dedentlessGrammar = ` + entry Main: name=ID; + + terminal ID: /[a-zA-Z_]\\w*/; + hidden terminal WS: /\\s+/; + terminal INDENT: 'synthetic:indent'; + `; + + await expect(getTokens(dedentlessGrammar)).rejects.toThrowError(); + }); + + test('Rejects grammar without whitespace', async () => { + const spacelessGrammar = ` + entry Main: name=ID; + + terminal ID: /[a-zA-Z_]\\w*/; + terminal INDENT: 'synthetic:indent'; + terminal DEDENT: 'synthetic:dedent'; + `; + + await expect(getTokens(spacelessGrammar)).rejects.toThrowError(); + }); + +}); + +describe('indentationAwareLexer', () => { + + const sampleGrammar = ` + grammar Test + + entry Block: '{' INDENT names+=ID* DEDENT nested+=Block* '}'; + + terminal ID: /[a-zA-Z_]\\w*/; + hidden terminal NL: /[\\r\\n]+/; + hidden terminal WS: /[\\t ]+/; + terminal INDENT: 'synthetic:indent'; + terminal DEDENT: 'synthetic:dedent'; + hidden terminal ML_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//; + hidden terminal 
SL_COMMENT: /\\/\\/[^\\n\\r]*/; + `; + + test('should emit indent/dedent tokens around a block', async () => { + const lexer = await getLexer(sampleGrammar); + const {tokens, errors} = lexer.tokenize(`{ + name + anotherName +}`); + + expect(errors).toHaveLength(0); + expect(tokens).toHaveLength(6); + + const [/* L_BRAC */, indent, /* id1 */, /* id2 */, dedent, /* _R_BRAC */] = tokens; + expect(indent.tokenType.name).toBe('INDENT'); + expect(dedent.tokenType.name).toBe('DEDENT'); + }); + + test('should ignore indent tokens before comments', async () => { + const lexer = await getLexer(sampleGrammar); + const {tokens, errors} = lexer.tokenize(`// single-line comment + // indented comment when not expecting indentation +{ + name + // comment with different indentation inside block + anotherName +}`); + + expect(errors).toHaveLength(0); + expect(tokens).toHaveLength(6); + }); + + test('should not dedect indentation without a newline', async () => { + const lexer = await getLexer(sampleGrammar); + const {tokens} = lexer.tokenize(`{ name + // indented comment - to be ignored +}`); + expect(tokens).toHaveLength(3); + expect(tokens[1]).not.toBe('INDENT'); + }); + + test('should add remaining dedents to the end', async () => { + const lexer = await getLexer(sampleGrammar); + const {tokens} = lexer.tokenize(`// single-line comment +{ + name`); + expect(tokens).toHaveLength(4); + + const [/* L_BRAC */, indent, /* id */, dedent] = tokens; + expect(indent.tokenType.name).toBe('INDENT'); + expect(dedent.tokenType.name).toBe('DEDENT'); + }); + +}); + +describe('indentationAware parsing', () => { + + const sampleGrammar = ` + grammar PythonIf + + entry Statement: If | Return; + + If: + 'if' condition=BOOLEAN ':' + INDENT thenBody=Statement DEDENT + ('else' ':' INDENT elseBody=Statement DEDENT)?; + + Return: 'return' value=BOOLEAN; + + terminal BOOLEAN: /true|false/; + terminal INDENT: 'synthetic:indent'; + terminal DEDENT: 'synthetic:dedent'; + hidden terminal NL: /[\\r\\n]+/; + 
hidden terminal WS: /[\\t ]+/; + `; + + test('should parse correctly indented code', async () => { + const parser = await getParser(sampleGrammar); + const { parserErrors } = parser.parse(` +if true: + return false +else: + return true`); + + expect(parserErrors).toHaveLength(0); + }); + + test('should error on non-matching dedent', async () => { + const parser = await getParser(sampleGrammar); + const {parserErrors} = parser.parse(` +if true: + return false + else: + return true`); + + expect(parserErrors.length).toBeGreaterThan(0); + }); + + test('should throw an error on unexpected indent', async () => { + const parser = await getParser(sampleGrammar); + const { parserErrors } = parser.parse(` + if true: + return false`); + + expect(parserErrors.length).toBeGreaterThan(0); + }); + + test('should correctly parse nested blocks', async () => { + const parser = await getParser(sampleGrammar); + const {parserErrors} = parser.parse(` +if true: + return true +else: + if false: + return false + `); + + expect(parserErrors).toHaveLength(0); + }); + +}); From 657b7bc25d8d55fc739ca8e9ceba9decaf26e108 Mon Sep 17 00:00:00 2001 From: Mark Sujew Date: Wed, 17 Jul 2024 16:00:08 +0200 Subject: [PATCH 13/14] Formatting and minor optimizations --- .../langium/src/parser/indentation-aware.ts | 37 ++-- packages/langium/src/parser/index.ts | 2 +- .../test/parser/indentation-aware.test.ts | 198 +++++++++++------- 3 files changed, 142 insertions(+), 95 deletions(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index 0be75de99..194acd198 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -69,12 +69,12 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { /** * The token type to be used for indentation tokens */ - protected indentTokenType: TokenType; + readonly indentTokenType: TokenType; /** * The token type to be used for 
dedentation tokens */ - protected dedentTokenType: TokenType; + readonly dedentTokenType: TokenType; /** * A regular expression to match a series of tabs and/or spaces. @@ -108,7 +108,7 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { throw new Error('Invalid tokens built by default builder'); } - const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; + const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options; // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well // Order should be: dedent, indent, spaces @@ -190,13 +190,13 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { * @param groups Token Groups */ protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => { - const {indentTokenName} = this.options; + const { indentTokenName } = this.options; if (!this.isStartOfLine(text, offset)) { return null; } - const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset); + const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset); if (currIndentLevel <= prevIndentLevel) { // shallower indentation (should be matched by dedent) @@ -227,13 +227,13 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { * @param groups Token Groups */ protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => { - const {dedentTokenName} = this.options; + const { dedentTokenName } = this.options; if (!this.isStartOfLine(text, offset)) { return null; } - const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset); + const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset); if (currIndentLevel >= prevIndentLevel) { // bigger indentation (should be matched by indent) @@ -270,7 +270,7 @@ export class IndentationAwareTokenBuilder extends 
DefaultTokenBuilder { protected override buildTerminalToken(terminal: TerminalRule): TokenType { const tokenType = super.buildTerminalToken(terminal); - const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options; + const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options; if (tokenType.name === indentTokenName) { return this.indentTokenType; @@ -321,42 +321,45 @@ export class IndentationAwareTokenBuilder extends DefaultTokenBuilder { * ``` */ export class IndentationAwareLexer extends DefaultLexer { - protected readonly indentationTokenBuilder?: IndentationAwareTokenBuilder; + + protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder; constructor(services: LangiumCoreServices) { super(services); if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) { this.indentationTokenBuilder = services.parser.TokenBuilder; } else { - console.warn('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder'); + throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder'); } } override tokenize(text: string): LexerResult { const result = super.tokenize(text); - if (this.indentationTokenBuilder === undefined) { - return result; - } - // reset the indent stack between processing of different text inputs const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text); result.tokens.push(...remainingDedents); // remove any "indent-dedent" pair with an empty body as these are typically // added by comments or lines with just whitespace but have no real value - const { indentTokenName, dedentTokenName } = this.indentationTokenBuilder.options; + const { indentTokenType, dedentTokenType } = this.indentationTokenBuilder; + // Use tokenTypeIdx for fast comparison + const indentTokenIdx = indentTokenType.tokenTypeIdx; + const dedentTokenIdx = dedentTokenType.tokenTypeIdx; const cleanTokens: IToken[] = []; - for (let i = 0; i < result.tokens.length; 
i++) { + const length = result.tokens.length - 1; + for (let i = 0; i < length; i++) { const token = result.tokens[i]; const nextToken = result.tokens[i + 1]; - if (token.tokenType.name === indentTokenName && nextToken?.tokenType.name === dedentTokenName) { + if (token.tokenTypeIdx === indentTokenIdx && nextToken.tokenTypeIdx === dedentTokenIdx) { i++; continue; } cleanTokens.push(token); } + // Push last token separately + cleanTokens.push(result.tokens[length]); result.tokens = cleanTokens; return result; diff --git a/packages/langium/src/parser/index.ts b/packages/langium/src/parser/index.ts index 17b49b62c..fab284a9c 100644 --- a/packages/langium/src/parser/index.ts +++ b/packages/langium/src/parser/index.ts @@ -7,10 +7,10 @@ export * from './async-parser.js'; export * from './completion-parser-builder.js'; export * from './cst-node-builder.js'; +export * from './indentation-aware.js'; export * from './langium-parser-builder.js'; export * from './langium-parser.js'; export * from './lexer.js'; export * from './parser-config.js'; export * from './token-builder.js'; export * from './value-converter.js'; -export * from './indentation-aware.js'; diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts index 8033c7873..1e376c285 100644 --- a/packages/langium/test/parser/indentation-aware.test.ts +++ b/packages/langium/test/parser/indentation-aware.test.ts @@ -5,10 +5,12 @@ ******************************************************************************/ import type { TokenType } from '@chevrotain/types'; -import type { Grammar, LangiumParser, Lexer } from 'langium'; -import { beforeAll, describe, expect, test } from 'vitest'; +import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium'; +import { beforeEach, describe, expect, test } from 'vitest'; import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium'; import { createLangiumGrammarServices, 
createServicesForGrammar } from 'langium/grammar'; +import type { LangiumServices, PartialLangiumServices } from 'langium/lsp'; +import { expandToString } from 'langium/generate'; import { parseHelper } from 'langium/test'; const grammarServices = createLangiumGrammarServices(EmptyFileSystem).grammar; @@ -22,36 +24,43 @@ async function getTokens(grammarString: string): Promise { } async function getLexer(grammar: string): Promise { - const services = await createServicesForGrammar({ grammar }); - services.parser.TokenBuilder = tokenBuilder; - services.parser.Lexer = new IndentationAwareLexer(services); + const services = await createIndentationAwareServices(grammar); return services.parser.Lexer; } async function getParser(grammar: string): Promise { + const services = await createIndentationAwareServices(grammar); + return services.parser.LangiumParser; +} + +async function createIndentationAwareServices(grammar: string): Promise { const services = await createServicesForGrammar({ - grammar + grammar, + module: { + parser: { + TokenBuilder: () => new IndentationAwareTokenBuilder(), + Lexer: services => new IndentationAwareLexer(services) + } + } satisfies Module }); - services.parser.TokenBuilder = tokenBuilder; - services.parser.Lexer = new IndentationAwareLexer(services); - return services.parser.LangiumParser; + return services; } -beforeAll(() => { +beforeEach(() => { tokenBuilder.popRemainingDedents(''); }); -describe('indentationAwareTokenBuilder', () => { +describe('IndentationAwareTokenBuilder', () => { const sampleGrammar = ` - entry Main: - INDENT name=ID DEDENT; - - terminal ID: /[a-zA-Z_]\\w*/; - hidden terminal NL: /[\\r\\n]+/; - hidden terminal WS: /[\\t ]+/; - terminal INDENT: 'synthetic:indent'; - terminal DEDENT: 'synthetic:dedent'; + entry Main: + INDENT name=ID DEDENT; + + terminal ID: /[a-zA-Z_]\\w*/; + hidden terminal NL: /[\\r\\n]+/; + hidden terminal WS: /[\\t ]+/; + terminal INDENT: 'synthetic:indent'; + terminal DEDENT: 'synthetic:dedent'; 
`; test('Moves indent/dedent token types to the beginning', async () => { @@ -113,28 +122,29 @@ describe('indentationAwareTokenBuilder', () => { }); -describe('indentationAwareLexer', () => { +describe('IndentationAwareLexer', () => { const sampleGrammar = ` - grammar Test + grammar Test - entry Block: '{' INDENT names+=ID* DEDENT nested+=Block* '}'; + entry Block: '{' INDENT names+=ID* DEDENT nested+=Block* '}'; - terminal ID: /[a-zA-Z_]\\w*/; - hidden terminal NL: /[\\r\\n]+/; - hidden terminal WS: /[\\t ]+/; - terminal INDENT: 'synthetic:indent'; - terminal DEDENT: 'synthetic:dedent'; - hidden terminal ML_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//; - hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/; + terminal ID: /[a-zA-Z_]\\w*/; + hidden terminal NL: /[\\r\\n]+/; + hidden terminal WS: /[\\t ]+/; + terminal INDENT: 'synthetic:indent'; + terminal DEDENT: 'synthetic:dedent'; + hidden terminal ML_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//; + hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/; `; test('should emit indent/dedent tokens around a block', async () => { const lexer = await getLexer(sampleGrammar); - const {tokens, errors} = lexer.tokenize(`{ - name - anotherName -}`); + const { tokens, errors } = lexer.tokenize(expandToString` + { + name + anotherName + }`); expect(errors).toHaveLength(0); expect(tokens).toHaveLength(6); @@ -146,13 +156,14 @@ describe('indentationAwareLexer', () => { test('should ignore indent tokens before comments', async () => { const lexer = await getLexer(sampleGrammar); - const {tokens, errors} = lexer.tokenize(`// single-line comment + const { tokens, errors } = lexer.tokenize(expandToString` + // single-line comment // indented comment when not expecting indentation -{ - name - // comment with different indentation inside block - anotherName -}`); + { + name + // comment with different indentation inside block + anotherName + }`); expect(errors).toHaveLength(0); expect(tokens).toHaveLength(6); @@ -160,18 +171,20 @@ describe('indentationAwareLexer', () 
=> { test('should not dedect indentation without a newline', async () => { const lexer = await getLexer(sampleGrammar); - const {tokens} = lexer.tokenize(`{ name - // indented comment - to be ignored -}`); + const { tokens } = lexer.tokenize(expandToString` + { name + // indented comment - to be ignored + }`); expect(tokens).toHaveLength(3); expect(tokens[1]).not.toBe('INDENT'); }); test('should add remaining dedents to the end', async () => { const lexer = await getLexer(sampleGrammar); - const {tokens} = lexer.tokenize(`// single-line comment -{ - name`); + const { tokens } = lexer.tokenize(expandToString` + // single-line comment + { + name`); expect(tokens).toHaveLength(4); const [/* L_BRAC */, indent, /* id */, dedent] = tokens; @@ -181,69 +194,100 @@ describe('indentationAwareLexer', () => { }); -describe('indentationAware parsing', () => { +describe('IndentationAware parsing', () => { const sampleGrammar = ` - grammar PythonIf + grammar PythonIf - entry Statement: If | Return; + entry Statement: If | Return; - If: - 'if' condition=BOOLEAN ':' - INDENT thenBody=Statement DEDENT - ('else' ':' INDENT elseBody=Statement DEDENT)?; + If: + 'if' condition=BOOLEAN ':' + INDENT thenBlock+=Statement+ DEDENT + ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?; - Return: 'return' value=BOOLEAN; + Return: 'return' value=BOOLEAN; - terminal BOOLEAN: /true|false/; - terminal INDENT: 'synthetic:indent'; - terminal DEDENT: 'synthetic:dedent'; - hidden terminal NL: /[\\r\\n]+/; - hidden terminal WS: /[\\t ]+/; + terminal BOOLEAN returns boolean: /true|false/; + terminal INDENT: 'synthetic:indent'; + terminal DEDENT: 'synthetic:dedent'; + hidden terminal NL: /[\\r\\n]+/; + hidden terminal WS: /[\\t ]+/; `; test('should parse correctly indented code', async () => { const parser = await getParser(sampleGrammar); - const { parserErrors } = parser.parse(` -if true: - return false -else: - return true`); + const { parserErrors } = parser.parse(expandToString` + if true: + return 
false + else: + return true + `); expect(parserErrors).toHaveLength(0); }); test('should error on non-matching dedent', async () => { const parser = await getParser(sampleGrammar); - const {parserErrors} = parser.parse(` -if true: - return false - else: - return true`); + const { parserErrors } = parser.parse(expandToString` + if true: + return false + else: + return true + `); expect(parserErrors.length).toBeGreaterThan(0); }); test('should throw an error on unexpected indent', async () => { const parser = await getParser(sampleGrammar); - const { parserErrors } = parser.parse(` - if true: - return false`); + const { parserErrors } = parser.parse(expandToString` + // Parsing starts here + if true: + return false + `); expect(parserErrors.length).toBeGreaterThan(0); }); test('should correctly parse nested blocks', async () => { const parser = await getParser(sampleGrammar); - const {parserErrors} = parser.parse(` -if true: - return true -else: - if false: - return false + const { parserErrors, value } = parser.parse(expandToString` + if true: + return true + else: + if false: + return true + return false + return true `); expect(parserErrors).toHaveLength(0); + const ifValue = value as If; + expect(ifValue.thenBlock).toHaveLength(1); + expect(ifValue.elseBlock).toHaveLength(2); + const elseBlock = ifValue.elseBlock![0] as If; + expect(elseBlock.thenBlock).toHaveLength(2); + const nestedReturn1 = elseBlock.thenBlock[0] as Return; + expect(nestedReturn1.value).toBe(true); + const nestedReturn2 = elseBlock.thenBlock[1] as Return; + expect(nestedReturn2.value).toBe(false); + const return2 = ifValue.elseBlock![1] as Return; + expect(return2.value).toBe(true); }); }); + +type Statement = If | Return; + +interface If extends AstNode { + $type: 'If'; + condition: boolean; + thenBlock: Statement[]; + elseBlock?: Statement[]; +} + +interface Return extends AstNode { + $type: 'Return'; + value: boolean; +} From a609a701d7bc5dd9d9bce04be4ac0a08457ef90d Mon Sep 17 00:00:00 2001 
From: Mark Sujew Date: Wed, 17 Jul 2024 16:42:54 +0200 Subject: [PATCH 14/14] Fix nullability --- packages/langium/test/parser/indentation-aware.test.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts index 1e376c285..269da3c2a 100644 --- a/packages/langium/test/parser/indentation-aware.test.ts +++ b/packages/langium/test/parser/indentation-aware.test.ts @@ -266,13 +266,13 @@ describe('IndentationAware parsing', () => { const ifValue = value as If; expect(ifValue.thenBlock).toHaveLength(1); expect(ifValue.elseBlock).toHaveLength(2); - const elseBlock = ifValue.elseBlock![0] as If; + const elseBlock = ifValue.elseBlock[0] as If; expect(elseBlock.thenBlock).toHaveLength(2); const nestedReturn1 = elseBlock.thenBlock[0] as Return; expect(nestedReturn1.value).toBe(true); const nestedReturn2 = elseBlock.thenBlock[1] as Return; expect(nestedReturn2.value).toBe(false); - const return2 = ifValue.elseBlock![1] as Return; + const return2 = ifValue.elseBlock[1] as Return; expect(return2.value).toBe(true); }); @@ -284,7 +284,7 @@ interface If extends AstNode { $type: 'If'; condition: boolean; thenBlock: Statement[]; - elseBlock?: Statement[]; + elseBlock: Statement[]; } interface Return extends AstNode {