Skip to content

Commit 000501c

Browse files
aabounegmmsujew
andauthored
Add indentation-aware TokenBuilder and Lexer (#1578)
Co-authored-by: Mark Sujew <[email protected]>
1 parent 91219ec commit 000501c

File tree

3 files changed

+661
-0
lines changed

3 files changed

+661
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,367 @@
/******************************************************************************
 * Copyright 2024 TypeFox GmbH
 * This program and the accompanying materials are made available under the
 * terms of the MIT License, which is available in the project root.
 ******************************************************************************/
6+
7+
import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
8+
import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
9+
import type { TokenBuilderOptions } from './token-builder.js';
10+
import type { LexerResult } from './lexer.js';
11+
import type { LangiumCoreServices } from '../services.js';
12+
import { createToken, createTokenInstance, Lexer } from 'chevrotain';
13+
import { DefaultTokenBuilder } from './token-builder.js';
14+
import { DefaultLexer, isTokenTypeArray } from './lexer.js';
15+
16+
/**
 * Names of the grammar terminals that the indentation-aware token builder
 * treats specially. Each name must match a terminal rule declared in the grammar.
 */
export interface IndentationTokenBuilderOptions {
    /**
     * Name of the terminal that marks an increase of the indentation level.
     * A possible definition in the grammar could look like this:
     * ```langium
     * terminal INDENT: ':synthetic-indent:';
     * ```
     *
     * @default 'INDENT'
     */
    indentTokenName: string;
    /**
     * Name of the terminal that marks a decrease of the indentation level.
     * A possible definition in the grammar could look like this:
     * ```langium
     * terminal DEDENT: ':synthetic-dedent:';
     * ```
     *
     * @default 'DEDENT'
     */
    dedentTokenName: string;
    /**
     * Name of the terminal that matches whitespace other than indentation and newlines.
     * A possible definition in the grammar could look like this:
     * ```langium
     * hidden terminal WS: /[ \t]+/;
     * ```
     *
     * @default 'WS'
     */
    whitespaceTokenName: string;
}
48+
49+
/**
 * Default terminal names used when the token builder is constructed without
 * (or with only partial) options.
 */
export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
    indentTokenName: 'INDENT',
    dedentTokenName: 'DEDENT',
    whitespaceTokenName: 'WS',
};
54+
55+
/**
 * A token builder that is sensitive to indentation in the input text.
 * It will generate tokens for indentation and dedentation based on the indentation level.
 *
 * The builder is stateful: the indentation stack is mutated while a text is being
 * tokenized and is only reset by {@link popRemainingDedents}.
 *
 * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
 */
export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
    /**
     * The stack in which all the previously matched indentation levels are stored
     * to understand how deeply the next tokens are nested.
     * The bottom element is always `0` (no indentation).
     */
    protected indentationStack: number[] = [0];

    /**
     * The effective options: the defaults overlaid with whatever was passed to the constructor.
     */
    readonly options: IndentationTokenBuilderOptions;

    /**
     * The token type to be used for indentation tokens
     */
    readonly indentTokenType: TokenType;

    /**
     * The token type to be used for dedentation tokens
     */
    readonly dedentTokenType: TokenType;

    /**
     * A regular expression to match a series of tabs and/or spaces.
     * Override this to customize what the indentation is allowed to consist of.
     *
     * The expression is sticky (`y` flag) so that {@link matchWhitespace} can anchor
     * it at an arbitrary offset through `lastIndex`.
     */
    protected whitespaceRegExp = /[ \t]+/y;

    /**
     * @param options Terminal names to use; any name that is omitted falls back to
     *                the value in `indentationBuilderDefaultOptions`.
     */
    constructor(options: Partial<IndentationTokenBuilderOptions> = indentationBuilderDefaultOptions) {
        super();
        this.options = {
            ...indentationBuilderDefaultOptions,
            ...options,
        };

        this.indentTokenType = createToken({
            name: this.options.indentTokenName,
            pattern: this.indentMatcher,
            line_breaks: false,
        });

        this.dedentTokenType = createToken({
            name: this.options.dedentTokenName,
            pattern: this.dedentMatcher,
            line_breaks: false,
        });
    }

    /**
     * Builds the token types through the default builder and reorders them so that
     * the dedent, indent and whitespace tokens (in that order) come first.
     *
     * @param grammar The grammar to build the tokens for
     * @param options Options forwarded to the default builder
     * @returns The reordered token types
     * @throws Error if the default builder does not return a token type array, or if
     *         any of the dedent, indent or whitespace tokens is missing from it
     */
    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) {
        const tokenTypes = super.buildTokens(grammar, options);
        if (!isTokenTypeArray(tokenTypes)) {
            throw new Error('Invalid tokens built by default builder');
        }

        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;

        // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
        // Order should be: dedent, indent, spaces
        let dedent: TokenType | undefined;
        let indent: TokenType | undefined;
        let ws: TokenType | undefined;
        const otherTokens: TokenType[] = [];
        for (const tokenType of tokenTypes) {
            if (tokenType.name === dedentTokenName) {
                dedent = tokenType;
            } else if (tokenType.name === indentTokenName) {
                indent = tokenType;
            } else if (tokenType.name === whitespaceTokenName) {
                ws = tokenType;
            } else {
                otherTokens.push(tokenType);
            }
        }
        if (!dedent || !indent || !ws) {
            throw new Error('Some indentation/whitespace tokens not found!');
        }
        return [dedent, indent, ws, ...otherTokens];
    }

    /**
     * Helper function to check if the current position is the start of a new line,
     * i.e. it is the very beginning of the text or directly follows a `\r` or `\n`.
     *
     * @param text The full input string.
     * @param offset The current position at which to check
     * @returns Whether the current position is the start of a new line
     */
    protected isStartOfLine(text: string, offset: number): boolean {
        return offset === 0 || '\r\n'.includes(text[offset - 1]);
    }

    /**
     * A helper function used in matching both indents and dedents.
     *
     * @param text The full input string.
     * @param offset The current position at which to attempt a match
     * @returns The current indentation level (length of the matched whitespace, `0` if
     *          none), the previous indentation level (top of the stack) and the raw match
     */
    protected matchWhitespace(text: string, offset: number) {
        // The regular expression is sticky, so it only matches exactly at `lastIndex`.
        this.whitespaceRegExp.lastIndex = offset;
        const match = this.whitespaceRegExp.exec(text);
        return {
            currIndentLevel: match?.[0].length ?? 0,
            prevIndentLevel: this.indentationStack.at(-1)!,
            match,
        };
    }

    /**
     * Helper function to create an instance of an indentation token.
     *
     * @param tokenType Indent or dedent token type
     * @param text Full input string, used to calculate the line number
     * @param image The original image of the token (tabs or spaces)
     * @param offset Current position in the input string
     * @returns The indentation token instance
     */
    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
        // 1-based line number: one more than the number of line breaks before `offset`.
        const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
        return createTokenInstance(
            tokenType,
            image,
            offset, offset + image.length,
            lineNumber, lineNumber,
            // Columns are 1-based and the end column is inclusive, so a token at the
            // start of a line spans columns 1..image.length.
            1, image.length,
        );
    }

    /**
     * A custom pattern for matching indents.
     *
     * It never reports a match itself. When the indentation at the start of a line is
     * deeper than the current level, it pushes the new level onto the stack and appends
     * an indent token to `tokens` as a side effect.
     *
     * @param text The full input string.
     * @param offset The offset at which to attempt a match
     * @param tokens Previously scanned Tokens
     * @param groups Token Groups
     */
    protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
        const { indentTokenName } = this.options;

        if (!this.isStartOfLine(text, offset)) {
            return null;
        }

        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);

        if (currIndentLevel <= prevIndentLevel) {
            // shallower indentation (should be matched by dedent)
            // or same indentation level (should be matched by whitespace and ignored)
            return null;
        }

        this.indentationStack.push(currIndentLevel);

        const indentToken = this.createIndentationTokenInstance(
            this.indentTokenType,
            text,
            match?.[0] ?? indentTokenName,
            offset,
        );
        tokens.push(indentToken);

        // Token already added, let the indentation now be consumed as whitespace and ignored
        return null;
    };

    /**
     * A custom pattern for matching dedents.
     *
     * It never reports a match itself. When the indentation at the start of a line is
     * shallower than the current level and equals a level on the stack, it pops the stack
     * down to that level and appends one dedent token per popped level to `tokens`.
     * An indentation that matches no level on the stack is logged and otherwise ignored.
     *
     * @param text The full input string.
     * @param offset The offset at which to attempt a match
     * @param tokens Previously scanned Tokens
     * @param groups Token Groups
     */
    protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
        const { dedentTokenName } = this.options;

        if (!this.isStartOfLine(text, offset)) {
            return null;
        }

        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);

        if (currIndentLevel >= prevIndentLevel) {
            // bigger indentation (should be matched by indent)
            // or same indentation level (should be matched by whitespace and ignored)
            return null;
        }

        const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel);

        // Any dedent must match some previous indentation level.
        if (matchIndentIndex === -1) {
            console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`);
            // throwing an error would crash the language server
            // TODO: find a way to report error diagnostics message
            return null;
        }

        const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1;

        for (let i = 0; i < numberOfDedents; i++) {
            const token = this.createIndentationTokenInstance(
                this.dedentTokenType,
                text,
                // No whitespace is matched when dedenting to column 0; the token name is used as image then.
                match?.[0] ?? dedentTokenName,
                offset,
            );
            tokens.push(token);
            this.indentationStack.pop();
        }

        // Token already added, let the dedentation now be consumed as whitespace and ignored
        return null;
    };

    /**
     * Substitutes the token types of the indent, dedent and whitespace terminals:
     * indent and dedent use the custom matcher token types created in the constructor,
     * whitespace becomes a skipped token based on {@link whitespaceRegExp}.
     * All other terminals are built by the default builder.
     *
     * @param terminal The terminal rule to build a token type for
     * @returns The token type for the terminal
     */
    protected override buildTerminalToken(terminal: TerminalRule): TokenType {
        const tokenType = super.buildTerminalToken(terminal);
        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;

        if (tokenType.name === indentTokenName) {
            return this.indentTokenType;
        } else if (tokenType.name === dedentTokenName) {
            return this.dedentTokenType;
        } else if (tokenType.name === whitespaceTokenName) {
            return createToken({
                name: whitespaceTokenName,
                pattern: this.whitespaceRegExp,
                group: Lexer.SKIPPED,
            });
        }

        return tokenType;
    }

    /**
     * Resets the indentation stack between different runs of the lexer
     *
     * @param text Full text that was tokenized
     * @returns Remaining dedent tokens to match all previous indents at the end of the file
     */
    popRemainingDedents(text: string): IToken[] {
        const remainingDedents: IToken[] = [];
        // One dedent per indentation level that is still open; all are positioned at the end of the text.
        while (this.indentationStack.length > 1) {
            remainingDedents.push(
                this.createIndentationTokenInstance(this.dedentTokenType, text, this.options.dedentTokenName, text.length)
            );
            this.indentationStack.pop();
        }

        this.indentationStack = [0];
        return remainingDedents;
    }
}
309+
310+
/**
 * A lexer that is aware of indentation in the input text.
 * Its purpose is to reset the internal state of the {@link IndentationAwareTokenBuilder}
 * between the tokenization of different text inputs, to append the dedent tokens that are
 * still pending at the end of the text, and to drop indent/dedent pairs with an empty body.
 *
 * In your module, you can override the default lexer with this one as such:
 * ```ts
 * parser: {
 *    TokenBuilder: () => new IndentationAwareTokenBuilder(),
 *    Lexer: (services) => new IndentationAwareLexer(services),
 * }
 * ```
 */
export class IndentationAwareLexer extends DefaultLexer {

    /**
     * The token builder whose indentation stack is flushed after every tokenization.
     */
    protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder;

    /**
     * @param services The language services; `services.parser.TokenBuilder` must be an
     *                 {@link IndentationAwareTokenBuilder}
     * @throws Error if the registered token builder is not an {@link IndentationAwareTokenBuilder}
     */
    constructor(services: LangiumCoreServices) {
        super(services);
        if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) {
            this.indentationTokenBuilder = services.parser.TokenBuilder;
        } else {
            throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
        }
    }

    /**
     * Tokenizes the text with the default lexer, appends the dedent tokens that are still
     * pending at the end of the text and removes every indent token that is immediately
     * followed by a dedent token, together with that dedent token.
     *
     * @param text The text to tokenize
     * @returns The lexer result with the cleaned-up token list
     */
    override tokenize(text: string): LexerResult {
        const result = super.tokenize(text);

        // reset the indent stack between processing of different text inputs
        const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
        result.tokens.push(...remainingDedents);

        // remove any "indent-dedent" pair with an empty body as these are typically
        // added by comments or lines with just whitespace but have no real value
        const { indentTokenType, dedentTokenType } = this.indentationTokenBuilder;
        // Use tokenTypeIdx for fast comparison
        const indentTokenIdx = indentTokenType.tokenTypeIdx;
        const dedentTokenIdx = dedentTokenType.tokenTypeIdx;
        const tokens = result.tokens;
        const cleanTokens: IToken[] = [];
        // Iterate over the full length (not length - 1) so that an empty token list stays
        // empty and a trailing indent/dedent pair is removed as a whole instead of leaving
        // the dedent behind.
        for (let i = 0; i < tokens.length; i++) {
            const token = tokens[i];
            const nextToken: IToken | undefined = tokens[i + 1];
            if (
                nextToken !== undefined
                && token.tokenTypeIdx === indentTokenIdx
                && nextToken.tokenTypeIdx === dedentTokenIdx
            ) {
                // skip the dedent token as well
                i++;
                continue;
            }

            cleanTokens.push(token);
        }
        result.tokens = cleanTokens;

        return result;
    }
}

packages/langium/src/parser/index.ts

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
export * from './async-parser.js';
88
export * from './completion-parser-builder.js';
99
export * from './cst-node-builder.js';
10+
export * from './indentation-aware.js';
1011
export * from './langium-parser-builder.js';
1112
export * from './langium-parser.js';
1213
export * from './lexer.js';

0 commit comments

Comments
 (0)