
Commit 8b0eeb4: Join tokens when possible

1 parent: 846bc57
11 files changed: +355 −186 lines

.changeset/gentle-cats-divide.md (+5)

@@ -0,0 +1,5 @@
+---
+"@code-hike/lighter": patch
+---
+
+Join tokens when possible
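
The effect of the patch, taken from the snapshot updates further down: whitespace-only tokens are no longer emitted on their own but merged into a neighboring token. A before/after sketch of the token arrays for a line like `const x = 1 ` (colors copied from the github-theme snapshot; the object shape here is illustrative):

// before this commit: every run of whitespace is its own token
const before = [
  { content: "const", color: "#FF7B72" },
  { content: " ", color: "#C9D1D9" },
  { content: "x", color: "#79C0FF" },
  { content: " ", color: "#C9D1D9" },
  { content: "=", color: "#FF7B72" },
  { content: " ", color: "#C9D1D9" },
  { content: "1", color: "#79C0FF" },
  { content: " ", color: "#C9D1D9" },
];

// after: whitespace is appended to the previous token
const after = [
  { content: "const ", color: "#FF7B72" },
  { content: "x ", color: "#79C0FF" },
  { content: "= ", color: "#FF7B72" },
  { content: "1 ", color: "#79C0FF" },
];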

lib/dist/browser.esm.mjs (+38 −7)

@@ -2398,6 +2398,7 @@ async function loadGrammarFromFile(path) {
 // MetadataConsts
 const FONT_STYLE_MASK = 0b00000000000000000111100000000000;
 const FOREGROUND_MASK = 0b00000000111111111000000000000000;
+const STYLE_MASK = 0b00000000111111111111100000000000;
 const FONT_STYLE_OFFSET = 11;
 const FOREGROUND_OFFSET = 15;
 const FontStyle = {
@@ -2420,26 +2421,56 @@ function tokenize(code, grammar, colors) {
         }));
     });
 }
-function tokenizeLine(grammar, stack, line) {
+function tokenizeLine(grammar, stack, line, config) {
     const { tokens, ruleStack } = grammar.tokenizeLine2(line, stack);
     const newTokens = [];
    let tokenEnd = line.length;
     for (let i = tokens.length - 2; i >= 0; i = i - 2) {
         const tokenStart = tokens[i];
         const metadata = tokens[i + 1];
-        newTokens.unshift({
-            content: line.slice(tokenStart, tokenEnd),
-            metadata,
-        });
+        const content = line.slice(tokenStart, tokenEnd);
+        newTokens.unshift({ content, metadata });
         tokenEnd = tokenStart;
     }
-    return { rawTokens: newTokens, nextStack: ruleStack };
+    let rawTokens = [];
+    if (config === null || config === void 0 ? void 0 : config.preserveWhitespace) {
+        rawTokens = newTokens;
+    }
+    else {
+        // join empty space tokens with the previous token (or the next token if there's no previous token)
+        for (let i = 0; i < newTokens.length; i++) {
+            const token = newTokens[i];
+            if (token.content.trim() !== "") {
+                // if has same style as previous token, join with previous token
+                const prev = rawTokens[rawTokens.length - 1];
+                if (prev &&
+                    (prev.metadata & STYLE_MASK) === (token.metadata & STYLE_MASK)) {
+                    prev.content += token.content;
+                }
+                else {
+                    rawTokens.push(token);
+                }
+            }
+            else if (rawTokens.length > 0) {
+                rawTokens[rawTokens.length - 1].content += token.content;
+            }
+            else if (i < newTokens.length - 1) {
+                newTokens[i + 1].content = token.content + newTokens[i + 1].content;
+            }
+            else {
+                rawTokens.push(token);
+            }
+        }
+    }
+    return { rawTokens, nextStack: ruleStack };
 }
 function tokenizeWithScopes(code, grammar, colors) {
     let stack = null;
     const lines = code.split(/\r?\n|\r/g);
     return lines.map((line) => {
-        const { rawTokens, nextStack } = tokenizeLine(grammar, stack, line);
+        const { rawTokens, nextStack } = tokenizeLine(grammar, stack, line, {
+            preserveWhitespace: true,
+        });
         const newTokens = rawTokens.map(({ content, metadata }) => ({
             content,
             style: getStyle(metadata, colors),
lib/dist/index.cjs.js (+38 −7)

(Generated file; diff not rendered by default on GitHub.)

lib/dist/index.esm.mjs (+38 −7)

@@ -2746,6 +2746,7 @@ async function loadGrammarFromFile(path) {
 // MetadataConsts
 const FONT_STYLE_MASK = 0b00000000000000000111100000000000;
 const FOREGROUND_MASK = 0b00000000111111111000000000000000;
+const STYLE_MASK = 0b00000000111111111111100000000000;
 const FONT_STYLE_OFFSET = 11;
 const FOREGROUND_OFFSET = 15;
 const FontStyle = {
@@ -2768,26 +2769,56 @@ function tokenize(code, grammar, colors) {
         }));
     });
 }
-function tokenizeLine(grammar, stack, line) {
+function tokenizeLine(grammar, stack, line, config) {
     const { tokens, ruleStack } = grammar.tokenizeLine2(line, stack);
     const newTokens = [];
     let tokenEnd = line.length;
     for (let i = tokens.length - 2; i >= 0; i = i - 2) {
         const tokenStart = tokens[i];
         const metadata = tokens[i + 1];
-        newTokens.unshift({
-            content: line.slice(tokenStart, tokenEnd),
-            metadata,
-        });
+        const content = line.slice(tokenStart, tokenEnd);
+        newTokens.unshift({ content, metadata });
         tokenEnd = tokenStart;
     }
-    return { rawTokens: newTokens, nextStack: ruleStack };
+    let rawTokens = [];
+    if (config === null || config === void 0 ? void 0 : config.preserveWhitespace) {
+        rawTokens = newTokens;
+    }
+    else {
+        // join empty space tokens with the previous token (or the next token if there's no previous token)
+        for (let i = 0; i < newTokens.length; i++) {
+            const token = newTokens[i];
+            if (token.content.trim() !== "") {
+                // if has same style as previous token, join with previous token
+                const prev = rawTokens[rawTokens.length - 1];
+                if (prev &&
+                    (prev.metadata & STYLE_MASK) === (token.metadata & STYLE_MASK)) {
+                    prev.content += token.content;
+                }
+                else {
+                    rawTokens.push(token);
+                }
+            }
+            else if (rawTokens.length > 0) {
+                rawTokens[rawTokens.length - 1].content += token.content;
+            }
+            else if (i < newTokens.length - 1) {
+                newTokens[i + 1].content = token.content + newTokens[i + 1].content;
+            }
+            else {
+                rawTokens.push(token);
+            }
+        }
+    }
+    return { rawTokens, nextStack: ruleStack };
 }
 function tokenizeWithScopes(code, grammar, colors) {
     let stack = null;
     const lines = code.split(/\r?\n|\r/g);
     return lines.map((line) => {
-        const { rawTokens, nextStack } = tokenizeLine(grammar, stack, line);
+        const { rawTokens, nextStack } = tokenizeLine(grammar, stack, line, {
+            preserveWhitespace: true,
+        });
         const newTokens = rawTokens.map(({ content, metadata }) => ({
             content,
             style: getStyle(metadata, colors),

lib/src/tokenizer.ts (+43 −7)

@@ -7,6 +7,7 @@ import { Line, Token } from "./annotations";
 const FONT_STYLE_MASK = 0b00000000000000000111100000000000;
 const FOREGROUND_MASK = 0b00000000111111111000000000000000;
 const BACKGROUND_MASK = 0b11111111000000000000000000000000;
+const STYLE_MASK = 0b00000000111111111111100000000000;
 const FONT_STYLE_OFFSET = 11;
 const FOREGROUND_OFFSET = 15;
 const BACKGROUND_OFFSET = 24;
@@ -35,20 +36,53 @@ export function tokenize(code: string, grammar: IGrammar, colors: string[]) {
 
 type RawToken = { content: string; metadata: number };
 
-function tokenizeLine(grammar: IGrammar, stack: StackElement, line: string) {
+function tokenizeLine(
+  grammar: IGrammar,
+  stack: StackElement,
+  line: string,
+  config?: { preserveWhitespace?: boolean }
+) {
   const { tokens, ruleStack } = grammar.tokenizeLine2(line, stack);
   const newTokens: RawToken[] = [];
   let tokenEnd = line.length;
   for (let i = tokens.length - 2; i >= 0; i = i - 2) {
     const tokenStart = tokens[i];
     const metadata = tokens[i + 1];
-    newTokens.unshift({
-      content: line.slice(tokenStart, tokenEnd),
-      metadata,
-    });
+    const content = line.slice(tokenStart, tokenEnd);
+    newTokens.unshift({ content, metadata });
     tokenEnd = tokenStart;
   }
-  return { rawTokens: newTokens, nextStack: ruleStack };
+
+  let rawTokens: RawToken[] = [];
+
+  if (config?.preserveWhitespace) {
+    rawTokens = newTokens;
+  } else {
+    // join empty space tokens with the previous token (or the next token if there's no previous token)
+    for (let i = 0; i < newTokens.length; i++) {
+      const token = newTokens[i];
+      if (token.content.trim() !== "") {
+        // if has same style as previous token, join with previous token
+        const prev = rawTokens[rawTokens.length - 1];
+        if (
+          prev &&
+          (prev.metadata & STYLE_MASK) === (token.metadata & STYLE_MASK)
+        ) {
+          prev.content += token.content;
+        } else {
+          rawTokens.push(token);
+        }
+      } else if (rawTokens.length > 0) {
+        rawTokens[rawTokens.length - 1].content += token.content;
+      } else if (i < newTokens.length - 1) {
+        newTokens[i + 1].content = token.content + newTokens[i + 1].content;
+      } else {
+        rawTokens.push(token);
+      }
+    }
+  }
+
+  return { rawTokens, nextStack: ruleStack };
 }
 
 export function tokenizeWithScopes(
@@ -60,7 +94,9 @@ export function tokenizeWithScopes(
   const lines = code.split(/\r?\n|\r/g);
 
   return lines.map((line) => {
-    const { rawTokens, nextStack } = tokenizeLine(grammar, stack, line);
+    const { rawTokens, nextStack } = tokenizeLine(grammar, stack, line, {
+      preserveWhitespace: true,
+    });
     const newTokens = rawTokens.map(({ content, metadata }) => ({
       content,
       style: getStyle(metadata, colors),
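
For reference, here is the joining pass in isolation: a minimal runnable sketch of the same logic (the helper name joinTokens and the sample metadata values are illustrative, not part of the library):

type RawToken = { content: string; metadata: number };
const STYLE_MASK = 0b00000000111111111111100000000000;

function joinTokens(tokens: RawToken[]): RawToken[] {
  const joined: RawToken[] = [];
  for (let i = 0; i < tokens.length; i++) {
    const token = tokens[i];
    if (token.content.trim() !== "") {
      const prev = joined[joined.length - 1];
      if (prev && (prev.metadata & STYLE_MASK) === (token.metadata & STYLE_MASK)) {
        prev.content += token.content; // same style: extend the previous token
      } else {
        joined.push(token);
      }
    } else if (joined.length > 0) {
      joined[joined.length - 1].content += token.content; // whitespace: append to previous
    } else if (i < tokens.length - 1) {
      tokens[i + 1].content = token.content + tokens[i + 1].content; // leading whitespace: prepend to next
    } else {
      joined.push(token); // whitespace-only line
    }
  }
  return joined;
}

// Hypothetical metadata values; only the STYLE_MASK bits matter for joining.
const keyword = 1 << 15; // foreground color index 1
const ident = 2 << 15; // foreground color index 2
console.log(
  joinTokens([
    { content: "const", metadata: keyword },
    { content: " ", metadata: ident },
    { content: "x", metadata: ident },
  ])
);
// → [{ content: "const ", metadata: keyword }, { content: "x", metadata: ident }]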

lib/test/__snapshots__/browser.test.ts.snap (+6 −42)

@@ -58,17 +58,11 @@ exports[`extract annottations 2`] = `
   "lineNumber": 1,
   "tokens": [
     {
-      "content": "const",
+      "content": "const ",
       "style": {
         "color": "#569CD6",
       },
     },
-    {
-      "content": " ",
-      "style": {
-        "color": "#D4D4D4",
-      },
-    },
     {
       "content": "x",
       "style": {
@@ -104,17 +98,11 @@ exports[`extract annottations 2`] = `
   "lineNumber": 2,
   "tokens": [
     {
-      "content": "const",
+      "content": "const ",
       "style": {
         "color": "#569CD6",
       },
     },
-    {
-      "content": " ",
-      "style": {
-        "color": "#D4D4D4",
-      },
-    },
     {
       "content": "y",
       "style": {
@@ -278,53 +266,29 @@ exports[`highlight html with theme 1`] = `
     },
   },
   {
-    "content": "const",
+    "content": "const ",
     "style": {
       "color": "#FF7B72",
     },
   },
   {
-    "content": " ",
-    "style": {
-      "color": "#C9D1D9",
-    },
-  },
-  {
-    "content": "x",
+    "content": "x ",
     "style": {
       "color": "#79C0FF",
     },
   },
   {
-    "content": " ",
-    "style": {
-      "color": "#C9D1D9",
-    },
-  },
-  {
-    "content": "=",
+    "content": "= ",
     "style": {
       "color": "#FF7B72",
     },
   },
   {
-    "content": " ",
-    "style": {
-      "color": "#C9D1D9",
-    },
-  },
-  {
-    "content": "1",
+    "content": "1 ",
     "style": {
       "color": "#79C0FF",
     },
   },
-  {
-    "content": " ",
-    "style": {
-      "color": "#C9D1D9",
-    },
-  },
   {
     "content": "</",
     "style": {
