Commit d491a97

/discojs/src/processing/ final(?) corrections based on review
1 parent: a4d197c


discojs/src/processing/text.spec.ts

Lines changed: 6 additions & 21 deletions
@@ -3,29 +3,11 @@ import { expect } from "chai";
 import { tokenize } from "./text.js";
 import { AutoTokenizer } from "@xenova/transformers";
 import { Repeat } from "immutable";
-import { PreTrainedTokenizer } from "@xenova/transformers";
-
 
 interface TokenizerOutput {
   input_ids: number[];
 }
 
-/**
- * Encodes the text into token IDs and then decodes them back to text
- * Special tokens are skipped during decoding
- *
- * @param tokenizer - An instance of a PreTrainedTokenizer
- * @param text - The text to process
- * @returns The decoded text obtained after encoding and then decoding
- */
-export function encodeDecode(tokenizer: PreTrainedTokenizer, text: string): string {
-  // Encode the text using the tokenizer.
-  const encoding = tokenizer(text, { return_tensor: false }) as TokenizerOutput;
-  // Decode the token IDs back into text while skipping special tokens.
-  return tokenizer.decode(encoding.input_ids, { skip_special_tokens: true });
-}
-
-
 describe("text processing", () => {
   const text = [
     "Hello world, a bc 1 2345, '? 976. Wikipedia is a free content online encyclopedia",
@@ -139,15 +121,18 @@ describe("Multi-Tokenizer Tests", function () {
   describe("Encode-Decode tokenization", function () {
     this.timeout(20000);
 
-    it("should return text close to the original after encode-decode tokenization using GPT2 tokenizer", async function () {
+    it("should return text equal to the original after encode-decode tokenization using GPT2 tokenizer", async function () {
       // Load the GPT2 tokenizer
       const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
       const originalText = "Hello, world! This is a test for encode-decode tokenization.";
 
       // Perform round-trip tokenization
-      const decodedText = encodeDecode(tokenizer, originalText);
+      const encoding = tokenizer(originalText, { return_tensor: false }) as TokenizerOutput;
+
+      // Decode the token IDs back into text while skipping special tokens.
+      const decodedText = tokenizer.decode(encoding.input_ids, { skip_special_tokens: true });
 
-      // Check that the decoded text is almost equal to the original text
+      // Check that the decoded text is equal to the original text
       expect(decodedText).to.equal(originalText);
     });
   });
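
The commit inlines the encode-decode round trip in the test instead of going through the removed encodeDecode helper. For readers who want to try the same round trip outside the test suite, here is a minimal standalone sketch; it assumes @xenova/transformers is installed and the tokenizer files can be fetched, and the main wrapper and console.log check are illustrative, not part of the repository:

import { AutoTokenizer } from "@xenova/transformers";

// Shape of the tokenizer output when return_tensor is false,
// mirroring the TokenizerOutput interface in text.spec.ts.
interface TokenizerOutput {
  input_ids: number[];
}

async function main(): Promise<void> {
  // Load the same GPT2 tokenizer the spec uses.
  const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
  const originalText = "Hello, world! This is a test for encode-decode tokenization.";

  // Encode the text into token IDs (plain arrays rather than tensors).
  const encoding = tokenizer(originalText, { return_tensor: false }) as TokenizerOutput;

  // Decode the token IDs back into text, skipping special tokens.
  const decodedText = tokenizer.decode(encoding.input_ids, { skip_special_tokens: true });

  // GPT2's byte-level BPE round trip reproduces ordinary input text exactly.
  console.log(decodedText === originalText); // true
}

main().catch(console.error);

This lossless round trip is also what justifies the test rename from "close to the original" to "equal to the original": since GPT2's byte-level BPE reconstructs ordinary text exactly once special tokens are skipped, the strict equal assertion is safe.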
