@@ -3,29 +3,11 @@ import { expect } from "chai";
 import { tokenize } from "./text.js";
 import { AutoTokenizer } from "@xenova/transformers";
 import { Repeat } from "immutable";
-import { PreTrainedTokenizer } from "@xenova/transformers";
-
 
 interface TokenizerOutput {
   input_ids: number[];
 }
 
-/**
- * Encodes the text into token IDs and then decodes them back to text
- * Special tokens are skipped during decoding
- *
- * @param tokenizer - An instance of a PreTrainedTokenizer
- * @param text - The text to process
- * @returns The decoded text obtained after encoding and then decoding
- */
-export function encodeDecode(tokenizer: PreTrainedTokenizer, text: string): string {
-  // Encode the text using the tokenizer.
-  const encoding = tokenizer(text, { return_tensor: false }) as TokenizerOutput;
-  // Decode the token IDs back into text while skipping special tokens.
-  return tokenizer.decode(encoding.input_ids, { skip_special_tokens: true });
-}
-
-
 describe("text processing", () => {
   const text = [
     "Hello world, a bc 1 2345, '? 976. Wikipedia is a free content online encyclopedia",
@@ -139,15 +121,18 @@ describe("Multi-Tokenizer Tests", function () {
   describe("Encode-Decode tokenization", function () {
     this.timeout(20000);
 
-    it("should return text close to the original after encode-decode tokenization using GPT2 tokenizer", async function () {
+    it("should return text equal to the original after encode-decode tokenization using GPT2 tokenizer", async function () {
       // Load the GPT2 tokenizer
       const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
       const originalText = "Hello, world! This is a test for encode-decode tokenization.";
 
       // Perform round-trip tokenization
-      const decodedText = encodeDecode(tokenizer, originalText);
+      const encoding = tokenizer(originalText, { return_tensor: false }) as TokenizerOutput;
+
+      // Decode the token IDs back into text while skipping special tokens.
+      const decodedText = tokenizer.decode(encoding.input_ids, { skip_special_tokens: true });
 
-      // Check that the decoded text is almost equal to the original text
+      // Check that the decoded text is equal to the original text
       expect(decodedText).to.equal(originalText);
     });
   });