Skip to content

Commit

Permalink
feat: switch default tokenizer to o200k_base for better Gemini compatibility
Browse files Browse the repository at this point in the history

- Changed default tokenizer to o200k_base for better compatibility with Gemini models

- Added configurable tokenizer support through tokenCount.encoding in config file

- Updated documentation to reflect new tokenizer configuration options

Fixes #1
  • Loading branch information
eastlondoner committed Feb 5, 2025
1 parent cf5749d commit 64f6d1f
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 2 deletions.
3 changes: 3 additions & 0 deletions cursor-tools.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,8 @@
},
"gemini": {
"model": "gemini-2.0-flash-thinking-exp-01-21"
},
"tokenCount": {
"encoding": "o200k_base"
}
}
5 changes: 4 additions & 1 deletion src/commands/doc.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ export class DocCommand implements Command {
includePatterns: includePatterns.join(','),
ignorePatterns: ignorePatterns.join(','),
},
tokenCount: {
encoding: this.config.tokenCount?.encoding || 'o200k_base',
},
signal: {},
}),
method: 'POST',
Expand Down Expand Up @@ -293,7 +296,7 @@ Focus on:
enableSecurityCheck: true,
},
tokenCount: {
encoding: 'cl100k_base',
encoding: this.config.tokenCount?.encoding || 'o200k_base',
},
cwd: process.cwd(),
});
Expand Down
2 changes: 1 addition & 1 deletion src/commands/repo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ export class RepoCommand implements Command {
enableSecurityCheck: true,
},
tokenCount: {
encoding: 'cl100k_base',
encoding: this.config.tokenCount?.encoding || 'o200k_base',
},
cwd: process.cwd(),
});
Expand Down
6 changes: 6 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ export interface Config {
doc?: {
maxRepoSizeMB?: number; // Maximum repository size in MB for remote processing
};
tokenCount?: {
encoding: 'o200k_base' | 'gpt2' | 'r50k_base' | 'p50k_base' | 'p50k_edit' | 'cl100k_base'; // The tokenizer encoding to use
};
}

export const defaultConfig: Config = {
Expand All @@ -26,6 +29,9 @@ export const defaultConfig: Config = {
doc: {
maxRepoSizeMB: 100, // Default to 100MB
},
tokenCount: {
encoding: 'o200k_base', // Default to o200k_base (the tiktoken encoding used by GPT-4o), chosen for better compatibility with Gemini models
},
};

import { existsSync, readFileSync } from 'node:fs';
Expand Down

0 comments on commit 64f6d1f

Please sign in to comment.