|
3 | 3 | from .core import Encoding
|
4 | 4 | from .registry import get_encoding
|
5 | 5 |
|
6 |
# TODO: these will likely be replaced by an API endpoint
# Maps a model-name *prefix* to its encoding, so new dated releases of a model
# family resolve without a library update (see encoding_for_model).
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # chat
    "gpt-3.5-turbo-": "cl100k_base",  # e.g., gpt-3.5-turbo-0301, -0401, etc.
}
| 11 | + |
7 | 12 | MODEL_TO_ENCODING: dict[str, str] = {
|
| 13 | + # chat |
| 14 | + "gpt-3.5-turbo": "cl100k_base", |
8 | 15 | # text
|
9 | 16 | "text-davinci-003": "p50k_base",
|
10 | 17 | "text-davinci-002": "p50k_base",
|
|
45 | 52 |
|
46 | 53 |
|
def encoding_for_model(model_name: str) -> Encoding:
    """Returns the encoding used by a model.

    Resolution order: an exact match in MODEL_TO_ENCODING wins; otherwise the
    first matching entry in MODEL_PREFIX_TO_ENCODING is used.

    Raises:
        KeyError: if the model name matches neither an exact entry nor a
            known prefix.
    """
    # Exact model name takes precedence over any prefix match.
    if model_name in MODEL_TO_ENCODING:
        return get_encoding(MODEL_TO_ENCODING[model_name])

    # Check if the model matches a known prefix.
    # Prefix matching avoids needing library updates for every model version release.
    # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE).
    for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
        if model_name.startswith(model_prefix):
            return get_encoding(model_encoding_name)

    # BUG FIX: the message previously said `tiktok.get_encoding`; the package
    # (and its public helper) is `tiktoken.get_encoding`.
    raise KeyError(
        f"Could not automatically map {model_name} to a tokeniser. "
        "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
    ) from None
|
0 commit comments