Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit 1055216

Browse files
authored
Switch from the `re` package to `regex`, which is slightly more performant (#1127)
`regex` is a drop-in replacement for `re` and provides better performance. Signed-off-by: Juan Antonio Osorio <[email protected]>
1 parent d686510 commit 1055216

File tree

13 files changed

+41
-23
lines changed

13 files changed

+41
-23
lines changed

poetry.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ onnxruntime = "==1.20.1"
4040
onnx = "==1.17.0"
4141
spacy = "<3.8.0"
4242
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"}
43+
regex = "==2024.11.6"
4344

4445
[tool.poetry.group.dev.dependencies]
4546
pytest = "==8.3.4"

src/codegate/api/v1_processing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import asyncio
22
import json
3-
import re
43
from collections import defaultdict
54
from typing import AsyncGenerator, Dict, List, Optional, Tuple
65

76
import cachetools.func
7+
import regex as re
88
import requests
99
import structlog
1010

src/codegate/clients/detector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
import re
21
from abc import ABC, abstractmethod
32
from functools import wraps
43
from typing import List, Optional
54

5+
import regex as re
66
import structlog
77
from fastapi import Request
88

src/codegate/db/fim_cache.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import datetime
22
import hashlib
33
import json
4-
import re
54
from typing import Dict, List, Optional
65

6+
import regex as re
77
import structlog
88
from pydantic import BaseModel
99

@@ -21,6 +21,11 @@ class CachedFim(BaseModel):
2121
initial_id: str
2222

2323

24+
# Regular expression to match file paths in FIM messages.
25+
# Compiled regex to improve performance.
26+
filepath_matcher = re.compile(r"^(#|//|<!--|--|%|;).*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b", re.MULTILINE)
27+
28+
2429
class FimCache:
2530

2631
def __init__(self):
@@ -55,8 +60,8 @@ def _match_filepath(self, message: str, provider: str) -> Optional[str]:
5560
# folder/testing_file.py
5661
# Path: file3.py
5762
# // Path: file3.js <-- Javascript
58-
pattern = r"^(#|//|<!--|--|%|;).*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b"
59-
matches = re.findall(pattern, message, re.MULTILINE)
63+
matches = filepath_matcher.findall(message)
64+
6065
# If no path is found, hash the entire prompt message.
6166
if not matches:
6267
return None

src/codegate/extract_snippets/message_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
import re
21
from abc import ABC, abstractmethod
32
from pathlib import Path
43
from typing import Dict, List, Optional, Self
54

5+
import regex as re
66
import structlog
77
from pydantic import BaseModel, field_validator, model_validator
88
from pygments.lexers import guess_lexer

src/codegate/pipeline/cli/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import re
21
import shlex
32
from typing import Optional
43

4+
import regex as re
55
from litellm import ChatCompletionRequest
66

77
from codegate.clients.clients import ClientType

src/codegate/pipeline/codegate_context_retriever/codegate.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import json
2-
import re
32

3+
import regex as re
44
import structlog
55
from litellm import ChatCompletionRequest
66

@@ -19,6 +19,12 @@
1919
logger = structlog.get_logger("codegate")
2020

2121

22+
# Pre-compiled regex patterns for performance
23+
markdown_code_block = re.compile(r"```.*?```", flags=re.DOTALL)
24+
markdown_file_listing = re.compile(r"⋮...*?⋮...\n\n", flags=re.DOTALL)
25+
environment_details = re.compile(r"<environment_details>.*?</environment_details>", flags=re.DOTALL)
26+
27+
2228
class CodegateContextRetriever(PipelineStep):
2329
"""
2430
Pipeline step that adds a context message to the completion request when it detects
@@ -95,11 +101,9 @@ async def process( # noqa: C901
95101

96102
# Remove code snippets and file listing from the user messages and search for bad packages
97103
# in the rest of the user query/messages
98-
user_messages = re.sub(r"```.*?```", "", user_message, flags=re.DOTALL)
99-
user_messages = re.sub(r"⋮...*?⋮...\n\n", "", user_messages, flags=re.DOTALL)
100-
user_messages = re.sub(
101-
r"<environment_details>.*?</environment_details>", "", user_messages, flags=re.DOTALL
102-
)
104+
user_messages = markdown_code_block.sub("", user_message)
105+
user_messages = markdown_file_listing.sub("", user_messages)
106+
user_messages = environment_details.sub("", user_messages)
103107

104108
# split messages on <task> tags and newlines, to avoid passing too much content to the search
105109
split_messages = re.split(r"</?task>|\n|\\n", user_messages)

src/codegate/pipeline/pii/pii.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import re
21
from typing import Any, Dict, List, Optional
32

3+
import regex as re
44
import structlog
55
from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
66
from litellm.types.utils import Delta, StreamingChoices

src/codegate/pipeline/secrets/secrets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import re
21
from abc import abstractmethod
32
from typing import List, Optional, Tuple
43

4+
import regex as re
55
import structlog
66
from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
77
from litellm.types.utils import Delta, StreamingChoices

0 commit comments

Comments
 (0)