Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 39 additions & 25 deletions cli/generators/ats_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,33 @@

console = Console()

# Pre-compiled regular expressions and constants for performance optimization
_TABLE_PATTERN = re.compile(r"\|[^\n]+\|")
_SPECIAL_CHARS_PATTERN = re.compile(r"[^a-zA-Z0-9\s\-\.\,\@\(\)\#\/]")
_EMAIL_PATTERN = re.compile(r"^[^@]+@[^@]+\.[^@]+$")
_PHONE_PATTERN = re.compile(r"\d")
_QUANTIFIABLE_PATTERN = re.compile(
r"\d+%|\$\d+|\d+\s*(?:users|customers|projects)", flags=re.IGNORECASE
)
_ACRONYM_PATTERN = re.compile(r"\b[A-Z]{2,4}\b")
_JSON_ARRAY_PATTERN = re.compile(r"\[.*\]", flags=re.DOTALL)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: Make JSON array extraction regex non-greedy to avoid overmatching

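With DOTALL, the greedy pattern r"\[.*\]" matches from the first "[" to the last "]" in the entire response. If the model returns multiple bracketed sections, or any surrounding text that contains "[" or "]", json_match.group(0) may be much larger than intended and either break json.loads or parse the wrong data. A non-greedy pattern like r"\[.*?\]" instead stops at the first "]" after the first "[", which is likely what we want for a flat array of keyword strings. Note the trade-off: the non-greedy form will truncate the match if the array is nested or if any string element itself contains "]"; if that can happen, a more robust option is to locate the first "[" and parse from there with json.JSONDecoder().raw_decode, which consumes exactly one JSON value and ignores trailing text.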

Suggested change
_JSON_ARRAY_PATTERN = re.compile(r"\[.*\]", flags=re.DOTALL)
_JSON_ARRAY_PATTERN = re.compile(r"\[.*?\]", flags=re.DOTALL)

_TECH_TERM_PATTERN = re.compile(r"\b[a-z]+(?:\s+[a-z]+)?\b")
_SUMMARY_TERM_PATTERN = re.compile(r"\b[a-z]{2,}\b")

_ACTION_VERBS = [
"developed",
"implemented",
"built",
"created",
"designed",
"managed",
"led",
"increased",
"decreased",
"improved",
"achieved",
]


@dataclass
class ATSCategoryScore:
Expand Down Expand Up @@ -214,8 +241,8 @@ def _check_format_parsing(self, resume_data: Dict[str, Any]) -> ATSCategoryScore

# Check for complex formatting indicators
all_text = self._get_all_text(resume_data)
has_tables = bool(re.search(r"\|[^\n]+\|", all_text))
has_special_chars = len(re.findall(r"[^a-zA-Z0-9\s\-\.\,\@\(\)\#\/]", all_text))
has_tables = bool(_TABLE_PATTERN.search(all_text))
has_special_chars = len(_SPECIAL_CHARS_PATTERN.findall(all_text))

if not has_tables:
details.append("No tables detected (ATS-friendly)")
Expand Down Expand Up @@ -349,15 +376,15 @@ def _check_contact_info(self, resume_data: Dict[str, Any]) -> ATSCategoryScore:

# Check required contact fields
contact_fields = {
"email": (contact.get("email"), 5, r"^[^@]+@[^@]+\.[^@]+$"),
"phone": (contact.get("phone"), 5, r"\d"),
"email": (contact.get("email"), 5, _EMAIL_PATTERN),
"phone": (contact.get("phone"), 5, _PHONE_PATTERN),
"location": (contact.get("location"), 5, None), # Just presence check
}

for field_name, (field_value, field_points, pattern) in contact_fields.items():
if field_value:
if pattern:
if re.search(pattern, field_value):
if pattern.search(field_value):
points += field_points
details.append(f"βœ“ {field_name.capitalize()} present and valid")
else:
Expand Down Expand Up @@ -392,22 +419,10 @@ def _check_readability(self, resume_data: Dict[str, Any]) -> ATSCategoryScore:
suggestions = []

all_text = self._get_all_text(resume_data)
all_text_lower = all_text.lower()

# Check for action verbs in experience bullets
action_verbs = [
"developed",
"implemented",
"built",
"created",
"designed",
"managed",
"led",
"increased",
"decreased",
"improved",
"achieved",
]
action_verb_count = sum(1 for verb in action_verbs if verb in all_text.lower())
action_verb_count = sum(1 for verb in _ACTION_VERBS if verb in all_text_lower)

if action_verb_count >= 3:
details.append(f"βœ“ Uses action verbs ({action_verb_count} found)")
Expand All @@ -416,7 +431,7 @@ def _check_readability(self, resume_data: Dict[str, Any]) -> ATSCategoryScore:
suggestions.append("Use more action verbs (e.g., developed, implemented)")

# Check for quantifiable achievements
has_numbers = bool(re.search(r"\d+%|\$\d+|\d+\s*(users|customers|projects)", all_text))
has_numbers = bool(_QUANTIFIABLE_PATTERN.search(all_text))
if has_numbers:
details.append("βœ“ Includes quantifiable achievements")
else:
Expand All @@ -425,8 +440,7 @@ def _check_readability(self, resume_data: Dict[str, Any]) -> ATSCategoryScore:

# Check for acronyms (should be minimal or defined)
# This is a simple heuristic
acronym_pattern = r"\b[A-Z]{2,4}\b"
acronyms = re.findall(acronym_pattern, all_text)
acronyms = _ACRONYM_PATTERN.findall(all_text)
if len(acronyms) < 10:
details.append(f"βœ“ Minimal acronyms ({len(acronyms)} found)")
else:
Expand Down Expand Up @@ -505,7 +519,7 @@ def _extract_job_keywords(self, job_description: str) -> List[str]:
response = self._call_openai(prompt)

# Parse JSON from response
json_match = re.search(r"\[.*\]", response, re.DOTALL)
json_match = _JSON_ARRAY_PATTERN.search(response)
if json_match:
keywords = json.loads(json_match.group(0))
if isinstance(keywords, list):
Expand Down Expand Up @@ -547,12 +561,12 @@ def _extract_resume_keywords(self, resume_data: Dict[str, Any]) -> List[str]:
text = bullet.get("text", "").lower()
# Extract common tech terms from text
# This is a simple heuristic - AI could do better
keywords.extend(re.findall(r"\b[a-z]+(?:\s+[a-z]+)?\b", text))
keywords.extend(_TECH_TERM_PATTERN.findall(text))

# Extract from summary
summary = resume_data.get("summary", "")
if summary:
keywords.extend(re.findall(r"\b[a-z]{2,}\b", summary.lower()))
keywords.extend(_SUMMARY_TERM_PATTERN.findall(summary.lower()))

return list(set(k.strip() for k in keywords if len(k) > 2))

Expand Down
Loading