Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@
## 2025-02-18 - Regex Pre-compilation in Hot Paths
**Learning:** Re-compiling regexes inside a frequently called function (like `latex_escape` which runs for every string) creates significant overhead. Pre-compiling them at module level yielded a ~3.2x speedup.
**Action:** Always look for regex compilations inside loops or frequently called functions and move them to module level constants.

## 2024-05-22 - Python In-Membership Checks vs Sets
**Learning:** Do not attempt to optimize string membership checks like `ext in [".yaml", ".yml"]` by converting them to module-level sets. Modern CPython already optimizes these into constant tuple lookups at compile-time (`LOAD_CONST`), making such changes useless micro-optimizations.
**Action:** Focus on algorithmic complexity, DB queries, or actual runtime bottlenecks (like re-compiling regexes inside loops) instead of simple constant membership checks.
11 changes: 7 additions & 4 deletions cli/commands/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@

from ..utils.json_resume_converter import JSONResumeConverter, convert_yaml_to_json_resume

# Optimize file extension checks with O(1) set lookup to prevent repeated list allocations
_YAML_EXTENSIONS = {".yaml", ".yml"}
Comment on lines +16 to +17
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: The optimization comment overstates the impact and may be misleading.

Using a set literal here is fine, but the performance impact in this CLI context is negligible. The current comment implies a significant optimization that doesn’t really occur and could mislead future readers. Please either remove the optimization framing or rephrase it to something neutral like # Supported YAML file extensions to focus on semantics rather than micro-optimization.

Suggested change
# Optimize file extension checks with O(1) set lookup to prevent repeated list allocations
_YAML_EXTENSIONS = {".yaml", ".yml"}
# Supported YAML file extensions
_YAML_EXTENSIONS = {".yaml", ".yml"}



@click.command()
@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
Expand Down Expand Up @@ -67,9 +70,9 @@ def convert(input_file: Path, output_file: Path, direction: str, format: str, no
input_ext = input_file.suffix.lower()
output_ext = output_file.suffix.lower()

if input_ext in [".yaml", ".yml"] and output_ext == ".json":
if input_ext in _YAML_EXTENSIONS and output_ext == ".json":
direction = "to_json"
elif input_ext == ".json" and output_ext in [".yaml", ".yml"]:
elif input_ext == ".json" and output_ext in _YAML_EXTENSIONS:
direction = "to_yaml"
else:
click.echo(
Expand Down Expand Up @@ -269,7 +272,7 @@ def import_resume(input_file: Path, fmt: Optional[str], output: Optional[Path],
ext = input_file.suffix.lower()
if ext == ".json":
fmt = "json"
elif ext in [".yaml", ".yml"]:
elif ext in _YAML_EXTENSIONS:
fmt = "yaml"
else:
click.echo(
Expand Down Expand Up @@ -375,7 +378,7 @@ def export_resume(input_file: Path, fmt: Optional[str], output: Optional[Path]):
ext = input_file.suffix.lower()
if ext == ".json":
fmt = "json"
elif ext in [".yaml", ".yml"]:
elif ext in _YAML_EXTENSIONS:
fmt = "yaml"
else:
click.echo(
Expand Down
25 changes: 16 additions & 9 deletions cli/integrations/job_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple, Union

from bs4 import BeautifulSoup, Tag

# Pre-compile regex patterns for performance in parsing
_INDEED_HEADER_PATTERN = re.compile(r"jobsearch-JobInfoHeader")
_REQUIREMENTS_HEADING_PATTERN = re.compile(r"requirements|qualifications|skills", re.IGNORECASE)
_RESPONSIBILITIES_HEADING_PATTERN = re.compile(r"responsibilities|duties|what you", re.IGNORECASE)

# Optional import for URL fetching
try:
import requests
Expand Down Expand Up @@ -366,7 +371,7 @@ def _parse_indeed(self, html: str) -> JobDetails:
# Extract position
position = self._extract_by_selectors(soup, self.INDEED_SELECTORS["position"])
if not position:
h1 = soup.find("h1", class_=re.compile(r"jobsearch-JobInfoHeader"))
h1 = soup.find("h1", class_=_INDEED_HEADER_PATTERN) # type: ignore[call-overload]
position = h1.get_text(strip=True) if h1 else ""

# Extract location
Expand Down Expand Up @@ -447,10 +452,10 @@ def _parse_generic(self, html: str) -> JobDetails:
salary = self._extract_salary_from_text(html)

# Extract requirements section - look for heading tags first
requirements = []
requirements: List[str] = []
req_heading = soup.find(
["h1", "h2", "h3", "h4", "h5", "h6"],
string=re.compile(r"requirements|qualifications|skills", re.IGNORECASE),
string=_REQUIREMENTS_HEADING_PATTERN, # type: ignore[call-overload]
)
if req_heading:
# Get the next sibling element(s) containing the list
Expand All @@ -462,10 +467,10 @@ def _parse_generic(self, html: str) -> JobDetails:
requirements = self._extract_list_by_keyword(html, "requirements")

# Extract responsibilities section
responsibilities = []
responsibilities: List[str] = []
resp_heading = soup.find(
["h1", "h2", "h3", "h4", "h5", "h6"],
string=re.compile(r"responsibilities|duties|what you", re.IGNORECASE),
string=_RESPONSIBILITIES_HEADING_PATTERN, # type: ignore[call-overload]
)
if resp_heading:
next_elem = resp_heading.find_next_sibling(["ul", "ol", "div", "p"])
Expand Down Expand Up @@ -734,21 +739,23 @@ def _extract_list_items(self, element: Tag) -> List[str]:

return [item for item in items if len(item) > 3][:15]

def _extract_list_by_keyword(self, html: str, keyword: str) -> List[str]:
def _extract_list_by_keyword(self, html: str, keyword: Union[str, re.Pattern]) -> List[str]:
"""
Extract list items near a keyword.

Args:
html: HTML content
keyword: Keyword to search for
keyword: Keyword to search for (string or compiled regex)

Returns:
List of extracted items
"""
soup = BeautifulSoup(html, "lxml")

pattern = keyword if isinstance(keyword, re.Pattern) else re.compile(keyword, re.IGNORECASE)

# Find element containing the keyword
for elem in soup.find_all(string=re.compile(keyword, re.IGNORECASE)):
for elem in soup.find_all(string=pattern): # type: ignore[call-overload]
parent = elem.find_parent(["div", "section", "ul", "li"])
if parent:
# Look for list items in parent or siblings
Expand Down
Loading