diff --git a/.jules/bolt.md b/.jules/bolt.md index 254b8d5..b22d37b 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -13,3 +13,7 @@ ## 2025-02-18 - Regex Pre-compilation in Hot Paths **Learning:** Re-compiling regexes inside a frequently called function (like `latex_escape` which runs for every string) creates significant overhead. Pre-compiling them at module level yielded a ~3.2x speedup. **Action:** Always look for regex compilations inside loops or frequently called functions and move them to module level constants. + +## 2024-05-22 - Python In-Membership Checks vs Sets +**Learning:** Do not attempt to optimize string membership checks like `ext in [".yaml", ".yml"]` by converting them to module-level sets. Modern CPython already optimizes these into constant tuple lookups at compile-time (`LOAD_CONST`), making such changes useless micro-optimizations. +**Action:** Focus on algorithmic complexity, DB queries, or actual runtime bottlenecks (like re-compiling regexes inside loops) instead of simple constant membership checks. 
diff --git a/cli/commands/convert.py b/cli/commands/convert.py index 8b1e4bb..d8002f9 100644 --- a/cli/commands/convert.py +++ b/cli/commands/convert.py @@ -13,6 +13,9 @@ from ..utils.json_resume_converter import JSONResumeConverter, convert_yaml_to_json_resume +# Shared constant for YAML extension checks (single definition for consistency; CPython already folds small literal membership lists into constant tuples, so this is not a performance fix) +_YAML_EXTENSIONS = {".yaml", ".yml"} + @click.command() @click.argument("input_file", type=click.Path(exists=True, path_type=Path)) @@ -67,9 +70,9 @@ def convert(input_file: Path, output_file: Path, direction: str, format: str, no input_ext = input_file.suffix.lower() output_ext = output_file.suffix.lower() - if input_ext in [".yaml", ".yml"] and output_ext == ".json": + if input_ext in _YAML_EXTENSIONS and output_ext == ".json": direction = "to_json" - elif input_ext == ".json" and output_ext in [".yaml", ".yml"]: + elif input_ext == ".json" and output_ext in _YAML_EXTENSIONS: direction = "to_yaml" else: click.echo( @@ -269,7 +272,7 @@ def import_resume(input_file: Path, fmt: Optional[str], output: Optional[Path], ext = input_file.suffix.lower() if ext == ".json": fmt = "json" - elif ext in [".yaml", ".yml"]: + elif ext in _YAML_EXTENSIONS: fmt = "yaml" else: click.echo( @@ -375,7 +378,7 @@ def export_resume(input_file: Path, fmt: Optional[str], output: Optional[Path]): ext = input_file.suffix.lower() if ext == ".json": fmt = "json" - elif ext in [".yaml", ".yml"]: + elif ext in _YAML_EXTENSIONS: fmt = "yaml" else: click.echo( diff --git a/cli/integrations/job_parser.py b/cli/integrations/job_parser.py index 2c3cdf6..657fc3f 100644 --- a/cli/integrations/job_parser.py +++ b/cli/integrations/job_parser.py @@ -19,10 +19,15 @@ import re from dataclasses import asdict, dataclass, field from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from bs4 import BeautifulSoup, Tag +# Pre-compile regex patterns for performance in parsing 
+_INDEED_HEADER_PATTERN = re.compile(r"jobsearch-JobInfoHeader") +_REQUIREMENTS_HEADING_PATTERN = re.compile(r"requirements|qualifications|skills", re.IGNORECASE) +_RESPONSIBILITIES_HEADING_PATTERN = re.compile(r"responsibilities|duties|what you", re.IGNORECASE) + # Optional import for URL fetching try: import requests @@ -366,7 +371,7 @@ def _parse_indeed(self, html: str) -> JobDetails: # Extract position position = self._extract_by_selectors(soup, self.INDEED_SELECTORS["position"]) if not position: - h1 = soup.find("h1", class_=re.compile(r"jobsearch-JobInfoHeader")) + h1 = soup.find("h1", class_=_INDEED_HEADER_PATTERN) # type: ignore[call-overload] position = h1.get_text(strip=True) if h1 else "" # Extract location @@ -447,10 +452,10 @@ def _parse_generic(self, html: str) -> JobDetails: salary = self._extract_salary_from_text(html) # Extract requirements section - look for heading tags first - requirements = [] + requirements: List[str] = [] req_heading = soup.find( ["h1", "h2", "h3", "h4", "h5", "h6"], - string=re.compile(r"requirements|qualifications|skills", re.IGNORECASE), + string=_REQUIREMENTS_HEADING_PATTERN, # type: ignore[call-overload] ) if req_heading: # Get the next sibling element(s) containing the list @@ -462,10 +467,10 @@ def _parse_generic(self, html: str) -> JobDetails: requirements = self._extract_list_by_keyword(html, "requirements") # Extract responsibilities section - responsibilities = [] + responsibilities: List[str] = [] resp_heading = soup.find( ["h1", "h2", "h3", "h4", "h5", "h6"], - string=re.compile(r"responsibilities|duties|what you", re.IGNORECASE), + string=_RESPONSIBILITIES_HEADING_PATTERN, # type: ignore[call-overload] ) if resp_heading: next_elem = resp_heading.find_next_sibling(["ul", "ol", "div", "p"]) @@ -734,21 +739,23 @@ def _extract_list_items(self, element: Tag) -> List[str]: return [item for item in items if len(item) > 3][:15] - def _extract_list_by_keyword(self, html: str, keyword: str) -> List[str]: + def 
_extract_list_by_keyword(self, html: str, keyword: Union[str, re.Pattern]) -> List[str]: """ Extract list items near a keyword. Args: html: HTML content - keyword: Keyword to search for + keyword: Keyword to search for (string or compiled regex) Returns: List of extracted items """ soup = BeautifulSoup(html, "lxml") + pattern = keyword if isinstance(keyword, re.Pattern) else re.compile(keyword, re.IGNORECASE) + # Find element containing the keyword - for elem in soup.find_all(string=re.compile(keyword, re.IGNORECASE)): + for elem in soup.find_all(string=pattern): # type: ignore[call-overload] parent = elem.find_parent(["div", "section", "ul", "li"]) if parent: # Look for list items in parent or siblings