diff --git a/.jules/bolt.md b/.jules/bolt.md index 254b8d5..6343e82 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -13,3 +13,7 @@ ## 2025-02-18 - Regex Pre-compilation in Hot Paths **Learning:** Re-compiling regexes inside a frequently called function (like `latex_escape` which runs for every string) creates significant overhead. Pre-compiling them at module level yielded a ~3.2x speedup. **Action:** Always look for regex compilations inside loops or frequently called functions and move them to module level constants. + +## 2025-02-18 - Regex Pre-compilation in Web Scraping +**Learning:** Instantiating `re.compile` inline in arguments to parsing lookups (e.g. `soup.find`) or inside list comprehensions over HTML strings incurs redundant overhead during document parsing. +**Action:** Lift regex compilations for BeautifulSoup lookups and string extractions to module-level constants to ensure they're evaluated exactly once. diff --git a/cli/integrations/job_parser.py b/cli/integrations/job_parser.py index 2c3cdf6..9c7dd37 100644 --- a/cli/integrations/job_parser.py +++ b/cli/integrations/job_parser.py @@ -19,10 +19,63 @@ import re from dataclasses import asdict, dataclass, field from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from bs4 import BeautifulSoup, Tag +# Pre-compile regex patterns for performance +_INDEED_HEADER_PATTERN = re.compile(r"jobsearch-JobInfoHeader") +_INDEED_TITLE_SUFFIX_PATTERN = re.compile(r"\s*[-|]\s*.*$") +_REQ_HEADING_PATTERN = re.compile(r"requirements|qualifications|skills", re.IGNORECASE) +_RESP_HEADING_PATTERN = re.compile(r"responsibilities|duties|what you", re.IGNORECASE) + +_SALARY_PATTERNS = [ + re.compile(r"\$[\d,]+(?:\s*[-–to]+\s*\$[\d,]+)?", re.IGNORECASE), # $100,000 - $150,000 + re.compile(r"\$[\d,]+k(?:\s*[-–to]+\s*\$[\d,]+k)?", re.IGNORECASE), # $100k - $150k + re.compile(r"[\d,]+k(?:\s*[-–to]+\s*[\d,]+k)", re.IGNORECASE), # 100k - 150k + 
re.compile(r"(?:salary|pay|compensation)[:\s]*(\$[^<>\n]+)", re.IGNORECASE), # Salary: $X + re.compile(r"(?:per|/)\s*(?:year|annum)[:\s]*(\$[^<>\n]+)", re.IGNORECASE), # per year: $X +] +_SALARY_CLEAN_PATTERN = re.compile(r"\s+") +_SALARY_K_PATTERN = re.compile(r"\d+k") + +_REQ_SECTION_PATTERN = re.compile( + r"(?:^|\n)\s*(requirements?|qualifications?|what we(?:'re)? looking for|what you(?:'ll)? bring)\s*:?\s*\n", + re.IGNORECASE, +) +_RESP_SECTION_PATTERN = re.compile( + r"(?:^|\n)\s*(responsibilities?|duties?|what you(?:'ll)? do|your impact|key responsibilities)\s*:?\s*\n", + re.IGNORECASE, +) + +_NEXT_SECTION_PATTERNS = [ + re.compile( + r"(?:^|\n)\s*(benefits|compensation|perks|about|company|team)\s*:?\s*\n", re.IGNORECASE + ), + re.compile(r"(?:^|\n)\s*(requirements?|qualifications?)\s*:?\s*\n", re.IGNORECASE), +] + +_BULLET_PATTERNS = [ + re.compile(r"[•\-\*]\s*([^\n]+)", re.MULTILINE), # Standard bullets + re.compile(r"^\s*\d+[\.\)]\s*([^\n]+)", re.MULTILINE), # Numbered lists +] + +_COMMA_SEP_PATTERN = re.compile(r",\s*(?=[A-Z])") + +_JOB_TYPE_PATTERNS = [ + re.compile( + r"\b(full[- ]?time|part[- ]?time|contract|freelance|intern|temporary)\b", re.IGNORECASE + ), + re.compile(r"\b(permanent|fixed[- ]?term)\b", re.IGNORECASE), +] + +_EXPERIENCE_LEVEL_PATTERNS = [ + re.compile( + r"\b(entry[- ]?level|junior|mid[- ]?level|senior|staff|principal|lead)\b", re.IGNORECASE + ), + re.compile(r"\b(associate|vice[- ]?president|director|executive)\b", re.IGNORECASE), +] + # Optional import for URL fetching try: import requests @@ -366,7 +419,7 @@ def _parse_indeed(self, html: str) -> JobDetails: # Extract position position = self._extract_by_selectors(soup, self.INDEED_SELECTORS["position"]) if not position: - h1 = soup.find("h1", class_=re.compile(r"jobsearch-JobInfoHeader")) + h1 = soup.find("h1", class_=_INDEED_HEADER_PATTERN) # type: ignore[call-overload] position = h1.get_text(strip=True) if h1 else "" # Extract location @@ -438,7 +491,7 @@ def 
_parse_generic(self, html: str) -> JobDetails: if title_tag: title = title_tag.get_text(strip=True) # Remove common suffixes - position = re.sub(r"\s*[-|]\s*.*$", "", title) + position = _INDEED_TITLE_SUFFIX_PATTERN.sub("", title) # Extract location location = self._extract_text_by_pattern(html, r"(?:location|based|office)[:\s]+([^<>\n]+)") @@ -450,7 +503,7 @@ def _parse_generic(self, html: str) -> JobDetails: requirements = [] req_heading = soup.find( ["h1", "h2", "h3", "h4", "h5", "h6"], - string=re.compile(r"requirements|qualifications|skills", re.IGNORECASE), + string=_REQ_HEADING_PATTERN, # type: ignore[call-overload] ) if req_heading: # Get the next sibling element(s) containing the list @@ -465,7 +518,7 @@ def _parse_generic(self, html: str) -> JobDetails: responsibilities = [] resp_heading = soup.find( ["h1", "h2", "h3", "h4", "h5", "h6"], - string=re.compile(r"responsibilities|duties|what you", re.IGNORECASE), + string=_RESP_HEADING_PATTERN, # type: ignore[call-overload] ) if resp_heading: next_elem = resp_heading.find_next_sibling(["ul", "ol", "div", "p"]) @@ -529,7 +582,7 @@ def _find_by_selectors(self, soup: BeautifulSoup, selectors: List[str]) -> Optio return elem return None - def _extract_text_by_pattern(self, text: str, pattern: str) -> Optional[str]: + def _extract_text_by_pattern(self, text: str, pattern: Union[str, re.Pattern]) -> Optional[str]: """ Extract text using regex pattern. 
@@ -540,7 +593,11 @@ def _extract_text_by_pattern(self, text: str, pattern: str) -> Optional[str]: Returns: Extracted text or None """ - match = re.search(pattern, text, re.IGNORECASE) + if isinstance(pattern, str): + match = re.search(pattern, text, re.IGNORECASE) + else: + match = pattern.search(text) + if match: return match.group(1).strip() return None @@ -555,22 +612,13 @@ def _extract_salary_from_text(self, text: str) -> Optional[str]: Returns: Salary string or None """ - # Common salary patterns - patterns = [ - r"\$[\d,]+(?:\s*[-–to]+\s*\$[\d,]+)?", # $100k - $150k - r"\$[\d,]+k(?:\s*[-–to]+\s*\$[\d,]+k)?", # $100k - $150k - r"[\d,]+k(?:\s*[-–to]+\s*[\d,]+k)", # 100k - 150k - r"(?:salary|pay|compensation)[:\s]*(\$[^<>\n]+)", # Salary: $X - r"(?:per|/)\s*(?:year|annum)[:\s]*(\$[^<>\n]+)", # per year: $X - ] - - for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE) + for pattern in _SALARY_PATTERNS: + match = pattern.search(text) if match: salary = match.group(0) if match.lastindex is None else match.group(1) # Clean up the salary string - salary = re.sub(r"\s+", " ", salary.strip()) - if "$" not in salary and re.search(r"\d+k", salary): + salary = _SALARY_CLEAN_PATTERN.sub(" ", salary.strip()) + if "$" not in salary and _SALARY_K_PATTERN.search(salary): salary = "$" + salary return salary @@ -593,13 +641,9 @@ def _extract_sections_from_description(self, description: str) -> Tuple[List[str return requirements, responsibilities # Find section boundaries using regex - # Match section headers with optional colon, at start of line or after newline - req_pattern = r"(?:^|\n)\s*(requirements?|qualifications?|what we(?:'re)? looking for|what you(?:'ll)? bring)\s*:?\s*\n" - resp_pattern = r"(?:^|\n)\s*(responsibilities?|duties?|what you(?:'ll)? 
do|your impact|key responsibilities)\s*:?\s*\n" - # Find positions of section headers - req_match = re.search(req_pattern, description, re.IGNORECASE) - resp_match = re.search(resp_pattern, description, re.IGNORECASE) + req_match = _REQ_SECTION_PATTERN.search(description) + resp_match = _RESP_SECTION_PATTERN.search(description) req_start = req_match.start() if req_match else -1 resp_start = resp_match.start() if resp_match else -1 @@ -618,13 +662,9 @@ def _extract_sections_from_description(self, description: str) -> Tuple[List[str # Extract responsibilities section if resp_start >= 0: # Find end of responsibilities section (look for next section or end) - next_section_patterns = [ - r"(?:^|\n)\s*(benefits|compensation|perks|about|company|team)\s*:?\s*\n", - r"(?:^|\n)\s*(requirements?|qualifications?)\s*:?\s*\n", - ] resp_end = len(description) - for pattern in next_section_patterns: - next_match = re.search(pattern, description[resp_start:], re.IGNORECASE) + for pattern in _NEXT_SECTION_PATTERNS: + next_match = pattern.search(description[resp_start:]) if next_match: resp_end = resp_start + next_match.start() break @@ -673,13 +713,8 @@ def _extract_items_from_text(self, text: str) -> List[str]: ] # Match bullet points - bullet_patterns = [ - r"[•\-\*]\s*([^\n]+)", # Standard bullets - r"^\s*\d+[\.\)]\s*([^\n]+)", # Numbered lists - ] - - for pattern in bullet_patterns: - matches = re.findall(pattern, text, re.MULTILINE) + for pattern in _BULLET_PATTERNS: + matches = pattern.findall(text) if matches: items = [m.strip() for m in matches if m.strip() and len(m.strip()) > 5] break @@ -706,7 +741,7 @@ def _extract_items_from_text(self, text: str) -> List[str]: # If still no items, try comma-separated if not items: - parts = re.split(r",\s*(?=[A-Z])", text) + parts = _COMMA_SEP_PATTERN.split(text) items = [p.strip() for p in parts if p.strip() and len(p.strip()) > 5] return items[:15] @@ -734,7 +769,7 @@ def _extract_list_items(self, element: Tag) -> List[str]: return 
[item for item in items if len(item) > 3][:15] - def _extract_list_by_keyword(self, html: str, keyword: str) -> List[str]: + def _extract_list_by_keyword(self, html: str, keyword: Union[str, re.Pattern]) -> List[str]: """ Extract list items near a keyword. @@ -747,8 +782,11 @@ def _extract_list_by_keyword(self, html: str, keyword: str) -> List[str]: """ soup = BeautifulSoup(html, "lxml") + if isinstance(keyword, str): + keyword = re.compile(keyword, re.IGNORECASE) + # Find element containing the keyword - for elem in soup.find_all(string=re.compile(keyword, re.IGNORECASE)): + for elem in soup.find_all(string=keyword): # type: ignore[call-overload] parent = elem.find_parent(["div", "section", "ul", "li"]) if parent: # Look for list items in parent or siblings @@ -805,13 +843,8 @@ def _extract_job_type(self, html: str) -> Optional[str]: Returns: Job type string or None """ - patterns = [ - r"\b(full[- ]?time|part[- ]?time|contract|freelance|intern|temporary)\b", - r"\b(permanent|fixed[- ]?term)\b", - ] - - for pattern in patterns: - match = re.search(pattern, html, re.IGNORECASE) + for pattern in _JOB_TYPE_PATTERNS: + match = pattern.search(html) if match: return match.group(1).lower().replace("-", "-") @@ -827,13 +860,8 @@ def _extract_experience_level(self, html: str) -> Optional[str]: Returns: Experience level string or None """ - patterns = [ - r"\b(entry[- ]?level|junior|mid[- ]?level|senior|staff|principal|lead)\b", - r"\b(associate|vice[- ]?president|director|executive)\b", - ] - - for pattern in patterns: - match = re.search(pattern, html, re.IGNORECASE) + for pattern in _EXPERIENCE_LEVEL_PATTERNS: + match = pattern.search(html) if match: return match.group(1).lower().replace("-", "-")