From cae923dad606ef9a646c16f8a75f73d16a279458 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 1 May 2026 04:15:58 +0000 Subject: [PATCH] Pre-compile regex patterns in job parser Co-authored-by: anchapin <6326294+anchapin@users.noreply.github.com> --- cli/integrations/job_parser.py | 171 ++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 69 deletions(-) diff --git a/cli/integrations/job_parser.py b/cli/integrations/job_parser.py index 2c3cdf6..e13db87 100644 --- a/cli/integrations/job_parser.py +++ b/cli/integrations/job_parser.py @@ -19,7 +19,7 @@ import re from dataclasses import asdict, dataclass, field from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from bs4 import BeautifulSoup, Tag @@ -29,6 +29,68 @@ except ImportError: requests = None +# Pre-compile regex patterns for performance +_INDEED_HEADER_PATTERN = re.compile(r"jobsearch-JobInfoHeader") +_TITLE_SUFFIX_PATTERN = re.compile(r"\s*[-|]\s*.*$") +_COMPANY_PATTERN = re.compile(r'company["\s:]+([^"<>\n]+)', re.IGNORECASE) +_GENERIC_COMPANY_PATTERN = re.compile( + r'(?:company|employer|organization|hiring)[:\s]+([^"<>\n]+)', re.IGNORECASE +) +_LINKEDIN_COMPANY_PATTERN = re.compile( + r'(?:company|employer|organization)["\s:]+([^"<>\n]+)', re.IGNORECASE +) +_LOCATION_PATTERN = re.compile(r"(?:location|based|office)[:\s]+([^<>\n]+)", re.IGNORECASE) + +_REQ_HEADING_PATTERN = re.compile(r"requirements|qualifications|skills", re.IGNORECASE) +_RESP_HEADING_PATTERN = re.compile(r"responsibilities|duties|what you", re.IGNORECASE) +_REQUIREMENTS_KEYWORD = re.compile(r"requirements", re.IGNORECASE) +_RESPONSIBILITIES_KEYWORD = re.compile(r"responsibilities", re.IGNORECASE) + +_SALARY_PATTERNS = [ + re.compile(r"\$[\d,]+(?:\s*[-–to]+\s*\$[\d,]+)?", re.IGNORECASE), # $100k - $150k + re.compile(r"\$[\d,]+k(?:\s*[-–to]+\s*\$[\d,]+k)?", re.IGNORECASE), # $100k - $150k + re.compile(r"[\d,]+k(?:\s*[-–to]+\s*[\d,]+k)", re.IGNORECASE), # 100k - 150k + re.compile(r"(?:salary|pay|compensation)[:\s]*(\$[^<>\n]+)", re.IGNORECASE), # Salary: $X + re.compile(r"(?:per|/)\s*(?:year|annum)[:\s]*(\$[^<>\n]+)", re.IGNORECASE), # per year: $X +] +_CLEAN_SALARY_PATTERN = re.compile(r"\s+") +_K_SALARY_PATTERN = re.compile(r"\d+k", re.IGNORECASE) + +_REQ_SECTION_PATTERN = re.compile( + r"(?:^|\n)\s*(requirements?|qualifications?|what we(?:'re)? looking for|what you(?:'ll)? bring)\s*:?\s*\n", + re.IGNORECASE, +) +_RESP_SECTION_PATTERN = re.compile( + r"(?:^|\n)\s*(responsibilities?|duties?|what you(?:'ll)? do|your impact|key responsibilities)\s*:?\s*\n", + re.IGNORECASE, +) +_NEXT_SECTION_PATTERNS = [ + re.compile( + r"(?:^|\n)\s*(benefits|compensation|perks|about|company|team)\s*:?\s*\n", re.IGNORECASE + ), + re.compile(r"(?:^|\n)\s*(requirements?|qualifications?)\s*:?\s*\n", re.IGNORECASE), +] + +_BULLET_PATTERNS = [ + re.compile(r"[•\-\*]\s*([^\n]+)", re.MULTILINE), # Standard bullets + re.compile(r"^\s*\d+[\.\)]\s*([^\n]+)", re.MULTILINE), # Numbered lists +] +_COMMA_SPLIT_PATTERN = re.compile(r",\s*(?=[A-Z])") + +_JOB_TYPE_PATTERNS = [ + re.compile( + r"\b(full[- ]?time|part[- ]?time|contract|freelance|intern|temporary)\b", re.IGNORECASE + ), + re.compile(r"\b(permanent|fixed[- ]?term)\b", re.IGNORECASE), +] + +_EXPERIENCE_LEVEL_PATTERNS = [ + re.compile( + r"\b(entry[- ]?level|junior|mid[- ]?level|senior|staff|principal|lead)\b", re.IGNORECASE + ), + re.compile(r"\b(associate|vice[- ]?president|director|executive)\b", re.IGNORECASE), +] + @dataclass class JobDetails: @@ -297,9 +359,7 @@ def _parse_linkedin(self, html: str) -> JobDetails: company = self._extract_by_selectors(soup, self.LINKEDIN_SELECTORS["company"]) if not company: # Fallback: look for common patterns - company = self._extract_text_by_pattern( - html, r'(?:company|employer|organization)["\s:]+([^"<>\n]+)' - ) + company = self._extract_text_by_pattern(html, _LINKEDIN_COMPANY_PATTERN) # Extract position position = self._extract_by_selectors(soup, self.LINKEDIN_SELECTORS["position"]) @@ -361,12 +421,12 @@ def _parse_indeed(self, html: str) -> JobDetails: company = self._extract_by_selectors(soup, self.INDEED_SELECTORS["company"]) if not company: # Fallback patterns - company = self._extract_text_by_pattern(html, r'company["\s:]+([^"<>\n]+)') + company = self._extract_text_by_pattern(html, _COMPANY_PATTERN) # Extract position position = self._extract_by_selectors(soup, self.INDEED_SELECTORS["position"]) if not position: - h1 = soup.find("h1", class_=re.compile(r"jobsearch-JobInfoHeader")) + h1 = soup.find("h1", class_=_INDEED_HEADER_PATTERN) # type: ignore[call-overload] position = h1.get_text(strip=True) if h1 else "" # Extract location @@ -419,9 +479,7 @@ def _parse_generic(self, html: str) -> JobDetails: soup = BeautifulSoup(html, "lxml") # Try to extract company from various patterns - company = self._extract_text_by_pattern( - html, r'(?:company|employer|organization|hiring)[:\s]+([^"<>\n]+)' - ) + company = self._extract_text_by_pattern(html, _GENERIC_COMPANY_PATTERN) if not company: # Look for company in meta tags meta_company = soup.find("meta", attrs={"name": "company"}) @@ -438,10 +496,10 @@ def _parse_generic(self, html: str) -> JobDetails: if title_tag: title = title_tag.get_text(strip=True) # Remove common suffixes - position = re.sub(r"\s*[-|]\s*.*$", "", title) + position = _TITLE_SUFFIX_PATTERN.sub("", title) # Extract location - location = self._extract_text_by_pattern(html, r"(?:location|based|office)[:\s]+([^<>\n]+)") + location = self._extract_text_by_pattern(html, _LOCATION_PATTERN) # Extract salary salary = self._extract_salary_from_text(html) @@ -450,7 +508,7 @@ def _parse_generic(self, html: str) -> JobDetails: requirements = [] req_heading = soup.find( ["h1", "h2", "h3", "h4", "h5", "h6"], - string=re.compile(r"requirements|qualifications|skills", re.IGNORECASE), + string=_REQ_HEADING_PATTERN, # type: ignore[call-overload] ) if req_heading: # Get the next sibling element(s) containing the list @@ -459,20 +517,20 @@ def _parse_generic(self, html: str) -> JobDetails: requirements = self._extract_list_items(next_elem) if not requirements: # Try to find by text pattern - requirements = self._extract_list_by_keyword(html, "requirements") + requirements = self._extract_list_by_keyword(html, _REQUIREMENTS_KEYWORD) # Extract responsibilities section responsibilities = [] resp_heading = soup.find( ["h1", "h2", "h3", "h4", "h5", "h6"], - string=re.compile(r"responsibilities|duties|what you", re.IGNORECASE), + string=_RESP_HEADING_PATTERN, # type: ignore[call-overload] ) if resp_heading: next_elem = resp_heading.find_next_sibling(["ul", "ol", "div", "p"]) if next_elem: responsibilities = self._extract_list_items(next_elem) if not responsibilities: - responsibilities = self._extract_list_by_keyword(html, "responsibilities") + responsibilities = self._extract_list_by_keyword(html, _RESPONSIBILITIES_KEYWORD) # Detect remote status remote = self._detect_remote_status(html) @@ -529,18 +587,21 @@ def _find_by_selectors(self, soup: BeautifulSoup, selectors: List[str]) -> Optio return elem return None - def _extract_text_by_pattern(self, text: str, pattern: str) -> Optional[str]: + def _extract_text_by_pattern(self, text: str, pattern: Union[str, re.Pattern]) -> Optional[str]: """ Extract text using regex pattern. Args: text: Text to search - pattern: Regex pattern + pattern: Regex pattern string or compiled pattern Returns: Extracted text or None """ - match = re.search(pattern, text, re.IGNORECASE) + if isinstance(pattern, re.Pattern): + match = pattern.search(text) + else: + match = re.search(pattern, text, re.IGNORECASE) if match: return match.group(1).strip() return None @@ -555,22 +616,13 @@ def _extract_salary_from_text(self, text: str) -> Optional[str]: Returns: Salary string or None """ - # Common salary patterns - patterns = [ - r"\$[\d,]+(?:\s*[-–to]+\s*\$[\d,]+)?", # $100k - $150k - r"\$[\d,]+k(?:\s*[-–to]+\s*\$[\d,]+k)?", # $100k - $150k - r"[\d,]+k(?:\s*[-–to]+\s*[\d,]+k)", # 100k - 150k - r"(?:salary|pay|compensation)[:\s]*(\$[^<>\n]+)", # Salary: $X - r"(?:per|/)\s*(?:year|annum)[:\s]*(\$[^<>\n]+)", # per year: $X - ] - - for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE) + for pattern in _SALARY_PATTERNS: + match = pattern.search(text) if match: salary = match.group(0) if match.lastindex is None else match.group(1) # Clean up the salary string - salary = re.sub(r"\s+", " ", salary.strip()) - if "$" not in salary and re.search(r"\d+k", salary): + salary = _CLEAN_SALARY_PATTERN.sub(" ", salary.strip()) + if "$" not in salary and _K_SALARY_PATTERN.search(salary): salary = "$" + salary return salary @@ -592,14 +644,9 @@ def _extract_sections_from_description(self, description: str) -> Tuple[List[str if not description: return requirements, responsibilities - # Find section boundaries using regex - # Match section headers with optional colon, at start of line or after newline - req_pattern = r"(?:^|\n)\s*(requirements?|qualifications?|what we(?:'re)? looking for|what you(?:'ll)? bring)\s*:?\s*\n" - resp_pattern = r"(?:^|\n)\s*(responsibilities?|duties?|what you(?:'ll)? do|your impact|key responsibilities)\s*:?\s*\n" - # Find positions of section headers - req_match = re.search(req_pattern, description, re.IGNORECASE) - resp_match = re.search(resp_pattern, description, re.IGNORECASE) + req_match = _REQ_SECTION_PATTERN.search(description) + resp_match = _RESP_SECTION_PATTERN.search(description) req_start = req_match.start() if req_match else -1 resp_start = resp_match.start() if resp_match else -1 @@ -618,13 +665,9 @@ def _extract_sections_from_description(self, description: str) -> Tuple[List[str # Extract responsibilities section if resp_start >= 0: # Find end of responsibilities section (look for next section or end) - next_section_patterns = [ - r"(?:^|\n)\s*(benefits|compensation|perks|about|company|team)\s*:?\s*\n", - r"(?:^|\n)\s*(requirements?|qualifications?)\s*:?\s*\n", - ] resp_end = len(description) - for pattern in next_section_patterns: - next_match = re.search(pattern, description[resp_start:], re.IGNORECASE) + for pattern in _NEXT_SECTION_PATTERNS: + next_match = pattern.search(description[resp_start:]) if next_match: resp_end = resp_start + next_match.start() break @@ -673,13 +716,8 @@ def _extract_items_from_text(self, text: str) -> List[str]: ] # Match bullet points - bullet_patterns = [ - r"[•\-\*]\s*([^\n]+)", # Standard bullets - r"^\s*\d+[\.\)]\s*([^\n]+)", # Numbered lists - ] - - for pattern in bullet_patterns: - matches = re.findall(pattern, text, re.MULTILINE) + for pattern in _BULLET_PATTERNS: + matches = pattern.findall(text) if matches: items = [m.strip() for m in matches if m.strip() and len(m.strip()) > 5] break @@ -706,7 +744,7 @@ def _extract_items_from_text(self, text: str) -> List[str]: # If still no items, try comma-separated if not items: - parts = re.split(r",\s*(?=[A-Z])", text) + parts = _COMMA_SPLIT_PATTERN.split(text) items = [p.strip() for p in parts if p.strip() and len(p.strip()) > 5] return items[:15] @@ -734,21 +772,26 @@ def _extract_list_items(self, element: Tag) -> List[str]: return [item for item in items if len(item) > 3][:15] - def _extract_list_by_keyword(self, html: str, keyword: str) -> List[str]: + def _extract_list_by_keyword(self, html: str, keyword: Union[str, re.Pattern]) -> List[str]: """ Extract list items near a keyword. Args: html: HTML content - keyword: Keyword to search for + keyword: Keyword to search for (string or compiled regex) Returns: List of extracted items """ soup = BeautifulSoup(html, "lxml") + if isinstance(keyword, str): + search_string = re.compile(keyword, re.IGNORECASE) + else: + search_string = keyword + # Find element containing the keyword - for elem in soup.find_all(string=re.compile(keyword, re.IGNORECASE)): + for elem in soup.find_all(string=search_string): # type: ignore[call-overload] parent = elem.find_parent(["div", "section", "ul", "li"]) if parent: # Look for list items in parent or siblings @@ -805,13 +848,8 @@ def _extract_job_type(self, html: str) -> Optional[str]: Returns: Job type string or None """ - patterns = [ - r"\b(full[- ]?time|part[- ]?time|contract|freelance|intern|temporary)\b", - r"\b(permanent|fixed[- ]?term)\b", - ] - - for pattern in patterns: - match = re.search(pattern, html, re.IGNORECASE) + for pattern in _JOB_TYPE_PATTERNS: + match = pattern.search(html) if match: return match.group(1).lower().replace("-", "-") @@ -827,13 +865,8 @@ def _extract_experience_level(self, html: str) -> Optional[str]: Returns: Experience level string or None """ - patterns = [ - r"\b(entry[- ]?level|junior|mid[- ]?level|senior|staff|principal|lead)\b", - r"\b(associate|vice[- ]?president|director|executive)\b", - ] - - for pattern in patterns: - match = re.search(pattern, html, re.IGNORECASE) + for pattern in _EXPERIENCE_LEVEL_PATTERNS: + match = pattern.search(html) if match: return match.group(1).lower().replace("-", "-")