def check_urls_with_google_safe_browsing(urls_batch):
    """Look up URLs with the Google Safe Browsing v4 ``threatMatches:find`` API.

    The check is best-effort: any failure is printed as a warning and never
    raised, so a Safe Browsing outage cannot break the workflow.

    Args:
        urls_batch: Iterable of URL strings to check. Duplicates are sent once.

    Returns:
        dict mapping each flagged URL to ``"Google Safe Browsing: <threatType>"``.
        Empty when nothing is flagged, when the batch is empty, or when the
        ``GOOGLE_SAFE_BROWSING_API_KEY`` environment variable is not set.
        If a request fails part-way, matches collected so far are returned.
    """
    api_key = os.getenv('GOOGLE_SAFE_BROWSING_API_KEY')
    if not api_key:
        print("⚠️ Google Safe Browsing API key not found, skipping external check")
        return {}

    # De-duplicate while keeping order; nothing to ask the API for an empty batch.
    urls = list(dict.fromkeys(urls_batch or ()))
    if not urls:
        return {}

    # The key is passed via ``params`` rather than baked into the URL string.
    api_url = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
    # The Lookup API accepts a limited number of entries per request, so larger
    # batches are split instead of being rejected wholesale.
    max_entries_per_request = 500

    blocked_urls = {}
    try:
        for start in range(0, len(urls), max_entries_per_request):
            chunk = urls[start:start + max_entries_per_request]
            payload = {
                "client": {
                    "clientId": "aws-modernization-security",
                    "clientVersion": "1.0",
                },
                "threatInfo": {
                    "threatTypes": [
                        "MALWARE",
                        "SOCIAL_ENGINEERING",
                        "UNWANTED_SOFTWARE",
                        "POTENTIALLY_HARMFUL_APPLICATION",
                    ],
                    "platformTypes": ["ANY_PLATFORM"],
                    "threatEntryTypes": ["URL"],
                    "threatEntries": [{"url": url} for url in chunk],
                },
            }

            response = requests.post(
                api_url, params={"key": api_key}, json=payload, timeout=10
            )
            if response.status_code != 200:
                print(f"⚠️ Google Safe Browsing API error: {response.status_code}")
                continue

            # A response without "matches" means nothing in the chunk was flagged.
            for match in response.json().get('matches', []):
                flagged_url = match.get('threat', {}).get('url')
                if flagged_url:
                    threat_type = match.get('threatType', 'UNKNOWN')
                    blocked_urls[flagged_url] = f"Google Safe Browsing: {threat_type}"
    except Exception as e:  # best-effort boundary: report and carry on
        # Request errors can echo the full request URL; never print the key.
        message = str(e).replace(api_key, '***')
        print(f"⚠️ Google Safe Browsing check failed: {message}")

    return blocked_urls
# Patterns are compiled once; they are applied to every scanned file/diff line.
_HTTP_URL_RE = re.compile(r'https?://[^\s<>"\'`\)]+')
_PROTOCOL_RELATIVE_RE = re.compile(r'//[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}[^\s<>"\'`\)]*')
_BARE_DOMAIN_RE = re.compile(
    r'\b(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?(?:/[^\s<>"\'`\)]*)?'
)

# Substrings that mark a bare-domain match as a likely false positive
# (local hosts, placeholder domains, file names, dotted identifiers).
_SKIP_FRAGMENTS = (
    'localhost', '127.0.0.1', 'example.com', 'test.com',
    '.js', '.css', '.json', '.xml', '.py', '.java',
    'version.', 'config.', 'package.', 'github.com',
)

# Sentence punctuation that commonly trails a URL in prose but is not part of it.
_TRAILING_PUNCTUATION = '.,;:!?'


def _consume_matches(pattern, text):
    """Return ``(matches, text)`` where every match in *text* is blanked out."""
    return pattern.findall(text), pattern.sub(' ', text)


def extract_urls_from_content(content):
    """Extract URLs from a single content string.

    Three forms are recognised, in this order: absolute ``http(s)://`` URLs,
    protocol-relative ``//host/...`` URLs (returned with ``https:``), and bare
    domains such as ``www.example.org/path`` (returned with ``https://``).
    Text matched by an earlier form is removed before the next form is
    searched, so an ``http://`` URL no longer also yields a fabricated
    ``https://`` copy of itself. Trailing sentence punctuation is stripped.

    Args:
        content: Text to scan.

    Returns:
        list of unique URL strings (order unspecified).
    """
    http_urls, remaining = _consume_matches(_HTTP_URL_RE, content)
    relative_urls, remaining = _consume_matches(_PROTOCOL_RELATIVE_RE, remaining)
    bare_domains = _BARE_DOMAIN_RE.findall(remaining)

    urls = {url.rstrip(_TRAILING_PUNCTUATION) for url in http_urls}
    urls.update('https:' + url.rstrip(_TRAILING_PUNCTUATION) for url in relative_urls)

    for domain in bare_domains:
        lowered = domain.lower()
        if not any(skip in lowered for skip in _SKIP_FRAGMENTS):
            urls.add('https://' + domain.rstrip(_TRAILING_PUNCTUATION))

    return list(urls)


def extract_urls_from_file(file_path):
    """Extract URLs with and without protocols from any file.

    Args:
        file_path: Path of the file to read (decoded as UTF-8, undecodable
            bytes ignored).

    Returns:
        list of unique URL strings; empty if the file cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Unreadable entries (permissions, broken symlinks, ...) are skipped.
        return []

    # Same rules as for diff lines, so both scan modes agree on what a URL is.
    return extract_urls_from_content(content)


def extract_urls_from_diff():
    """Extract URLs from lines added by the latest commit (``HEAD~1..HEAD``).

    Returns:
        list of unique URL strings found on added lines; empty when git
        cannot be run or the diff fails (e.g. there is no ``HEAD~1``).
    """
    try:
        # Decode leniently: odd bytes in a diff must not abort the whole scan.
        result = subprocess.run(
            ['git', 'diff', 'HEAD~1', 'HEAD'],
            capture_output=True, encoding='utf-8', errors='replace',
        )
    except OSError as exc:
        print(f"⚠️ Could not run git diff: {exc}")
        return []

    if result.returncode != 0:
        print(f"⚠️ git diff failed: {result.stderr.strip()}")
        return []

    urls = set()
    for line in result.stdout.splitlines():
        # '+' marks an added line; '+++' is the new-file header, not content.
        if line.startswith('+') and not line.startswith('+++'):
            urls.update(extract_urls_from_content(line[1:]))
    return list(urls)
def find_all_urls(root='.'):
    """Scan all files under *root* for URLs (monthly scan).

    Args:
        root: Directory to walk. Defaults to the current directory, which is
            what the workflow scans.

    Returns:
        Sorted list of unique URL strings found by ``extract_urls_from_file``
        in every file that is not skipped. Sorting keeps the log output
        stable between runs.
    """
    # Directories that hold VCS metadata or third-party/generated content.
    skipped_dirs = {'.git', 'node_modules', '__pycache__', '.venv'}
    # Binary formats; compared case-insensitively so 'LOGO.PNG' is skipped too.
    binary_suffixes = (
        '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.pdf',
        '.zip', '.tar', '.gz', '.exe', '.bin', '.dll',
    )

    all_urls = set()
    for current_dir, dirs, files in os.walk(root):
        # Prune in place so os.walk does not descend into skipped directories.
        dirs[:] = [d for d in dirs if d not in skipped_dirs]

        for filename in files:
            if filename.lower().endswith(binary_suffixes):
                continue
            file_path = os.path.join(current_dir, filename)
            all_urls.update(extract_urls_from_file(file_path))

    return sorted(all_urls)
URL pattern clean, fetching content...") + + # Then check the actual content if URL is clean + response = requests.get(url, timeout=15, allow_redirects=True, + headers={'User-Agent': 'Mozilla/5.0 (compatible; SecurityBot/1.0)'}) + + print(f" 📊 Response status: {response.status_code}") + print(f" 📏 Content length: {len(response.text)} characters") + + # Get more content for analysis (50KB instead of 5KB) + content = response.text[:50000].lower() + full_url = response.url.lower() + + # Get page title + title = "" + title_match = re.search(r'