# URL Security Check
#
# Scans URLs that appear in the repository for malware (Google Safe Browsing)
# and inappropriate content (keyword analysis of the URL and fetched page).
# On push: scans only the latest commit's added lines and auto-reverts on
# violation. On schedule: scans the whole repository monthly.
name: URL Security Check

on:
  pull_request:
  push:
    branches: [ main, master ]
  schedule:
    # Run monthly on the 1st at 2 AM UTC
    - cron: '0 2 1 * *'
  workflow_dispatch: # Allow manual trigger

# The revert step pushes with the default GITHUB_TOKEN, which needs
# write access to repository contents.
permissions:
  contents: write

jobs:
  url-security-scan:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # full history so `git diff HEAD~1 HEAD` works

      - name: Check URLs for inappropriate content
        env:
          # BUGFIX: the script reads this with os.getenv(); without this env
          # mapping the secret was never exposed to the step and the Safe
          # Browsing check always silently skipped.
          GOOGLE_SAFE_BROWSING_API_KEY: ${{ secrets.GOOGLE_SAFE_BROWSING_API_KEY }}
        run: |
          # BUGFIX: `requests` is third-party; install it explicitly instead of
          # relying on the runner image to ship it.
          python3 -m pip install --quiet requests
          python3 << 'EOF'
          import os
          import re
          import subprocess
          import sys
          from concurrent.futures import ThreadPoolExecutor

          import requests

          # Safe Browsing v4 caps threatEntries per request, so batches are chunked.
          GSB_BATCH_LIMIT = 500

          INAPPROPRIATE_KEYWORDS = [
              'porn', 'xxx', 'sex', 'nude', 'erotic', 'nsfw', '18+', 'explicit',
              'hardcore', 'webcam', 'escort', 'fetish', 'adult-content',
              'cam-girl', 'live-sex', 'free-porn', 'hot-girls', 'naked',
          ]

          # Contexts that legitimise an otherwise-flagged keyword.
          EDUCATIONAL_URL_HINTS = [
              'adult-education', 'adult-learning', 'continuing-education',
              'sex-education', 'sexual-health', 'medical', 'academic',
          ]
          EDUCATIONAL_TEXT_HINTS = [
              'adult education', 'adult learning', 'continuing education',
              'sex education', 'sexual health', 'medical', 'academic',
          ]

          # Tokens that mark obvious false positives from the bare-domain regex
          # (file names, local hosts, well-known infrastructure domains).
          DOMAIN_SKIP_TOKENS = [
              'localhost', '127.0.0.1', 'example.com', 'test.com',
              '.js', '.css', '.json', '.xml', '.py', '.java',
              'version.', 'config.', 'package.', 'github.com',
          ]

          BINARY_SUFFIXES = ('.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.pdf',
                             '.zip', '.tar', '.gz', '.exe', '.bin', '.dll')

          def check_urls_with_google_safe_browsing(urls_batch):
              """Return {url: reason} for URLs flagged by Google Safe Browsing.

              Missing API key or API/network errors degrade to an empty result
              (best-effort external check); the keyword analysis still runs.
              """
              api_key = os.getenv('GOOGLE_SAFE_BROWSING_API_KEY')
              if not api_key:
                  print("⚠️ Google Safe Browsing API key not found, skipping external check")
                  return {}

              api_url = f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={api_key}"
              blocked_urls = {}

              # Chunk to respect the per-request threatEntries limit.
              for start in range(0, len(urls_batch), GSB_BATCH_LIMIT):
                  chunk = urls_batch[start:start + GSB_BATCH_LIMIT]
                  payload = {
                      "client": {
                          "clientId": "aws-modernization-security",
                          "clientVersion": "1.0",
                      },
                      "threatInfo": {
                          "threatTypes": [
                              "MALWARE",
                              "SOCIAL_ENGINEERING",
                              "UNWANTED_SOFTWARE",
                              "POTENTIALLY_HARMFUL_APPLICATION",
                          ],
                          "platformTypes": ["ANY_PLATFORM"],
                          "threatEntryTypes": ["URL"],
                          "threatEntries": [{"url": url} for url in chunk],
                      },
                  }
                  try:
                      response = requests.post(api_url, json=payload, timeout=10)
                  except requests.RequestException as e:
                      print(f"⚠️ Google Safe Browsing check failed: {e}")
                      continue
                  if response.status_code != 200:
                      print(f"⚠️ Google Safe Browsing API error: {response.status_code}")
                      continue
                  for match in response.json().get('matches', []):
                      threat_type = match['threatType']
                      blocked_urls[match['threat']['url']] = f"Google Safe Browsing: {threat_type}"
              return blocked_urls

          def extract_urls_from_content(content):
              """Extract candidate URLs (with or without a scheme) from a text blob."""
              urls = set()

              # 1. Standard HTTP/HTTPS URLs.
              urls.update(re.findall(r'https?://[^\s<>"\'`\)]+', content))

              # 2. Protocol-relative URLs (//example.com) — assume https.
              for url in re.findall(r'//[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}[^\s<>"\'`\)]*', content):
                  urls.add('https:' + url)

              # 3. Bare domains (example.com, www.example.com/...).
              domain_pattern = r'\b(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?(?:/[^\s<>"\'`\)]*)?'
              for domain in re.findall(domain_pattern, content):
                  if not any(skip in domain.lower() for skip in DOMAIN_SKIP_TOKENS):
                      urls.add('https://' + domain)

              return list(urls)

          def extract_urls_from_file(file_path):
              """Extract URLs from a file; unreadable files yield an empty list.

              Delegates to extract_urls_from_content — the original duplicated
              the whole regex pipeline here.
              """
              try:
                  with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                      return extract_urls_from_content(f.read())
              except OSError:
                  return []

          def extract_urls_from_diff():
              """Extract URLs from lines added by the most recent commit."""
              try:
                  result = subprocess.run(
                      ['git', 'diff', 'HEAD~1', 'HEAD'],
                      capture_output=True, text=True, check=False,
                  )
              except OSError:
                  return []
              urls = set()
              for line in result.stdout.split('\n'):
                  # '+' lines are additions; '+++' is the file header, skip it.
                  if line.startswith('+') and not line.startswith('+++'):
                      urls.update(extract_urls_from_content(line))
              return list(urls)

          def find_all_urls():
              """Scan every text file in the repository (monthly full scan)."""
              all_urls = set()
              for root, dirs, files in os.walk('.'):
                  # Prune vendored / generated directories in place so os.walk
                  # never descends into them.
                  dirs[:] = [d for d in dirs
                             if d not in ['.git', 'node_modules', '__pycache__', '.venv']]
                  for file in files:
                      if file.endswith(BINARY_SUFFIXES):
                          continue
                      all_urls.update(extract_urls_from_file(os.path.join(root, file)))
              return list(all_urls)

          def check_url_content(url):
              """Classify one URL by its string and its fetched page content.

              Returns (is_blocked, reason). reason is None for clean URLs, a
              violation description for blocked ones, and an 'access_failed: ...'
              marker when the URL could not be fetched — the caller reports those
              as UNKNOWN rather than clean (the original computed this marker but
              then displayed the URL as ✅ Clean anyway).
              """
              try:
                  if not url.startswith(('http://', 'https://')):
                      url = 'https://' + url

                  print(f"  🔍 Checking URL: {url}")

                  # First pass: flag the URL string itself (no network needed).
                  url_lower = url.lower()
                  for keyword in INAPPROPRIATE_KEYWORDS:
                      if keyword in url_lower:
                          if any(edu in url_lower for edu in EDUCATIONAL_URL_HINTS):
                              continue  # educational context, allow
                          print(f"  ❌ URL contains inappropriate keyword: {keyword}")
                          return True, f"inappropriate URL pattern: {keyword}"

                  print("  ✅ URL pattern clean, fetching content...")

                  # Second pass: fetch and analyse the actual page.
                  response = requests.get(
                      url, timeout=15, allow_redirects=True,
                      headers={'User-Agent': 'Mozilla/5.0 (compatible; SecurityBot/1.0)'},
                  )
                  print(f"  📊 Response status: {response.status_code}")
                  print(f"  📏 Content length: {len(response.text)} characters")

                  # Analyse up to 50 KB of the body plus the final (post-redirect) URL.
                  content = response.text[:50000].lower()
                  full_url = response.url.lower()

                  # BUGFIX: the original pattern was a truncated fragment
                  # (']*>([^<]+)') that could never match a real title element.
                  title = ""
                  title_match = re.search(r'<title[^>]*>([^<]+)</title>', content)
                  if title_match:
                      title = title_match.group(1).lower()
                      print(f"  📄 Page title: {title}")

                  content_preview = content[:500].replace('\n', ' ').replace('\r', ' ')
                  print(f"  📝 Content preview: {content_preview}...")

                  full_analysis = f"{full_url} {title} {content}"
                  for keyword in INAPPROPRIATE_KEYWORDS:
                      if keyword in full_analysis:
                          if any(edu in full_analysis for edu in EDUCATIONAL_TEXT_HINTS):
                              print(f"  ℹ️ Found '{keyword}' but in educational context, allowing")
                              continue
                          print(f"  ❌ Found inappropriate content: {keyword}")
                          return True, f"inappropriate content: {keyword}"

                  print("  ✅ Content analysis complete - no violations found")
                  return False, None

              except Exception as e:
                  print(f"  ⚠️ Error checking {url}: {e}")
                  print("  ⚠️ Marking as UNKNOWN (not clean) due to access failure")
                  # Unreachable URLs are not blocked, but they are surfaced as
                  # UNKNOWN by the caller instead of being hidden as clean.
                  return False, f"access_failed: {e}"

          # ---- main flow -------------------------------------------------------

          # Scheduled runs scan everything; push/PR runs scan only the diff.
          if os.getenv('GITHUB_EVENT_NAME') == 'schedule':
              urls = find_all_urls()
              scan_type = "Monthly full repository scan"
          else:
              urls = extract_urls_from_diff()
              scan_type = "Commit diff scan"

          if not urls:
              print(f"✅ {scan_type}: No URLs found")
              sys.exit(0)

          print(f"🔍 {scan_type}: Found {len(urls)} URLs to check...")

          # Step 1: Google Safe Browsing check (fast, batched).
          print("🛡️ Checking URLs with Google Safe Browsing API...")
          google_blocked = check_urls_with_google_safe_browsing(urls)

          # Step 2: content analysis for whatever Safe Browsing did not flag.
          remaining_urls = [url for url in urls if url not in google_blocked]
          print(f"🔍 Analyzing content for {len(remaining_urls)} URLs...")

          content_blocked = {}
          unreachable = {}
          if remaining_urls:
              with ThreadPoolExecutor(max_workers=10) as executor:
                  results = list(executor.map(check_url_content, remaining_urls))
              for url, (is_blocked, reason) in zip(remaining_urls, results):
                  if is_blocked:
                      content_blocked[url] = reason
                  elif reason:
                      # access_failed — could not verify, keep visible as UNKNOWN.
                      unreachable[url] = reason

          all_blocked = {**google_blocked, **content_blocked}

          # Per-URL report.
          for url in urls:
              if url in all_blocked:
                  print(f"❌ BLOCKED: {url} - {all_blocked[url]}")
              elif url in unreachable:
                  print(f"⚠️ UNKNOWN (could not verify): {url} - {unreachable[url]}")
              else:
                  print(f"✅ Clean: {url}")

          if all_blocked:
              if os.getenv('GITHUB_EVENT_NAME') == 'schedule':
                  print(f"\n🚨 SECURITY ALERT: {len(all_blocked)} compromised URLs found!")
              else:
                  print(f"\n❌ SECURITY CHECK FAILED: {len(all_blocked)} violations detected!")
              for url, reason in all_blocked.items():
                  print(f"  - {url} - {reason}")
              sys.exit(1)

          print(f"\n✅ All {len(urls)} URLs passed security check")
          sys.exit(0)
          EOF

      - name: Revert Malicious Commit
        if: failure() && github.event_name == 'push'
        run: |
          echo "🔄 Reverting commit with inappropriate content..."
          git config --global user.name "Security Bot"
          git config --global user.email "security-bot@github.com"

          # Revert the latest commit
          git revert HEAD --no-edit

          # Push the revert commit (requires permissions: contents: write above)
          git push origin ${{ github.ref_name }}

          echo "✅ Malicious commit reverted successfully"

      - name: Notify Slack on Security Failure
        if: failure()
        run: |
          # Determine if this was a revert action
          if [[ "${{ github.event_name }}" == "push" ]]; then
            ACTION_TYPE="🔄 COMMIT REVERTED"
            MESSAGE="Inappropriate content was automatically reverted from the repository."
          else
            ACTION_TYPE="🚨 SECURITY ALERT"
            MESSAGE="Inappropriate URLs detected during scheduled scan."
          fi

          # BUGFIX: Slack incoming webhooks require a top-level "text" field;
          # the original sent "Content", which Slack rejects (invalid_payload).
          curl -X POST "${{ secrets.SLACK_WEBHOOK_URL }}" \
            -H "Content-Type: application/json" \
            --data "{
              \"text\": \"$ACTION_TYPE\\nRepository: ${{ github.repository }}\\nBranch: ${{ github.ref_name }}\\nCommit: ${{ github.sha }}\\n\\n$MESSAGE\\n\\nAction: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\"
            }"