Changes from all commits (45 commits)
f747957 - Add URL security scanning to prevent inappropriate content (Sep 15, 2025)
7e83657 - Remove incorrectly committed hugo theme files (should be submodule) (Sep 15, 2025)
f39c262 - Test: Add file with suspicious URL (Sep 15, 2025)
8c8288c - Test: Add real problematic URL (Sep 15, 2025)
4010d9d - Enhance NSFW keyword detection (Sep 15, 2025)
d03fb6b - Use contextual patterns for 'adult' to reduce false positives (Sep 15, 2025)
65fcb82 - Test: Legitimate adult education URL (Sep 15, 2025)
941f2bf - Merge branch 'aws-samples:main' into main (Pjv93, Sep 15, 2025)
25fb6fe - Add monthly URL security scanning for ongoing protection (Sep 15, 2025)
98b56da - Merge branch 'main' of github.com:Pjv93/aws-modernization-workshop-base (Sep 15, 2025)
5addfe0 - Combine URL security workflows into single file with cron and commit … (Sep 15, 2025)
7dd582b - Add Slack notifications for security failures (Sep 15, 2025)
10afee1 - Update Slack channel to #apn-mod-workshop-security (Sep 15, 2025)
ffe92ef - Update Slack channel to #apn-modernization-workshop-security (Sep 15, 2025)
91d8793 - Update to use Slack Workflow webhook format (Sep 15, 2025)
6c5ad08 - Test: Trigger security alert with Slack notification (Sep 15, 2025)
3bf804b - Remove test files - clean up repository for production use (Sep 15, 2025)
57b4b30 - Add automatic commit reverting for inappropriate content (Sep 15, 2025)
b55b721 - Test: This should trigger auto-revert (Sep 15, 2025)
bd78476 - Test: Real trigger for auto-revert (Sep 15, 2025)
7623ea0 - Test: Normal commit should work now (Sep 15, 2025)
4faf14f - Clean up test files before PR update (Sep 15, 2025)
d5ac16a - Enhance URL detection to catch URLs without protocols (Sep 15, 2025)
d2100a2 - Integrate Google Safe Browsing API for enhanced security (Sep 15, 2025)
89dc8f1 - Test: Clean URLs should pass security check (Sep 15, 2025)
046dc14 - Test: Google Safe Browsing should block malware URL (Sep 15, 2025)
3f80368 - Test: Content analysis should detect explicit keyword (Sep 15, 2025)
1e8cc44 - Test: URL with explicit content should be blocked (Sep 15, 2025)
6d86ffb - Fix git revert command syntax (Sep 15, 2025)
ccd32de - Test: Auto-revert should work now (Sep 15, 2025)
8ba17f2 - Add URL pattern checking for immediate blocking (Sep 15, 2025)
4cec33d - Test: URL pattern should block porn keyword in URL (Sep 15, 2025)
0597013 - Fix git revert with single-line commit message (Sep 15, 2025)
5050866 - Clean up test files - security system fully tested and working (Sep 15, 2025)
1643037 - Test: Real URL should be analyzed properly (Sep 15, 2025)
0816138 - Remove eventbox test - URL correctly passed security checks (Sep 15, 2025)
2319956 - Add comprehensive debugging and enhanced content analysis (Sep 15, 2025)
076ee33 - Debug test: Show what content eventbox URL returns (Sep 15, 2025)
a82e9df - Fix auto-revert with simple commit message format (Sep 15, 2025)
70e5db0 - Test: Auto-revert should work now with fixed git command (Sep 15, 2025)
71c893e - Simplify git revert to use default message (Sep 15, 2025)
0b02451 - Final test: Auto-revert should work with simplified git command (Sep 15, 2025)
0a20738 - Revert "Final test: Auto-revert should work with simplified git command" (Sep 15, 2025)
ab41ed7 - Delete debug-eventbox-test.md (Pjv93, Oct 1, 2025)
d9b7a9f - Delete test-auto-revert-fix.md (Pjv93, Oct 1, 2025)
329 changes: 329 additions & 0 deletions .github/workflows/url-security-check.yml
@@ -0,0 +1,329 @@
name: URL Security Check

on:
  pull_request:
  push:
    branches: [ main, master ]
  schedule:
    # Run monthly on the 1st at 2 AM UTC
    - cron: '0 2 1 * *'
  workflow_dispatch: # Allow manual trigger

jobs:
  url-security-scan:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Check URLs for inappropriate content
        run: |
          python3 << 'EOF'
          import re, subprocess, sys, requests, os, json
          from concurrent.futures import ThreadPoolExecutor

          def check_urls_with_google_safe_browsing(urls_batch):
              """Check URLs using Google Safe Browsing API"""
              try:
                  api_key = os.getenv('GOOGLE_SAFE_BROWSING_API_KEY')
                  if not api_key:
                      print("⚠️ Google Safe Browsing API key not found, skipping external check")
                      return {}

                  api_url = f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={api_key}"

                  payload = {
                      "client": {
                          "clientId": "aws-modernization-security",
                          "clientVersion": "1.0"
                      },
                      "threatInfo": {
                          "threatTypes": [
                              "MALWARE",
                              "SOCIAL_ENGINEERING",
                              "UNWANTED_SOFTWARE",
                              "POTENTIALLY_HARMFUL_APPLICATION"
                          ],
                          "platformTypes": ["ANY_PLATFORM"],
                          "threatEntryTypes": ["URL"],
                          "threatEntries": [{"url": url} for url in urls_batch]
                      }
                  }

                  response = requests.post(api_url, json=payload, timeout=10)

                  if response.status_code == 200:
                      result = response.json()
                      blocked_urls = {}

                      if 'matches' in result:
                          for match in result['matches']:
                              url = match['threat']['url']
                              threat_type = match['threatType']
                              blocked_urls[url] = f"Google Safe Browsing: {threat_type}"

                      return blocked_urls
                  else:
                      print(f"⚠️ Google Safe Browsing API error: {response.status_code}")
                      return {}

              except Exception as e:
                  print(f"⚠️ Google Safe Browsing check failed: {str(e)}")
                  return {}

          def extract_urls_from_file(file_path):
              """Extract URLs with and without protocols from any file"""
              try:
                  with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                      content = f.read()
                  # Delegate to the shared extraction helper rather than
                  # duplicating the same three regex passes here.
                  return extract_urls_from_content(content)
              except OSError:
                  return []

          def extract_urls_from_diff():
              """Extract URLs from git diff (commit changes only)"""
              try:
                  result = subprocess.run(['git', 'diff', 'HEAD~1', 'HEAD'], capture_output=True, text=True)
                  urls = set()
                  for line in result.stdout.split('\n'):
                      # Only scan added lines, skipping the '+++' file header
                      if line.startswith('+') and not line.startswith('+++'):
                          urls.update(extract_urls_from_content(line))
                  return list(urls)
              except Exception:
                  return []

          def extract_urls_from_content(content):
              """Helper to extract URLs from a single content string"""
              urls = set()

              # Standard HTTP/HTTPS URLs
              http_urls = re.findall(r'https?://[^\s<>"\'`\)]+', content)
              urls.update(http_urls)

              # Protocol-relative URLs (//example.com)
              protocol_relative = re.findall(r'//[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}[^\s<>"\'`\)]*', content)
              urls.update(['https:' + url for url in protocol_relative])

              # Domain-only URLs (example.com, www.example.com)
              domain_pattern = r'\b(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?(?:/[^\s<>"\'`\)]*)?'
              potential_domains = re.findall(domain_pattern, content)

              # Filter out common false positives
              for domain in potential_domains:
                  if not any(skip in domain.lower() for skip in [
                      'localhost', '127.0.0.1', 'example.com', 'test.com',
                      '.js', '.css', '.json', '.xml', '.py', '.java',
                      'version.', 'config.', 'package.', 'github.com'
                  ]):
                      urls.add('https://' + domain)

              return list(urls)

          def find_all_urls():
              """Scan all files for URLs (monthly scan)"""
              all_urls = set()
              for root, dirs, files in os.walk('.'):
                  # Skip common directories
                  dirs[:] = [d for d in dirs if d not in ['.git', 'node_modules', '__pycache__', '.venv']]

                  for file in files:
                      # Skip binary files
                      if file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.pdf',
                                        '.zip', '.tar', '.gz', '.exe', '.bin', '.dll')):
                          continue

                      file_path = os.path.join(root, file)
                      all_urls.update(extract_urls_from_file(file_path))

              return list(all_urls)

          def check_url_content(url):
              """Check URL content for inappropriate material"""
              try:
                  if not url.startswith(('http://', 'https://')):
                      url = 'https://' + url

                  print(f" πŸ” Checking URL: {url}")

                  # First check the URL itself for inappropriate keywords
                  url_lower = url.lower()
                  inappropriate_keywords = [
                      'porn', 'xxx', 'sex', 'nude', 'erotic', 'nsfw', '18+', 'explicit',
                      'hardcore', 'webcam', 'escort', 'fetish', 'adult-content',
                      'cam-girl', 'live-sex', 'free-porn', 'hot-girls', 'naked'
                  ]

                  for keyword in inappropriate_keywords:
                      if keyword in url_lower:
                          # Skip educational contexts in the URL
                          if any(edu in url_lower for edu in [
                              'adult-education', 'adult-learning', 'continuing-education',
                              'sex-education', 'sexual-health', 'medical', 'academic'
                          ]):
                              continue
                          print(f" ❌ URL contains inappropriate keyword: {keyword}")
                          return True, f"inappropriate URL pattern: {keyword}"

                  print(" βœ… URL pattern clean, fetching content...")

                  # Then check the actual content if the URL is clean
                  response = requests.get(url, timeout=15, allow_redirects=True,
                                          headers={'User-Agent': 'Mozilla/5.0 (compatible; SecurityBot/1.0)'})

                  print(f" πŸ“Š Response status: {response.status_code}")
                  print(f" πŸ“ Content length: {len(response.text)} characters")

                  # Analyze up to 50KB of the page (instead of 5KB)
                  content = response.text[:50000].lower()
                  full_url = response.url.lower()

                  # Get page title
                  title = ""
                  title_match = re.search(r'<title[^>]*>([^<]+)</title>', content)
                  if title_match:
                      title = title_match.group(1).lower()
                      print(f" πŸ“„ Page title: {title}")

                  # Show first 500 chars of content for debugging
                  content_preview = content[:500].replace('\n', ' ').replace('\r', ' ')
                  print(f" πŸ“ Content preview: {content_preview}...")

                  full_analysis = f"{full_url} {title} {content}"

                  # Check content for inappropriate material
                  for keyword in inappropriate_keywords:
                      if keyword in full_analysis:
                          # Skip educational contexts
                          if any(edu in full_analysis for edu in [
                              'adult education', 'adult learning', 'continuing education',
                              'sex education', 'sexual health', 'medical', 'academic'
                          ]):
                              print(f" ℹ️ Found '{keyword}' but in educational context, allowing")
                              continue
                          print(f" ❌ Found inappropriate content: {keyword}")
                          return True, f"inappropriate content: {keyword}"

                  print(" βœ… Content analysis complete - no violations found")
                  return False, None

              except Exception as e:
                  print(f" ⚠️ Error checking {url}: {str(e)}")
                  print(f" ⚠️ Could not verify URL; recording access failure")
                  # Note: the caller only blocks on is_blocked=True, so an
                  # unreachable URL is not treated as a violation; the failure
                  # reason is returned for logging only.
                  return False, f"access_failed: {str(e)}"

          # Determine scan type based on trigger
          if os.getenv('GITHUB_EVENT_NAME') == 'schedule':
              urls = find_all_urls()
              scan_type = "Monthly full repository scan"
          else:
              urls = extract_urls_from_diff()
              scan_type = "Commit diff scan"

          if not urls:
              print(f"βœ… {scan_type}: No URLs found")
              sys.exit(0)

          print(f"πŸ” {scan_type}: Found {len(urls)} URLs to check...")

          # Step 1: Google Safe Browsing check (fast, batch)
          print("πŸ›‘οΈ Checking URLs with Google Safe Browsing API...")
          google_blocked = check_urls_with_google_safe_browsing(urls)

          # Step 2: Content analysis for remaining URLs
          remaining_urls = [url for url in urls if url not in google_blocked]
          print(f"πŸ” Analyzing content for {len(remaining_urls)} URLs...")

          content_blocked = {}
          if remaining_urls:
              with ThreadPoolExecutor(max_workers=10) as executor:
                  results = list(executor.map(check_url_content, remaining_urls))

              for url, (is_blocked, reason) in zip(remaining_urls, results):
                  if is_blocked:
                      content_blocked[url] = reason

          # Combine all blocked URLs
          all_blocked = {**google_blocked, **content_blocked}

          # Report results
          for url in urls:
              if url in all_blocked:
                  print(f"❌ BLOCKED: {url} - {all_blocked[url]}")
              else:
                  print(f"βœ… Clean: {url}")

          if all_blocked:
              if os.getenv('GITHUB_EVENT_NAME') == 'schedule':
                  print(f"\n🚨 SECURITY ALERT: {len(all_blocked)} compromised URLs found!")
              else:
                  print(f"\n❌ SECURITY CHECK FAILED: {len(all_blocked)} violations detected!")

              for url, reason in all_blocked.items():
                  print(f" - {url} - {reason}")
              sys.exit(1)
          else:
              print(f"\nβœ… All {len(urls)} URLs passed security check")
              sys.exit(0)
          EOF

      - name: Revert Malicious Commit
        if: failure() && github.event_name == 'push'
        run: |
          echo "πŸ”„ Reverting commit with inappropriate content..."
          git config --global user.name "Security Bot"
          git config --global user.email "security-bot@github.com"

          # Revert the latest commit
          git revert HEAD --no-edit

          # Push the revert commit (checkout leaves HEAD detached on push
          # events, so push by refspec rather than by local branch name)
          git push origin HEAD:${{ github.ref_name }}

          echo "βœ… Malicious commit reverted successfully"
      - name: Notify Slack on Security Failure
        if: failure()
        run: |
          # Determine if this was a revert action
          if [[ "${{ github.event_name }}" == "push" ]]; then
            ACTION_TYPE="πŸ”„ COMMIT REVERTED"
            MESSAGE="Inappropriate content was automatically reverted from the repository."
          else
            ACTION_TYPE="🚨 SECURITY ALERT"
            MESSAGE="Inappropriate URLs detected during scheduled scan."
          fi

          curl -X POST "${{ secrets.SLACK_WEBHOOK_URL }}" \
            -H "Content-Type: application/json" \
            --data "{
              \"Content\": \"$ACTION_TYPE\\nRepository: ${{ github.repository }}\\nBranch: ${{ github.ref_name }}\\nCommit: ${{ github.sha }}\\n\\n$MESSAGE\\n\\nAction: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\"
            }"
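
The extraction logic in the workflow's `extract_urls_from_content` can be exercised outside CI. The sketch below is a minimal standalone reproduction of its three regex passes and false-positive filter (the regexes and skip list are copied from the workflow script; the function name `extract_urls` and the sample text are illustrative, not part of the repository):

```python
import re

# Skip list copied from the workflow's false-positive filter.
SKIP_SUBSTRINGS = [
    'localhost', '127.0.0.1', 'example.com', 'test.com',
    '.js', '.css', '.json', '.xml', '.py', '.java',
    'version.', 'config.', 'package.', 'github.com'
]

def extract_urls(content):
    urls = set()
    # 1. Standard HTTP/HTTPS URLs
    urls.update(re.findall(r'https?://[^\s<>"\'`\)]+', content))
    # 2. Protocol-relative URLs (//host.tld/...), assumed to be https
    for url in re.findall(r'//[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}[^\s<>"\'`\)]*', content):
        urls.add('https:' + url)
    # 3. Bare domains (host.tld or www.host.tld), minus common false positives
    pattern = r'\b(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?(?:/[^\s<>"\'`\)]*)?'
    for domain in re.findall(pattern, content):
        if not any(skip in domain.lower() for skip in SKIP_SUBSTRINGS):
            urls.add('https://' + domain)
    return sorted(urls)

sample = "See https://aws.amazon.com/modernization and docs at readthedocs.io, plus config.yaml"
print(extract_urls(sample))
# -> ['https://aws.amazon.com/modernization', 'https://readthedocs.io']
```

Note that a full `https://` URL matches all three passes; the set deduplicates the results, which is why the workflow collects into a `set` before returning a list. The skip list filters `config.yaml` here, illustrating how filenames that look like domains are excluded.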