Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 74 additions & 3 deletions images.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,86 @@

import os
import random
import re
import time
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup

# First and last listing page to process; the default covers 30 pages.
PAGE_START, PAGE_END = 1, 30

# Local directory path used by the crawler
# (presumably where downloads are stored -- confirm against usage).
DIR_PATH = "/path/to/1024"

# The site is served from many mirror domains and the active one changes often.
# Alternative mirror: http://t66y.com/
URL = "http://cl.comcl.org/"
# --- Automatic mirror discovery ---------------------------------------------
# The site's domain changes often, so a currently reachable mirror is looked up
# through a web search and verified before it is used.

# Search endpoint queried for candidate mirrors.
_SEARCH_URL = "https://www.google.com/search"
# Search terms tried in order until one of them yields a reachable mirror.
_SEARCH_TERMS = ("草榴社区", "1024 cl", "t66y")
# Host-name fragments that identify mirrors of the site.
_KNOWN_DOMAINS = ('t66y.com', 'cl.', '1024', 'caoliu')
# Mirror returned when the search finds nothing usable.
_FALLBACK_URL = "http://t66y.com/"
# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 5
# Browser-like user agent; the default client user agent tends to be blocked.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}


def _extract_target_url(href):
    """Return the absolute http(s) URL a search-result link points to, or None.

    Google wraps results as ``/url?q=<target>&...``; the wrapper is removed and
    the target is percent-decoded. Relative links, fragments and non-http
    schemes yield None.
    """
    if not href:
        return None
    try:
        parsed = urlparse(href)
        if parsed.path == '/url':
            targets = parse_qs(parsed.query).get('q')
            if targets:
                href = targets[0]
                parsed = urlparse(href)
        if parsed.scheme in ('http', 'https') and parsed.hostname:
            return href
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): not a usable result.
        pass
    return None


def _site_root_if_known(url, known_domains):
    """Return ``scheme://host/`` if the host of *url* matches a known domain.

    Matching is done on the host name only, never on the path or query string,
    so links that merely echo the search term (such as the search engine's own
    pages) are rejected. Returns None when the host does not match.
    """
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ''
    except ValueError:
        return None
    if not any(domain in host for domain in known_domains):
        return None
    return f"{parsed.scheme}://{parsed.netloc}/"


def _is_reachable(url):
    """Return True if a GET request to *url* answers with HTTP 200."""
    try:
        response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return False
    return response.status_code == 200


def find_working_url(search_terms=_SEARCH_TERMS, fallback_url=_FALLBACK_URL):
    """Find a reachable mirror of the site through a web search.

    Each search term is tried in order. Result links whose host matches one of
    the known domain fragments are reduced to their site root and probed; the
    first root that answers with HTTP 200 is returned.

    Args:
        search_terms: Iterable of query strings to try, in order.
        fallback_url: Value returned when no reachable mirror is found.

    Returns:
        The mirror's root URL, always ending in ``/`` so a relative page path
        can be appended directly, or *fallback_url* if nothing was found.
    """
    for term in search_terms:
        try:
            # Random delay between searches to reduce the chance of being blocked.
            time.sleep(random.uniform(1, 3))

            # ``params`` lets requests encode spaces and non-ASCII characters.
            response = requests.get(
                _SEARCH_URL,
                params={'q': term},
                headers=_HEADERS,
                timeout=_REQUEST_TIMEOUT,
            )
            soup = BeautifulSoup(response.text, 'html.parser')

            probed = set()  # site roots already checked for this term
            for link in soup.find_all('a'):
                target = _extract_target_url(link.get('href'))
                if target is None:
                    continue
                root = _site_root_if_known(target, _KNOWN_DOMAINS)
                if root is None or root in probed:
                    continue
                probed.add(root)
                # Probe the root itself: that is the URL callers will build on.
                if _is_reachable(root):
                    return root
        except Exception as e:
            # Best-effort boundary: a failed search must not abort the lookup,
            # so report it and move on to the next term.
            print(f"Error searching with term '{term}': {e}")

    # No reachable mirror was found through any search term.
    return fallback_url

# Resolve the base URL at import time: use the automatically discovered mirror
# when the lookup succeeds, otherwise fall back to a hard-coded mirror.
try:
    URL = find_working_url()
    print(f"Using automatically found URL: {URL}")
except Exception as exc:
    print(f"Error finding URL automatically: {exc!s}")
    # Hard-coded fallback mirror; http://t66y.com/ is a known alternative.
    URL = "http://cl.comcl.org/"

# Entry page for the crawl: the thread listing with fid=8 on the chosen mirror.
START_URL = f"{URL}thread0806.php?fid=8"

class Handler(BaseHandler):
Expand Down