@@ -66,78 +66,159 @@ def extract_links_from_markdown(content: str) -> List[Tuple[str, int]]:
6666 return links
6767
6868
def is_downloadable_file(url: str, content_type: str = None) -> bool:
    """
    Check whether a URL points to a downloadable file rather than an HTML page.

    The Content-Type header (when supplied) is authoritative: HTML is never
    a download, and well-known file MIME types always are.  Otherwise the
    URL's *path* extension is used as a fallback.

    Args:
        url: The URL to classify.
        content_type: Optional Content-Type header value from a response.

    Returns:
        True if the URL appears to reference a downloadable file.
    """
    if content_type:
        content_type_lower = content_type.lower()
        # An HTML response is a page, never a downloadable file.
        if 'text/html' in content_type_lower:
            return False
        # MIME types (or type prefixes) that clearly indicate a file download.
        file_type_indicators = (
            'application/pdf',
            'application/zip',
            'application/x-zip',
            'application/gzip',
            'application/x-gzip',
            'application/octet-stream',
            'image/',
            'video/',
            'audio/',
            'application/json',  # JSON files
            'text/plain',        # plain text (HTML already excluded above)
        )
        if any(indicator in content_type_lower for indicator in file_type_indicators):
            return True

    # Fallback: inspect the URL's path extension.  Parse out the path first
    # so query strings/fragments don't defeat the check ('file.pdf?dl=1')
    # or cause false positives ('page?name=.pdf' has path 'page').
    from urllib.parse import urlsplit
    path_lower = urlsplit(url).path.lower()
    # str.endswith accepts a tuple of suffixes, so one call covers them all.
    file_extensions = (
        '.pdf', '.zip', '.gz', '.tar', '.rar', '.7z',
        '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico',
        '.mp4', '.mp3', '.avi', '.mov', '.wmv',
        '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
        '.csv', '.tsv', '.json', '.xml',
        '.exe', '.dmg', '.deb', '.rpm',
    )
    return path_lower.endswith(file_extensions)
108+
109+
69110def check_url (url : str , timeout : int = 10 , allow_redirects : bool = True ) -> Tuple [bool , int , str ]:
70111 """
71112 Check if URL is accessible.
72113 Returns: (is_valid, status_code, error_message)
73- Uses GET request to check content for false positives (pages that return 200 but show 404).
114+ Uses HEAD request first to avoid downloading files, falls back to GET for HTML pages
115+ that may need content verification for false 404 detection.
74116 """
75117 try :
76118 headers = {'User-Agent' : 'Mozilla/5.0 (compatible; LinkChecker/1.0)' }
119+ MAX_BODY_SIZE = 8192 # Only read first 8KB for HTML content verification
77120
78121 if USE_REQUESTS :
79- # Use GET instead of HEAD to check page content for false positives
80- response = requests .get (
81- url ,
82- timeout = timeout ,
83- allow_redirects = allow_redirects ,
84- headers = headers
85- )
86- status_code = response .status_code
87-
88- # Check if page is actually valid (not a 404 page with 200 status)
89- is_valid = status_code < 400
90- if is_valid and status_code == 200 :
91- # Check for common 404 indicators in HTML content
92- content_lower = response .text .lower ()
122+ # Try HEAD request first (doesn't download body)
123+ try :
124+ head_response = requests .head (
125+ url ,
126+ timeout = timeout ,
127+ allow_redirects = allow_redirects ,
128+ headers = headers
129+ )
130+ status_code = head_response .status_code
131+ content_type = head_response .headers .get ('Content-Type' , '' )
132+
133+ # Check if page is valid
134+ is_valid = status_code < 400
93135
94- # Extract title tag content more reliably
95- title_match = None
96- title_pattern = r'<title[^>]*>(.*?)</title>'
97- title_matches = re .findall (title_pattern , content_lower , re .DOTALL | re .IGNORECASE )
98- if title_matches :
99- title_text = title_matches [0 ].strip ()
100- # Check for various 404 indicators in title
101- if any (phrase in title_text for phrase in [
102- 'page not found' ,
103- 'not found' ,
104- '404' ,
105- 'page does not exist' ,
106- 'couldn\' t find the page' ,
107- 'we couldn\' t find'
108- ]):
109- is_valid = False
110- error_message = f"200 (but page shows '{ title_matches [0 ].strip ()} ' in title)"
136+ # If it's a downloadable file, just use HEAD result (no need to download)
137+ if is_downloadable_file (url , content_type ):
138+ error_message = f"{ status_code } { head_response .reason } " if not is_valid else ""
139+ return (is_valid , status_code , error_message )
140+
141+ # For HTML pages with 200 status, verify it's not a false 404
142+ # by downloading a small portion
143+ if is_valid and status_code == 200 :
144+ # Use GET with stream=True to limit download size
145+ get_response = requests .get (
146+ url ,
147+ timeout = timeout ,
148+ allow_redirects = allow_redirects ,
149+ headers = headers ,
150+ stream = True
151+ )
152+ # Read only first MAX_BODY_SIZE bytes
153+ content_chunk = b''
154+ for chunk in get_response .iter_content (chunk_size = 1024 ):
155+ content_chunk += chunk
156+ if len (content_chunk ) >= MAX_BODY_SIZE :
157+ break
158+
159+ content_lower = content_chunk .decode ('utf-8' , errors = 'ignore' ).lower ()
160+
161+ # Extract title tag content
162+ title_pattern = r'<title[^>]*>(.*?)</title>'
163+ title_matches = re .findall (title_pattern , content_lower , re .DOTALL | re .IGNORECASE )
164+ if title_matches :
165+ title_text = title_matches [0 ].strip ()
166+ # Check for various 404 indicators in title
167+ if any (phrase in title_text for phrase in [
168+ 'page not found' ,
169+ 'not found' ,
170+ '404' ,
171+ 'page does not exist' ,
172+ 'couldn\' t find the page' ,
173+ 'we couldn\' t find'
174+ ]):
175+ is_valid = False
176+ error_message = f"200 (but page shows '{ title_matches [0 ].strip ()} ' in title)"
177+ else :
178+ error_message = ""
111179 else :
112- error_message = ""
180+ # No title tag found, check body content for 404 indicators
181+ if 'page not found' in content_lower [:MAX_BODY_SIZE ] or '404' in content_lower [:MAX_BODY_SIZE ]:
182+ is_valid = False
183+ error_message = "200 (but page content suggests 404)"
184+ else :
185+ error_message = ""
113186 else :
114- # No title tag found, check body content for 404 indicators
115- if 'page not found' in content_lower [:5000 ] or '404' in content_lower [:5000 ]:
116- is_valid = False
117- error_message = "200 (but page content suggests 404)"
118- else :
119- error_message = ""
120- else :
121- error_message = f"{ status_code } { response .reason } " if not is_valid else ""
122- else :
123- # Fallback to urllib - use GET to check content
124- request = urllib .request .Request (url , headers = headers )
125- try :
126- with urllib .request .urlopen (request , timeout = timeout ) as response :
127- status_code = response .status
187+ error_message = f"{ status_code } { head_response .reason } " if not is_valid else ""
188+
189+ except requests .exceptions .RequestException as e :
190+ # If HEAD fails (e.g., 405 Method Not Allowed), try GET with limited size
191+ try :
192+ get_response = requests .get (
193+ url ,
194+ timeout = timeout ,
195+ allow_redirects = allow_redirects ,
196+ headers = headers ,
197+ stream = True
198+ )
199+ status_code = get_response .status_code
200+ content_type = get_response .headers .get ('Content-Type' , '' )
201+
128202 is_valid = status_code < 400
129203
130- # For 200 status, read content to check for false positives
204+ # If it's a downloadable file, don't read body
205+ if is_downloadable_file (url , content_type ):
206+ error_message = f"{ status_code } { get_response .reason } " if not is_valid else ""
207+ return (is_valid , status_code , error_message )
208+
209+ # For HTML with 200, check content (already streaming, limit read)
131210 if is_valid and status_code == 200 :
132- content = response .read ().decode ('utf-8' , errors = 'ignore' ).lower ()
211+ content_chunk = b''
212+ for chunk in get_response .iter_content (chunk_size = 1024 ):
213+ content_chunk += chunk
214+ if len (content_chunk ) >= MAX_BODY_SIZE :
215+ break
133216
134- # Extract title tag content more reliably
135- title_match = None
217+ content_lower = content_chunk .decode ('utf-8' , errors = 'ignore' ).lower ()
136218 title_pattern = r'<title[^>]*>(.*?)</title>'
137- title_matches = re .findall (title_pattern , content , re .DOTALL | re .IGNORECASE )
219+ title_matches = re .findall (title_pattern , content_lower , re .DOTALL | re .IGNORECASE )
138220 if title_matches :
139221 title_text = title_matches [0 ].strip ()
140- # Check for various 404 indicators in title
141222 if any (phrase in title_text for phrase in [
142223 'page not found' ,
143224 'not found' ,
@@ -151,18 +232,110 @@ def check_url(url: str, timeout: int = 10, allow_redirects: bool = True) -> Tupl
151232 else :
152233 error_message = ""
153234 else :
154- # No title tag found, check body content for 404 indicators
155- if 'page not found' in content [:5000 ] or '404' in content [:5000 ]:
235+ if 'page not found' in content_lower [:MAX_BODY_SIZE ] or '404' in content_lower [:MAX_BODY_SIZE ]:
156236 is_valid = False
157237 error_message = "200 (but page content suggests 404)"
158238 else :
159239 error_message = ""
160240 else :
161- error_message = f"{ status_code } " if not is_valid else ""
162- except urllib .error .HTTPError as e :
163- status_code = e .code
164- is_valid = status_code < 400
165- error_message = f"{ status_code } { e .reason } "
241+ error_message = f"{ status_code } { get_response .reason } " if not is_valid else ""
242+ except Exception as inner_e :
243+ return (False , 0 , str (inner_e ))
244+ else :
245+ # Fallback to urllib
246+ # Try HEAD first
247+ try :
248+ head_request = urllib .request .Request (url , headers = headers , method = 'HEAD' )
249+ try :
250+ with urllib .request .urlopen (head_request , timeout = timeout ) as response :
251+ status_code = response .status
252+ content_type = response .headers .get ('Content-Type' , '' )
253+ is_valid = status_code < 400
254+
255+ # If downloadable file, just use HEAD result
256+ if is_downloadable_file (url , content_type ):
257+ error_message = f"{ status_code } " if not is_valid else ""
258+ return (is_valid , status_code , error_message )
259+
260+ # For HTML with 200, need to verify with GET (limited)
261+ if is_valid and status_code == 200 :
262+ # Use GET but read limited amount
263+ get_request = urllib .request .Request (url , headers = headers )
264+ with urllib .request .urlopen (get_request , timeout = timeout ) as get_response :
265+ content = get_response .read (MAX_BODY_SIZE ).decode ('utf-8' , errors = 'ignore' ).lower ()
266+ title_pattern = r'<title[^>]*>(.*?)</title>'
267+ title_matches = re .findall (title_pattern , content , re .DOTALL | re .IGNORECASE )
268+ if title_matches :
269+ title_text = title_matches [0 ].strip ()
270+ if any (phrase in title_text for phrase in [
271+ 'page not found' ,
272+ 'not found' ,
273+ '404' ,
274+ 'page does not exist' ,
275+ 'couldn\' t find the page' ,
276+ 'we couldn\' t find'
277+ ]):
278+ is_valid = False
279+ error_message = f"200 (but page shows '{ title_matches [0 ].strip ()} ' in title)"
280+ else :
281+ error_message = ""
282+ else :
283+ if 'page not found' in content [:MAX_BODY_SIZE ] or '404' in content [:MAX_BODY_SIZE ]:
284+ is_valid = False
285+ error_message = "200 (but page content suggests 404)"
286+ else :
287+ error_message = ""
288+ else :
289+ error_message = f"{ status_code } " if not is_valid else ""
290+ except urllib .error .HTTPError as e :
291+ status_code = e .code
292+ is_valid = status_code < 400
293+ error_message = f"{ status_code } { e .reason } "
294+ except (urllib .error .URLError , ValueError ) as e :
295+ # HEAD not supported or failed, try GET with limited read
296+ try :
297+ get_request = urllib .request .Request (url , headers = headers )
298+ with urllib .request .urlopen (get_request , timeout = timeout ) as response :
299+ status_code = response .status
300+ content_type = response .headers .get ('Content-Type' , '' )
301+ is_valid = status_code < 400
302+
303+ # If downloadable file, don't read more than headers
304+ if is_downloadable_file (url , content_type ):
305+ error_message = f"{ status_code } " if not is_valid else ""
306+ return (is_valid , status_code , error_message )
307+
308+ # For HTML with 200, read limited content
309+ if is_valid and status_code == 200 :
310+ content = response .read (MAX_BODY_SIZE ).decode ('utf-8' , errors = 'ignore' ).lower ()
311+ title_pattern = r'<title[^>]*>(.*?)</title>'
312+ title_matches = re .findall (title_pattern , content , re .DOTALL | re .IGNORECASE )
313+ if title_matches :
314+ title_text = title_matches [0 ].strip ()
315+ if any (phrase in title_text for phrase in [
316+ 'page not found' ,
317+ 'not found' ,
318+ '404' ,
319+ 'page does not exist' ,
320+ 'couldn\' t find the page' ,
321+ 'we couldn\' t find'
322+ ]):
323+ is_valid = False
324+ error_message = f"200 (but page shows '{ title_matches [0 ].strip ()} ' in title)"
325+ else :
326+ error_message = ""
327+ else :
328+ if 'page not found' in content [:MAX_BODY_SIZE ] or '404' in content [:MAX_BODY_SIZE ]:
329+ is_valid = False
330+ error_message = "200 (but page content suggests 404)"
331+ else :
332+ error_message = ""
333+ else :
334+ error_message = f"{ status_code } " if not is_valid else ""
335+ except urllib .error .HTTPError as e :
336+ status_code = e .code
337+ is_valid = status_code < 400
338+ error_message = f"{ status_code } { e .reason } "
166339
167340 return (is_valid , status_code , error_message )
168341 except Exception as e :
0 commit comments