Commit 5af64e1

Do not download files when checking links
1 parent 9a5cb64 commit 5af64e1
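
In outline: the checker now sends a HEAD request first and trusts the response headers for anything that is not HTML, so release tarballs and other binaries are no longer downloaded just to verify a link. Only HTML responses fall back to a streamed GET capped at 8 KB, enough to screen for soft-404 pages. A condensed sketch of the idea (illustrative only; `head_first_check` is not a function in this repo — the real implementation is the `utils/check_links.py` diff below):

```python
import requests

def head_first_check(url: str, timeout: int = 10) -> bool:
    """Condensed sketch of the new strategy, not the script's actual code."""
    resp = requests.head(url, timeout=timeout, allow_redirects=True)
    if resp.status_code >= 400:
        return False
    if 'text/html' not in resp.headers.get('Content-Type', ''):
        # Not HTML: a file download, so the headers already settle it.
        return True
    # HTML may be a soft 404 (200 status but a "not found" page):
    # peek at the first 8 KB instead of downloading the whole body.
    with requests.get(url, timeout=timeout, stream=True) as page:
        first_chunk = next(page.iter_content(chunk_size=8192), b'')
    return b'page not found' not in first_chunk.lower()
```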

File tree

3 files changed: +235 −80 lines changed

.agents/PINECONE-cli.md

Lines changed: 2 additions & 3 deletions
````diff
@@ -234,7 +234,7 @@ jobs:
 
       - name: Install Pinecone CLI
         run: |
-          curl -L https://github.com/pinecone-io/cli/releases/latest/download/pinecone_linux_amd64.tar.gz | tar xz
+          curl -L https://github.com/pinecone-io/cli/releases/latest/download/pc_Linux_x86_64.tar.gz | tar xz
           sudo mv pinecone /usr/local/bin/
 
       - name: Create Production Index
@@ -253,7 +253,7 @@
 # Install Pinecone CLI in Docker image
 FROM alpine:latest
 RUN apk add --no-cache curl
-RUN curl -L https://github.com/pinecone-io/cli/releases/latest/download/pinecone_linux_amd64.tar.gz | tar xz -C /usr/local/bin/
+RUN curl -L https://github.com/pinecone-io/cli/releases/latest/download/pc_Linux_x86_64.tar.gz | tar xz -C /usr/local/bin/
 COPY deploy-index.sh /scripts/
 RUN chmod +x /scripts/deploy-index.sh
 ```
@@ -289,4 +289,3 @@ pc index delete --name old-index
 - **Official CLI Documentation**: [https://docs.pinecone.io/reference/cli/command-reference](https://docs.pinecone.io/reference/cli/command-reference)
 - **CLI GitHub Repository**: [https://github.com/pinecone-io/cli](https://github.com/pinecone-io/cli)
 - **CLI Releases**: [https://github.com/pinecone-io/cli/releases](https://github.com/pinecone-io/cli/releases)
-- **CLI Examples**: [https://docs.pinecone.io/guides/cli](https://docs.pinecone.io/guides/cli)
````
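
A change like this can be sanity-checked without downloading the tarball, using the same HEAD-request idea the commit applies to `check_links.py`. A minimal sketch, assuming the `requests` package; the asset name is the one introduced by the diff above:

```python
import requests

# Verify the release asset exists without downloading the tarball itself.
url = ('https://github.com/pinecone-io/cli/releases/latest/download/'
       'pc_Linux_x86_64.tar.gz')
resp = requests.head(url, timeout=10, allow_redirects=True)  # follows the 302 to the asset
print(resp.status_code)  # 200 if the asset exists, 404 if the name is wrong
```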

utils/README_check_links.md

Lines changed: 0 additions & 17 deletions
```diff
@@ -4,23 +4,6 @@
 
 **Created**: `check_links.py` - A Python script to extract and verify external links in markdown files
 
-### Link Check Results
-
-**Total External Links Found**: 22 unique URLs
-
-**Broken Links Found**: 1 ❌
-
-### Broken Link Details
-
-- **File**: `.agents/AGENTS-cli.md`
-- **Line**: 233
-- **URL**: `https://github.com/pinecone-io/cli/releases/latest/download/pinecone_linux_amd64.tar.gz`
-- **Error**: 404 Not Found
-
-**Issue**: This specific download URL format (`/latest/download/`) appears to be invalid. GitHub releases typically use format like `/releases/download/v1.0.0/filename`.
-
-**Suggested Fix**: Replace with direct link to releases page or use Homebrew installation method shown elsewhere in the document.
-
 ## Usage
 
 ### Extract Links Only (No Checking)
```

utils/check_links.py

Lines changed: 233 additions & 60 deletions
```diff
@@ -66,78 +66,159 @@ def extract_links_from_markdown(content: str) -> List[Tuple[str, int]]:
     return links
 
 
+def is_downloadable_file(url: str, content_type: str = None) -> bool:
+    """
+    Check if URL points to a downloadable file (not HTML).
+    Checks both URL extension and Content-Type header.
+    """
+    # Check Content-Type header first if available
+    if content_type:
+        content_type_lower = content_type.lower()
+        # If it's clearly HTML, it's not a downloadable file
+        if 'text/html' in content_type_lower:
+            return False
+        # If it's a known file type, it's downloadable
+        file_type_indicators = [
+            'application/pdf',
+            'application/zip',
+            'application/x-zip',
+            'application/gzip',
+            'application/x-gzip',
+            'application/octet-stream',
+            'image/',
+            'video/',
+            'audio/',
+            'application/json',  # JSON files
+            'text/plain',  # Plain text files (but not HTML)
+        ]
+        if any(indicator in content_type_lower for indicator in file_type_indicators):
+            return True
+
+    # Check URL extension as fallback
+    url_lower = url.lower()
+    file_extensions = [
+        '.pdf', '.zip', '.gz', '.tar', '.rar', '.7z',
+        '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico',
+        '.mp4', '.mp3', '.avi', '.mov', '.wmv',
+        '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
+        '.csv', '.tsv', '.json', '.xml',
+        '.exe', '.dmg', '.deb', '.rpm',
+    ]
+    return any(url_lower.endswith(ext) for ext in file_extensions)
+
+
 def check_url(url: str, timeout: int = 10, allow_redirects: bool = True) -> Tuple[bool, int, str]:
     """
     Check if URL is accessible.
     Returns: (is_valid, status_code, error_message)
-    Uses GET request to check content for false positives (pages that return 200 but show 404).
+    Uses HEAD request first to avoid downloading files, falls back to GET for HTML pages
+    that may need content verification for false 404 detection.
     """
     try:
         headers = {'User-Agent': 'Mozilla/5.0 (compatible; LinkChecker/1.0)'}
+        MAX_BODY_SIZE = 8192  # Only read first 8KB for HTML content verification
 
         if USE_REQUESTS:
-            # Use GET instead of HEAD to check page content for false positives
-            response = requests.get(
-                url,
-                timeout=timeout,
-                allow_redirects=allow_redirects,
-                headers=headers
-            )
-            status_code = response.status_code
-
-            # Check if page is actually valid (not a 404 page with 200 status)
-            is_valid = status_code < 400
-            if is_valid and status_code == 200:
-                # Check for common 404 indicators in HTML content
-                content_lower = response.text.lower()
+            # Try HEAD request first (doesn't download body)
+            try:
+                head_response = requests.head(
+                    url,
+                    timeout=timeout,
+                    allow_redirects=allow_redirects,
+                    headers=headers
+                )
+                status_code = head_response.status_code
+                content_type = head_response.headers.get('Content-Type', '')
+
+                # Check if page is valid
+                is_valid = status_code < 400
 
-                # Extract title tag content more reliably
-                title_match = None
-                title_pattern = r'<title[^>]*>(.*?)</title>'
-                title_matches = re.findall(title_pattern, content_lower, re.DOTALL | re.IGNORECASE)
-                if title_matches:
-                    title_text = title_matches[0].strip()
-                    # Check for various 404 indicators in title
-                    if any(phrase in title_text for phrase in [
-                        'page not found',
-                        'not found',
-                        '404',
-                        'page does not exist',
-                        'couldn\'t find the page',
-                        'we couldn\'t find'
-                    ]):
-                        is_valid = False
-                        error_message = f"200 (but page shows '{title_matches[0].strip()}' in title)"
+                # If it's a downloadable file, just use HEAD result (no need to download)
+                if is_downloadable_file(url, content_type):
+                    error_message = f"{status_code} {head_response.reason}" if not is_valid else ""
+                    return (is_valid, status_code, error_message)
+
+                # For HTML pages with 200 status, verify it's not a false 404
+                # by downloading a small portion
+                if is_valid and status_code == 200:
+                    # Use GET with stream=True to limit download size
+                    get_response = requests.get(
+                        url,
+                        timeout=timeout,
+                        allow_redirects=allow_redirects,
+                        headers=headers,
+                        stream=True
+                    )
+                    # Read only first MAX_BODY_SIZE bytes
+                    content_chunk = b''
+                    for chunk in get_response.iter_content(chunk_size=1024):
+                        content_chunk += chunk
+                        if len(content_chunk) >= MAX_BODY_SIZE:
+                            break
+
+                    content_lower = content_chunk.decode('utf-8', errors='ignore').lower()
+
+                    # Extract title tag content
+                    title_pattern = r'<title[^>]*>(.*?)</title>'
+                    title_matches = re.findall(title_pattern, content_lower, re.DOTALL | re.IGNORECASE)
+                    if title_matches:
+                        title_text = title_matches[0].strip()
+                        # Check for various 404 indicators in title
+                        if any(phrase in title_text for phrase in [
+                            'page not found',
+                            'not found',
+                            '404',
+                            'page does not exist',
+                            'couldn\'t find the page',
+                            'we couldn\'t find'
+                        ]):
+                            is_valid = False
+                            error_message = f"200 (but page shows '{title_matches[0].strip()}' in title)"
+                        else:
+                            error_message = ""
                     else:
-                        error_message = ""
+                        # No title tag found, check body content for 404 indicators
+                        if 'page not found' in content_lower[:MAX_BODY_SIZE] or '404' in content_lower[:MAX_BODY_SIZE]:
+                            is_valid = False
+                            error_message = "200 (but page content suggests 404)"
+                        else:
+                            error_message = ""
                 else:
-                    # No title tag found, check body content for 404 indicators
-                    if 'page not found' in content_lower[:5000] or '404' in content_lower[:5000]:
-                        is_valid = False
-                        error_message = "200 (but page content suggests 404)"
-                    else:
-                        error_message = ""
-            else:
-                error_message = f"{status_code} {response.reason}" if not is_valid else ""
-        else:
-            # Fallback to urllib - use GET to check content
-            request = urllib.request.Request(url, headers=headers)
-            try:
-                with urllib.request.urlopen(request, timeout=timeout) as response:
-                    status_code = response.status
+                    error_message = f"{status_code} {head_response.reason}" if not is_valid else ""
+
+            except requests.exceptions.RequestException as e:
+                # If HEAD fails (e.g., 405 Method Not Allowed), try GET with limited size
+                try:
+                    get_response = requests.get(
+                        url,
+                        timeout=timeout,
+                        allow_redirects=allow_redirects,
+                        headers=headers,
+                        stream=True
+                    )
+                    status_code = get_response.status_code
+                    content_type = get_response.headers.get('Content-Type', '')
+
                     is_valid = status_code < 400
 
-                    # For 200 status, read content to check for false positives
+                    # If it's a downloadable file, don't read body
+                    if is_downloadable_file(url, content_type):
+                        error_message = f"{status_code} {get_response.reason}" if not is_valid else ""
+                        return (is_valid, status_code, error_message)
+
+                    # For HTML with 200, check content (already streaming, limit read)
                     if is_valid and status_code == 200:
-                        content = response.read().decode('utf-8', errors='ignore').lower()
+                        content_chunk = b''
+                        for chunk in get_response.iter_content(chunk_size=1024):
+                            content_chunk += chunk
+                            if len(content_chunk) >= MAX_BODY_SIZE:
+                                break
 
-                        # Extract title tag content more reliably
-                        title_match = None
+                        content_lower = content_chunk.decode('utf-8', errors='ignore').lower()
                         title_pattern = r'<title[^>]*>(.*?)</title>'
-                        title_matches = re.findall(title_pattern, content, re.DOTALL | re.IGNORECASE)
+                        title_matches = re.findall(title_pattern, content_lower, re.DOTALL | re.IGNORECASE)
                         if title_matches:
                             title_text = title_matches[0].strip()
-                            # Check for various 404 indicators in title
                             if any(phrase in title_text for phrase in [
                                 'page not found',
                                 'not found',
@@ -151,18 +232,110 @@ def check_url(url: str, timeout: int = 10, allow_redirects: bool = True) -> Tuple[bool, int, str]:
                             else:
                                 error_message = ""
                         else:
-                            # No title tag found, check body content for 404 indicators
-                            if 'page not found' in content[:5000] or '404' in content[:5000]:
+                            if 'page not found' in content_lower[:MAX_BODY_SIZE] or '404' in content_lower[:MAX_BODY_SIZE]:
                                 is_valid = False
                                 error_message = "200 (but page content suggests 404)"
                             else:
                                 error_message = ""
                     else:
-                        error_message = f"{status_code}" if not is_valid else ""
-            except urllib.error.HTTPError as e:
-                status_code = e.code
-                is_valid = status_code < 400
-                error_message = f"{status_code} {e.reason}"
+                        error_message = f"{status_code} {get_response.reason}" if not is_valid else ""
+                except Exception as inner_e:
+                    return (False, 0, str(inner_e))
+        else:
+            # Fallback to urllib
+            # Try HEAD first
+            try:
+                head_request = urllib.request.Request(url, headers=headers, method='HEAD')
+                try:
+                    with urllib.request.urlopen(head_request, timeout=timeout) as response:
+                        status_code = response.status
+                        content_type = response.headers.get('Content-Type', '')
+                        is_valid = status_code < 400
+
+                        # If downloadable file, just use HEAD result
+                        if is_downloadable_file(url, content_type):
+                            error_message = f"{status_code}" if not is_valid else ""
+                            return (is_valid, status_code, error_message)
+
+                        # For HTML with 200, need to verify with GET (limited)
+                        if is_valid and status_code == 200:
+                            # Use GET but read limited amount
+                            get_request = urllib.request.Request(url, headers=headers)
+                            with urllib.request.urlopen(get_request, timeout=timeout) as get_response:
+                                content = get_response.read(MAX_BODY_SIZE).decode('utf-8', errors='ignore').lower()
+                                title_pattern = r'<title[^>]*>(.*?)</title>'
+                                title_matches = re.findall(title_pattern, content, re.DOTALL | re.IGNORECASE)
+                                if title_matches:
+                                    title_text = title_matches[0].strip()
+                                    if any(phrase in title_text for phrase in [
+                                        'page not found',
+                                        'not found',
+                                        '404',
+                                        'page does not exist',
+                                        'couldn\'t find the page',
+                                        'we couldn\'t find'
+                                    ]):
+                                        is_valid = False
+                                        error_message = f"200 (but page shows '{title_matches[0].strip()}' in title)"
+                                    else:
+                                        error_message = ""
+                                else:
+                                    if 'page not found' in content[:MAX_BODY_SIZE] or '404' in content[:MAX_BODY_SIZE]:
+                                        is_valid = False
+                                        error_message = "200 (but page content suggests 404)"
+                                    else:
+                                        error_message = ""
+                        else:
+                            error_message = f"{status_code}" if not is_valid else ""
+                except urllib.error.HTTPError as e:
+                    status_code = e.code
+                    is_valid = status_code < 400
+                    error_message = f"{status_code} {e.reason}"
+            except (urllib.error.URLError, ValueError) as e:
+                # HEAD not supported or failed, try GET with limited read
+                try:
+                    get_request = urllib.request.Request(url, headers=headers)
+                    with urllib.request.urlopen(get_request, timeout=timeout) as response:
+                        status_code = response.status
+                        content_type = response.headers.get('Content-Type', '')
+                        is_valid = status_code < 400
+
+                        # If downloadable file, don't read more than headers
+                        if is_downloadable_file(url, content_type):
+                            error_message = f"{status_code}" if not is_valid else ""
+                            return (is_valid, status_code, error_message)
+
+                        # For HTML with 200, read limited content
+                        if is_valid and status_code == 200:
+                            content = response.read(MAX_BODY_SIZE).decode('utf-8', errors='ignore').lower()
+                            title_pattern = r'<title[^>]*>(.*?)</title>'
+                            title_matches = re.findall(title_pattern, content, re.DOTALL | re.IGNORECASE)
+                            if title_matches:
+                                title_text = title_matches[0].strip()
+                                if any(phrase in title_text for phrase in [
+                                    'page not found',
+                                    'not found',
+                                    '404',
+                                    'page does not exist',
+                                    'couldn\'t find the page',
+                                    'we couldn\'t find'
+                                ]):
+                                    is_valid = False
+                                    error_message = f"200 (but page shows '{title_matches[0].strip()}' in title)"
+                                else:
+                                    error_message = ""
+                            else:
+                                if 'page not found' in content[:MAX_BODY_SIZE] or '404' in content[:MAX_BODY_SIZE]:
+                                    is_valid = False
+                                    error_message = "200 (but page content suggests 404)"
+                                else:
+                                    error_message = ""
+                        else:
+                            error_message = f"{status_code}" if not is_valid else ""
+                except urllib.error.HTTPError as e:
+                    status_code = e.code
+                    is_valid = status_code < 400
+                    error_message = f"{status_code} {e.reason}"
 
         return (is_valid, status_code, error_message)
     except Exception as e:
```
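
A quick way to exercise the new code paths from a Python shell, assuming the repo root is on `sys.path` (`is_downloadable_file` and `check_url` are the functions this commit adds or rewrites):

```python
from check_links import check_url, is_downloadable_file

# Content-Type takes precedence over the URL extension
assert is_downloadable_file('https://example.com/report', 'application/pdf')
assert not is_downloadable_file('https://example.com/download', 'text/html; charset=utf-8')
# Extension fallback when no Content-Type is supplied
assert is_downloadable_file('https://example.com/archive.tar.gz')

# A release tarball should now be resolved by the HEAD request alone,
# with nothing downloaded
ok, status, error = check_url(
    'https://github.com/pinecone-io/cli/releases/latest/download/pc_Linux_x86_64.tar.gz'
)
print(ok, status, error)
```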
