"""A PDF processing utility for downloading and combining PDFs from the White House OMB memoranda page.
This module provides functionality to download PDF memoranda from the White House Office of
Management and Budget (OMB) website and combine them into a single PDF document. It includes
logging capabilities and configuration management.
Classes:
Config: Application configuration container with default paths and URLs
PDFProcessor: Main class handling PDF operations (downloading, processing, combining)
Functions:
setup_application_logging: Configures application-wide logging
main: Application entry point
Example:
$ python pdf_combiner.py
Dependencies:
- requests
- beautifulsoup4
- PyPDF2
- logging
- os
- datetime
- typing
- dataclasses
- re
- urllib
The application will:
1. Fetch PDF links from the OMB memoranda page
2. Download individual PDFs to a specified directory
3. Combine all downloaded PDFs into a single document
4. Save the combined PDF with a timestamp
5. Log all operations to both console and file
Notes:
- Requires write permissions in the specified download and output directories
- Uses rotating file logs with 5MB size limit and 3 backup files
- Handles both relative and absolute URLs
- Implements error handling and logging for all major operations
"""
import logging
import os
import re
import time
from dataclasses import dataclass
from datetime import datetime
from logging.handlers import RotatingFileHandler
from typing import Dict, List, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader, PdfWriter
from requests.exceptions import HTTPError, RequestException
def setup_application_logging(name: str, log_dir: str) -> logging.Logger:
    """Configure application-wide logging with file and console handlers."""
    os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Avoid attaching duplicate handlers if this is called more than once.
    if logger.handlers:
        return logger

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Console handler
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)

    # Rotating file handler (5 MB per file, 3 backups)
    file_handler = RotatingFileHandler(
        os.path.join(log_dir, f"{name}.log"),
        maxBytes=5 * 1024 * 1024,
        backupCount=3
    )
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger
@dataclass
class Config:
    """Application configuration."""
    BASE_URL: str = 'https://www.whitehouse.gov'
    MEMO_URL: str = 'https://www.whitehouse.gov/omb/information-for-agencies/memoranda/'
    A11_URL: str = 'https://www.whitehouse.gov/wp-content/uploads/2018/06/a11_web_toc.pdf'  # not referenced below
    DOWNLOAD_DIR: str = os.path.join(os.path.expanduser('~'), 'Downloads', 'DOJ EA Proposal', 'Downloaded_PDFs')
    OUTPUT_DIR: str = os.path.join(os.path.expanduser('~'), 'Downloads', 'DOJ EA Proposal', 'Combined_Files')
    LOG_DIR: str = os.path.join(os.path.dirname(__file__), 'logs')
    PDF_PATTERN: str = r'.*\.pdf$'
    WORD_LIMIT: int = 50000  # Maximum words per combined PDF
    CACHE_FILE: str = os.path.join(os.path.dirname(__file__), 'download_cache.json')  # reserved; not used yet
    FORCE_DOWNLOAD: bool = False
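
# Usage sketch: Config is a dataclass, so any default can be overridden per run
# without editing this file. The paths below are hypothetical, not the defaults:
#
#   config = Config(FORCE_DOWNLOAD=True,
#                   DOWNLOAD_DIR='/tmp/omb_pdfs',
#                   OUTPUT_DIR='/tmp/omb_combined')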
class PDFProcessor:
    """Handles PDF download, processing and combination operations."""

    def __init__(self, config: Config):
        self.config = config
        self.logger = setup_application_logging("pdf_combiner", config.LOG_DIR)
        self._validate_paths()

    def _validate_paths(self) -> None:
        """Ensure all required paths exist and are writable."""
        for path in [self.config.DOWNLOAD_DIR, self.config.OUTPUT_DIR]:
            if not os.path.isabs(path):
                raise ValueError(f"Path must be absolute: {path}")
            os.makedirs(path, exist_ok=True)
            if not os.access(path, os.W_OK):
                raise PermissionError(f"Directory not writable: {path}")
    def get_pdf_links(self) -> List[str]:
        """Retrieve PDF links from the memo page."""
        try:
            self.logger.info("Fetching PDF links from memo page...")
            response = requests.get(self.config.MEMO_URL, timeout=30)
            response.raise_for_status()

            self.logger.info("Parsing webpage content...")
            soup = BeautifulSoup(response.content, 'html.parser')
            links = soup.find_all('a', href=re.compile(self.config.PDF_PATTERN))
            pdf_links = [self._get_absolute_url(link['href']) for link in links]

            self.logger.info(f"Found {len(pdf_links)} PDF links")
            return pdf_links
        except Exception as e:
            self.logger.error(f"Error getting PDF links: {str(e)}")
            return []
    def _get_absolute_url(self, relative_url: str) -> str:
        """Convert a relative URL to an absolute URL; absolute URLs pass through unchanged."""
        if urlparse(relative_url).netloc:
            return relative_url
        return urljoin(self.config.BASE_URL, relative_url)
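
    # For example (hypothetical memo path):
    #   _get_absolute_url('/wp-content/uploads/m-25-01.pdf')
    #   -> 'https://www.whitehouse.gov/wp-content/uploads/m-25-01.pdf'
    # while a URL that already carries a netloc is returned unchanged.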
    def _should_download(self, url: str, target_path: str) -> bool:
        """Check if the file should be downloaded."""
        if self.config.FORCE_DOWNLOAD:
            return True
        if not os.path.exists(target_path):
            return True
        try:
            # Compare the remote Content-Length against the local file size; if the
            # header is missing, the sizes differ and the file is re-downloaded.
            response = requests.head(url, timeout=30, allow_redirects=True)
            remote_size = int(response.headers.get('content-length', 0))
            local_size = os.path.getsize(target_path)
            if remote_size != local_size:
                return True
            self.logger.debug(f"Skipping existing file: {target_path}")
            return False
        except Exception as e:
            self.logger.warning(f"Error checking file {url}: {str(e)}")
            return True
    def download_pdfs(self, urls: List[str], max_retries: int = 3) -> Tuple[Dict[str, str], List[str]]:
        """
        Download PDFs from the provided URLs.

        Returns a tuple of (successful_downloads, failed_urls), where
        successful_downloads maps each URL to its local file path.
        """
        downloaded_files = {}
        failed_urls = []

        for url in urls:
            retries = 0
            while retries < max_retries:
                try:
                    # Derive the local filename from the URL path, ignoring any query string.
                    filename = os.path.join(self.config.DOWNLOAD_DIR,
                                            os.path.basename(urlparse(url).path))
                    if not self._should_download(url, filename):
                        downloaded_files[url] = filename
                        break

                    response = requests.get(url, timeout=30)
                    if response.status_code == 404:
                        self.logger.warning(f"File not found (404): {url}")
                        failed_urls.append(url)
                        break
                    response.raise_for_status()

                    with open(filename, 'wb') as f:
                        f.write(response.content)
                    downloaded_files[url] = filename
                    self.logger.info(f"Downloaded: {url}")
                    break
                except HTTPError as he:
                    if he.response.status_code == 404:
                        self.logger.warning(f"File not found (404): {url}")
                        failed_urls.append(url)
                        break
                    retries += 1
                    if retries == max_retries:
                        self.logger.error(f"Max retries reached for {url}: {str(he)}")
                        failed_urls.append(url)
                    else:
                        time.sleep(1)  # Wait before retry
                except RequestException as req_err:
                    # Timeouts and connection errors are transient; retry them too.
                    retries += 1
                    if retries == max_retries:
                        self.logger.error(f"Max retries reached for {url}: {str(req_err)}")
                        failed_urls.append(url)
                    else:
                        time.sleep(1)  # Wait before retry
                except Exception as e:
                    self.logger.error(f"Error downloading {url}: {str(e)}")
                    failed_urls.append(url)
                    break

        return downloaded_files, failed_urls
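
    # Note on the retry contract above: a 404 fails fast and is recorded in
    # failed_urls, while other HTTP and network errors are retried with a 1 s
    # pause, up to max_retries attempts. Given a PDFProcessor instance
    # `processor`, a standalone call might look like (hypothetical URL):
    #   files, failed = processor.download_pdfs(
    #       ['https://www.whitehouse.gov/wp-content/uploads/example.pdf'],
    #       max_retries=5)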
    def _count_words(self, pdf_reader: PdfReader) -> int:
        """Count approximate words in a PDF."""
        word_count = 0
        for page in pdf_reader.pages:
            # extract_text() can yield None or empty text for image-only pages.
            text = page.extract_text() or ""
            word_count += len(text.split())
        return word_count
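
    # Worked example of the splitting rule in combine_pdfs below, under the
    # default WORD_LIMIT of 50,000: memos of roughly 30k, 30k, and 10k words
    # produce part 1 = 30k (adding the second 30k memo would exceed the limit)
    # and part 2 = 30k + 10k = 40k. A single file over the limit still lands in
    # its own part; documents are never split mid-file.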
    def combine_pdfs(self, downloaded_files: Dict[str, str]) -> List[str]:
        """Combine downloaded PDFs into multiple files based on the word limit."""
        output_paths = []
        current_merger = PdfWriter()
        current_word_count = 0
        file_counter = 1

        try:
            for filepath in downloaded_files.values():
                reader = PdfReader(filepath)
                file_words = self._count_words(reader)

                # If adding this file would exceed the limit, save the current part and start a new one.
                if current_word_count + file_words > self.config.WORD_LIMIT and current_word_count > 0:
                    output_path = os.path.join(
                        self.config.OUTPUT_DIR,
                        f'combined_memos_part{file_counter}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.pdf'
                    )
                    with open(output_path, 'wb') as output_file:
                        current_merger.write(output_file)
                    output_paths.append(output_path)
                    self.logger.info(f"Created combined PDF part {file_counter} at: {output_path}")

                    # Reset for the next part
                    current_merger = PdfWriter()
                    current_word_count = 0
                    file_counter += 1

                # Add this file's pages to the current part
                for page in reader.pages:
                    current_merger.add_page(page)
                current_word_count += file_words

            # Save the final part if anything is left
            if current_word_count > 0:
                output_path = os.path.join(
                    self.config.OUTPUT_DIR,
                    f'combined_memos_part{file_counter}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.pdf'
                )
                with open(output_path, 'wb') as output_file:
                    current_merger.write(output_file)
                output_paths.append(output_path)
                self.logger.info(f"Created final combined PDF part {file_counter} at: {output_path}")

            return output_paths
        except Exception as e:
            self.logger.error(f"Error combining PDFs: {str(e)}")
            return []
def main() -> None:
    processor = None
    try:
        config = Config()
        processor = PDFProcessor(config)

        pdf_links = processor.get_pdf_links()
        if not pdf_links:
            processor.logger.error("No PDF links found")
            return

        downloaded_files, failed_urls = processor.download_pdfs(pdf_links)
        if failed_urls:
            processor.logger.warning(f"Failed to download {len(failed_urls)} files")

        if downloaded_files:
            output_paths = processor.combine_pdfs(downloaded_files)
            if output_paths:
                processor.logger.info(f"Created {len(output_paths)} combined PDF files")
    except Exception:
        # If PDFProcessor construction failed, there is no logger to report to yet.
        if processor is not None:
            processor.logger.exception("Fatal error in main application")
        raise
    finally:
        if processor is not None:
            processor.logger.info("Application shutting down")
if __name__ == "__main__":
main()
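
# Programmatic usage sketch (assumes the module is importable, e.g. after
# renaming this file to pdf_combiner.py so the name is a valid identifier):
#
#   from pdf_combiner import Config, PDFProcessor
#   processor = PDFProcessor(Config())
#   files, failed = processor.download_pdfs(processor.get_pdf_links())
#   parts = processor.combine_pdfs(files)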