-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
150 lines (121 loc) · 4.26 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
"""
PDF Processor - A Flask web application for processing PDF files.
This application provides a web interface for uploading PDF files,
extracting their text content, and presenting it in a clean format.
It supports files up to 50MB and processes them locally for privacy.
Features:
- PDF file upload with size validation
- Page-by-page text extraction (page texts are joined with newlines)
- Local file processing
- Secure file handling
- Clean web interface
Dependencies:
- Flask: Web framework
- PyPDF2: PDF processing
- python-magic: File type validation
- Werkzeug: File handling utilities
"""
import os
import tempfile
from flask import Flask, request, jsonify, render_template, send_from_directory
from werkzeug.utils import secure_filename
import PyPDF2
import magic
# Create the Flask application object used by the route decorators in this file.
app = Flask(__name__)

# Configuration, applied in a single update call.
app.config.update(
    # 50MB limit
    MAX_CONTENT_LENGTH=50 * 1024 * 1024,
    # Use the system temp directory for uploaded files
    UPLOAD_FOLDER=tempfile.gettempdir(),
)

# Lower-case extensions (without the dot) accepted for upload: PDF only.
ALLOWED_EXTENSIONS = {'pdf'}
def allowed_file(filename):
    """
    Report whether a filename carries a permitted extension.

    The extension is the text after the last '.' in the name and is
    compared case-insensitively against ALLOWED_EXTENSIONS.

    Args:
        filename (str): Name of the uploaded file

    Returns:
        bool: True if the extension is allowed, False otherwise
              (including when the name contains no '.')
    """
    # A name without any dot has no extension to check.
    if '.' not in filename:
        return False

    _, extension = filename.rsplit('.', 1)
    return extension.lower() in ALLOWED_EXTENSIONS
def is_pdf(file_path):
    """
    Check whether libmagic identifies the file's content as a PDF.

    Args:
        file_path (str): Path to the uploaded file

    Returns:
        bool: True if the detected MIME type is 'application/pdf',
              False otherwise
    """
    # Ask libmagic for a MIME type rather than a textual description.
    detector = magic.Magic(mime=True)
    detected_type = detector.from_file(file_path)
    return detected_type == 'application/pdf'
def extract_text_from_pdf(file_path):
    """
    Extract text content from a PDF file.

    Each page's text is extracted in order and the results are joined
    with newline characters.

    Args:
        file_path (str): Path to the PDF file

    Returns:
        str: Extracted text content

    Raises:
        OSError: If the file cannot be opened
        Exception: If PDF processing fails; errors raised by PyPDF2 are
            propagated unchanged
    """
    # Errors are deliberately NOT caught here. Returning the error message
    # as if it were extracted text would make a failed extraction
    # indistinguishable from a successful one for the caller.
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Substitute an empty string for a falsy per-page result (e.g. None)
        # so that str.join never fails on a non-string element.
        return '\n'.join(page.extract_text() or '' for page in reader.pages)
@app.route('/')
def index():
    """Serve the main page by rendering the index.html template."""
    page = render_template('index.html')
    return page
@app.route('/static/<path:path>')
def send_static(path):
    """Serve the file at the requested path from the 'static' directory."""
    static_dir = 'static'
    return send_from_directory(static_dir, path)
@app.route('/upload', methods=['POST'])
def upload_file():
    """
    Handle file upload and processing.

    Expects a PDF file in the 'file' field of the request, stores it in a
    uniquely named temporary file, verifies it is a PDF, and returns the
    extracted text.

    Implements several security measures:
    - File type validation (extension check, then content check)
    - Size limit enforcement (via the app's MAX_CONTENT_LENGTH setting)
    - The client-supplied filename is never used as a filesystem path
    - Temporary file cleanup on every exit path

    Returns:
        JSON response with either:
        - success: {'text': extracted_text}
        - error: {'error': error_message} with status 400 (bad input)
          or 500 (processing failure)
    """
    # Check if file was uploaded
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    # Treat a missing (None) filename the same as an empty one so the
    # extension check below never receives a non-string.
    if not file.filename:
        return jsonify({'error': 'No file selected'}), 400

    # Validate file type
    if not allowed_file(file.filename):
        return jsonify({'error': 'Invalid file type. Please upload a PDF file.'}), 400

    filepath = None
    try:
        # Write to a unique temp file instead of a path derived from the
        # client's filename: two uploads with the same name can no longer
        # overwrite each other or clobber an existing file in the temp dir.
        fd, filepath = tempfile.mkstemp(
            suffix='.pdf', dir=app.config['UPLOAD_FOLDER']
        )
        with os.fdopen(fd, 'wb') as tmp:
            file.save(tmp)

        # Verify file is actually a PDF
        if not is_pdf(filepath):
            return jsonify({'error': 'Invalid PDF file'}), 400

        # Process the PDF and extract text
        text = extract_text_from_pdf(filepath)
        return jsonify({'text': text})
    except Exception as e:
        # Request boundary: record the full traceback server-side and
        # report the failure to the client.
        app.logger.exception('Failed to process uploaded PDF')
        return jsonify({'error': str(e)}), 500
    finally:
        # Clean up the temporary file on every path, including errors.
        if filepath is not None:
            try:
                os.remove(filepath)
            except OSError:
                # Best effort: the file may already be gone.
                pass
if __name__ == '__main__':
    # Start the Flask development server.
    # Note: In production, use a proper WSGI server like gunicorn.
    #
    # The server listens on all interfaces (0.0.0.0), so debug mode is
    # opt-in: Flask's debug mode enables an interactive debugger that lets
    # anyone who can reach it execute code. Set FLASK_DEBUG=1 to enable it
    # for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in {
        '1', 'true', 'yes', 'on',
    }
    app.run(debug=debug_enabled, host='0.0.0.0', port=8080)