-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
122 lines (102 loc) · 3.45 KB
/
main.py
File metadata and controls
122 lines (102 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import argparse
import math
import sys
from typing import Optional, Sequence

from crawler.core import crawl_site
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the scraper's command-line options.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``,
            which matches the behavior of calling this function with no
            arguments.

    Returns:
        Namespace with ``url``, ``types``, ``output``, ``run_name``,
        ``delay`` and ``timeout``. ``url`` is ``None`` when ``--url`` is not
        given; this function does not enforce its presence.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Scan a single webpage and download tabular/document files "
            "with optional type/year filters."
        )
    )
    parser.add_argument("--url", help="Base URL to start crawling from")
    parser.add_argument(
        "--types",
        default="",
        help="Comma-separated file types (e.g. csv,xlsx,docx). Defaults to all supported.",
    )
    parser.add_argument(
        "--output",
        default="downloads",
        help="Output directory for downloaded files (default: downloads)",
    )
    parser.add_argument(
        "--run-name",
        default="",
        help=(
            "Optional folder name for this run. "
            "If omitted, a timestamped run folder is created automatically."
        ),
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=0.5,
        help="Delay in seconds between requests (default: 0.5)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=15.0,
        help="Request timeout in seconds (default: 15)",
    )
    # Passing None lets argparse read sys.argv[1:] itself.
    return parser.parse_args(argv)
def prompt_for_run_settings() -> dict:
    """Interactively collect crawl settings from standard input.

    The URL prompt repeats until a non-empty value is entered. Every other
    prompt falls back to its default when left blank.

    Returns:
        dict with the keys ``base_url``, ``file_types`` (list of str, or
        ``None`` when left blank), ``output_dir``, ``run_name`` (str or
        ``None``), ``delay_seconds`` (clamped to >= 0.0) and ``timeout``
        (clamped to >= 1.0).

    Raises:
        ValueError: If the delay or timeout entry is not a finite number.
    """
    print("Tabular Data Scraper - Interactive Mode")
    print("Press Enter to accept defaults shown in [brackets].")

    while True:
        url = input("Website URL: ").strip()
        if url:
            break
        print("URL is required.")

    types_raw = input(
        "File types (comma-separated, blank = all supported) []: "
    ).strip()
    file_types = [item.strip() for item in types_raw.split(",") if item.strip()]

    output_dir = input("Output folder [downloads]: ").strip() or "downloads"
    run_name = input("Run name (optional) []: ").strip() or None
    delay_raw = input("Delay seconds [0.5]: ").strip() or "0.5"
    timeout_raw = input("Timeout seconds [15]: ").strip() or "15"

    try:
        delay_value = float(delay_raw)
        timeout_value = float(timeout_raw)
    except ValueError as exc:
        raise ValueError("Delay/timeout must be numbers") from exc

    # float() accepts "nan"/"inf", and max(nan, 0.0) stays NaN, so the clamps
    # below would let non-finite values through; reject them explicitly.
    if not (math.isfinite(delay_value) and math.isfinite(timeout_value)):
        raise ValueError("Delay/timeout must be finite numbers")

    return {
        "base_url": url,
        "file_types": file_types or None,
        "output_dir": output_dir,
        "run_name": run_name,
        "delay_seconds": max(delay_value, 0.0),
        "timeout": max(timeout_value, 1.0),
    }
def main() -> int:
    """Run the scraper in interactive or command-line mode.

    With no command-line arguments the settings are gathered through
    ``prompt_for_run_settings``; otherwise they are taken from
    ``parse_args``. The resulting settings are passed as keyword arguments
    to ``crawl_site``.

    Returns:
        Process exit status: 0 on success, 2 for invalid or missing input
        (including stdin closing mid-prompt), 130 when interrupted with
        Ctrl-C.
    """
    interactive_mode = len(sys.argv) == 1

    try:
        if interactive_mode:
            settings = prompt_for_run_settings()
        else:
            args = parse_args()
            if not args.url:
                print("[ERROR] --url is required when using command-line options")
                return 2
            # argparse's float conversion accepts "nan"/"inf", and
            # max(nan, x) stays NaN, so the clamps below would not catch them.
            if not (math.isfinite(args.delay) and math.isfinite(args.timeout)):
                raise ValueError("--delay/--timeout must be finite numbers")
            file_types = [item.strip() for item in args.types.split(",") if item.strip()]
            settings = {
                "base_url": args.url,
                "file_types": file_types or None,
                "output_dir": args.output,
                "run_name": args.run_name.strip() or None,
                "delay_seconds": max(args.delay, 0.0),
                "timeout": max(args.timeout, 1.0),
            }
        crawl_site(**settings)
    except ValueError as exc:
        print(f"[ERROR] {exc}")
        return 2
    except EOFError:
        # input() raises EOFError when stdin is closed or exhausted; report it
        # like any other bad input instead of dumping a traceback.
        print("[ERROR] Unexpected end of input")
        return 2
    except KeyboardInterrupt:
        print("[ERROR] Interrupted by user")
        return 130

    if interactive_mode:
        try:
            input("\nDone. Press Enter to close...")
        except (EOFError, KeyboardInterrupt):
            # The crawl already finished; failing to wait for Enter must not
            # turn a successful run into a crash.
            pass
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())