-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patha.py
70 lines (49 loc) · 1.89 KB
/
a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from pathlib import Path
import aiofiles
from tqdm import tqdm
from dynabook_scraper.utils import json
from dynabook_scraper.utils.common import download_file, run_concurrently
from dynabook_scraper.utils.paths import content_dir, data_dir, downloads_dir
from dynabook_scraper.utils.uvloop import async_run
async def process(file: Path) -> None:
    """Record the on-disk size of the asset described by one crawl-result file.

    Entries that are not regular files named ``*_crawl_result.json`` are
    ignored, as are results that already contain ``actual_size`` or whose
    ``status_code`` is not 200.  For the remaining results the ``url`` field
    is rewritten to ``assets/content/<content_id>/<filename>``, the matching
    file is looked up under ``data_dir``, its size is stored under
    ``actual_size`` and the JSON is written back to ``file``.

    Args:
        file: Candidate path.  The content id is taken from the part of the
            file name before the first underscore.
    """
    if not file.is_file() or not file.name.endswith("_crawl_result.json"):
        return

    content_id = file.name.split("_")[0]

    async with aiofiles.open(file) as f:
        result = await json.aload(f)

    # A result that already has a size was completed by an earlier run.
    if "actual_size" in result:
        return

    tqdm.write(str(result))

    if result["status_code"] != 200:
        return

    if "rescue_strategy" in result:
        # Rescued results are named after the original URL; a file that was
        # saved under the mirror's file name is renamed to match.
        filename = Path(result["original_url"]).name
        mirror_filename = Path(result["mirror_url"]).name
        if filename != mirror_filename:
            bad = downloads_dir / content_id / mirror_filename
            good = downloads_dir / content_id / filename
            if bad.is_file():
                bad.rename(good)
    else:
        filename = Path(result["url"]).name

    result["url"] = f"assets/content/{content_id}/{filename}"

    path = Path(data_dir / result["url"])
    if not path.is_file() and path.parent.is_dir():
        # The file may exist under a name that differs only in letter case;
        # rename the first such sibling to the expected name.
        for candidate in path.parent.iterdir():
            if candidate.name.lower() == path.name.lower():
                candidate.rename(path)
                break

    if not path.is_file() and "rescue_strategy" not in result:
        async with aiofiles.open(content_dir / f"{content_id}.json") as f:
            content = await json.aload(f)
        # NOTE(review): presumably `downloads_dir / content_id` is the same
        # directory as `path.parent` -- confirm against the paths module.
        await download_file(content["contentFile"], downloads_dir / content_id)

    try:
        result["actual_size"] = path.stat().st_size
    except FileNotFoundError:
        # Report the miss instead of dropping into the debugger: this
        # coroutine runs concurrently with others, and a pdb prompt would
        # stall all of them.  The result is still written below without
        # `actual_size`, so a later run will pick it up again.
        tqdm.write(f"{file.name}: expected file not found: {path}")

    async with aiofiles.open(file, "w") as f:
        await json.adump(result, f)
async def main():
    """Feed every entry of ``content_dir`` to ``process`` via ``run_concurrently``.

    The directory listing is materialised into a list before being handed
    over.  The leading ``20`` is presumably the concurrency limit -- confirm
    against ``run_concurrently``.
    """
    entries = list(content_dir.iterdir())
    await run_concurrently(20, process, entries)
# Script entry point: hand the `main()` coroutine to the project's
# `async_run` helper.  This executes whenever the module is loaded.
async_run(main())