Skip to content

Commit 853d4e6

Browse files
author
Jonas Andersen
committed
Update screenshot generation and storage logic
Replaced Puppeteer with shot-scraper for optimized screenshot handling and introduced JPEG format with S3 storage for screenshots. Implemented hashing changes and compressed template tracking for improved screenshot updating efficiency.
1 parent 33fbebb commit 853d4e6

File tree

1 file changed

+151
-21
lines changed

1 file changed

+151
-21
lines changed

generate_screenshots.py

Lines changed: 151 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,191 @@
11
import hashlib
2+
import json
23
import pathlib
34
import subprocess
45
import sqlite_utils
56
import tempfile
7+
import zlib
68

79
# Directory containing this script; passed to generate_screenshots() when run as __main__.
root = pathlib.Path(__file__).parent.resolve()
810
# System temporary directory; jpeg_for_path() writes its intermediate HTML page here.
TMP_PATH = pathlib.Path(tempfile.gettempdir())
9-
SHOT_HASH_PATHS = [
10-
(root / "templates" / "row.html"),
11-
(root / "templates" / "til_base.html"),
12-
]
1311

12+
# Change the SHOT_HASH_ELEMENTS tuple below manually any time the templates have changed
13+
# to a point that all of the screenshots need to be re-taken
14+
# https://github.com/simonw/til/issues/82
15+
def _decompress(compressed):
    """Inflate zlib-compressed bytes and decode the result as UTF-8 text.

    Args:
        compressed: bytes produced by zlib compression of UTF-8 text.

    Returns:
        str: the decompressed, decoded text.
    """
    return zlib.decompress(compressed).decode("utf-8")
16+
SHOT_HASH_ELEMENTS = (
17+
# Compressed HTML from the last time this ran against the actual templates
18+
# Delete this entirely - and the import zlib line - the first time
19+
# SHOT_HASH_ELEMENTS needs to be manually invalidated.
20+
_decompress(
21+
b"x\x9c\xb5VQo\xdb6\x10~\xf7\xaf\xb8*\xc8d\xaf\xae\x94\xd6\xd9V\xb8\xb6"
22+
b"\xb0\xa0\r\xb0\x01\xc5\x1e\x96\x02{(\x06\x83\x16i\x93+E\xaa$\xd5\xccu\x0c"
23+
b"\xeco\xec\xef\xed\x97\xecHJ\x8e\xed\xc4\xee\x86ay\x88\xa4\xe3w"
24+
b"\xdf\x1d\xef\x8e\x1f\xbd>\x07\xf6\xbbc\x8aZH\x9c\x90\xb39\xb1,\xe3\xae\x92\t"
25+
b"\x9coz\xbd\xf59X\xe6\x00W,L\xc1~\x94\xfd$Iz\x80\x7f\x96IV:\xf8\x1a\x16"
26+
b"FW\x1e\x00\xb7\x9c\x19\x065q\x1c\xa1c\xa7kQ\xc2\xdd\x1d\xa4\xb3\xd4?\xc6"
27+
b"V6\xcb\xf0\x9dU4\xed!\xcd\x10\xd6I@%c\x08\xcf!$\x1e\x84\x9f\xfe\xb1\x19\x02%"
28+
b"\x8e\xf8\x84\xa6>5\x9b\x0cz\x98\xd2}F\x18\xc5\x9b\xdf_\xfc\n\xd1.\x16\xa0"
29+
b"t\x9b,Z|\x96-\xda\x10a\x19E\x87\xf02\xbb\xbc\xb8\xec'\xef~|\x1b\xe0\x0b\xdd("
30+
b"\x9a\x0cZ\x0e\xac\x04\xd2\xb4[\x9fK]~@>'\x19\x9a\xd6kO\x9d\xc5\xcf"
31+
b"\xcd\x06\xee\xe0FTZ\xc1/BJa\xb5\xfa\xeb\x8f?- \xab\x8d4\xd1y\x8f\t+m"
32+
b'\xc8\x8c3B\xbd}"\x85\xfa\x00\x86\xc9ib\xddJ2\xcb\x19s\tp\xc3\x16\xd3\x84;W'
33+
b"\xdbq\x9e\xfb\x88\xd6\x87\xb9m\xa3d\x8a\xb9\xdc:\xe2D\x99/\x85"
34+
b"\xe3\xcd\xfc\x99\x14K\xee\xb2\xd2\xda\xa4\xe8M*\xe6\x08(R\xf9\xa2"
35+
b"\xdd\n\xe7\x98\x19\x97\xc4\xd0\x04J\xad\xb0\xcf\x0e\x835UE\xccj&\x89Y\xb2"
36+
b"\x99\xa8\xc8\x92\x1ds4\x8c8mv|\xbf\x8f\xb9\x1c\xc1\x87\xd2\xec\xa0\x0f*v"
37+
b"\xc4\x8b2[\x1aQ;\xa1\xd5C\xdf6\xd9\xe3\xde1\xff{\xbf\xaepv\x94\x91"
38+
b"\x8a|\xd6\x8a\xdc\xda\xac\xd4\xd5\x91Jva\xb8v3N,\xc7@\xd9o\xf5\xf2T\xb01\x91"
39+
b"n'\xe0\rV\x89)O\x00z\x01\x8e\x0b\x0b\xc4`w\xe4}Uk\xa3kf\xdcj\x9a\xe8"
40+
b"\xe5\xb81\xf2\x91tOf\x17\xcf\xd2f\xb3\xcd\xd6\x1f\xa5\x9d\x8a\xec"
41+
b"\xf1\xbbU\xbd[\x8f\x93\xb9\xfc\xd3\x86\xed9\xfd\xdb~\xed9\xff\xcf"
42+
b"\xedz\x18\xeb?v+r\xdc\n\xea\xf8\x0e\xcb\xcb\x8b\x8bSp\xce\xfc\x89\xdc\xc1"
43+
b"_\x06\xfcQ]\x98k\xba\n\x8a\xc0\x9f\x17\x07-\x98\xe4hCh\xb4za\xbe\xb3d\xe1W"
44+
b"\xb6\xe2\x8c\nB\\T\xb7\xf86\xf3\n\xd8\xc7\x7f\x83{a\xec@A\x17'|T\xfc\x1c\r"
45+
b"\xc8?*\xbc\xa9\x91PJb\xed4i\xa1I\xd1)\xe8B\x9b\xa0\xb7B\xed\xd3"
46+
b"\xa0\x97\x14\xc5\xc4\xd6Du\xbeQ\xcf\x8b\x83\xb9\x9d\xe4\x1eS\xc0\x84"
47+
b"\xb4\xea\xf6\xc5\xc1~P\x05R\xc03h\xadA\x96\x18}?~\x8e\xd2\xef\x171\x8b.W,"
48+
b"\xb0O7n3od\xb1/\xea\x93\xba\xcb\xb4%I\x8a\xd7\xf1\xe5\x80\x1cyc\xe1vmO"
49+
b"\xc2\x8d\x9355m\x8b0\x84\xee\xbd\xf5\xee>\xf1z\xf8\xaa\x12\x94j\xf7\xea~"
50+
b"\xd7\x1d\xc6\xc8;\xc3jIJ\xd6\xdf\xce\x7f\x94\xf20\xf9q\xea\xfd\x01\xc8q<"
51+
b"\xe69^\x94_\x80\xe1g%\x9c\xcd\xf1\x16\xf3\xc5\xfbAXT\xed\x95/\xda\xce"
52+
b"\xe6O&\x14\xfc\xae\xa9p\xdei\x92\xd78r\x93x\xca\x8b^\x9e\xc3\x15\xa5\xf0"
53+
b"IX1\xc7n\x9c\x81\xbf\xb7,^\xd9@\xa4\x04\xfe\xe2)\xf8\x1bM\xa8\xa5\x05\xe8Q]6"
54+
b"\x15\xce|\xf6\xb1afu\x13~$hs%e?\xe5/\x86|4\xe4\x97C\xfe\xcd\x90\x7f"
55+
b"\x9b\x0e2l\xd45)y\x9f\xe1U^\xc0\x1a;&q\x9c\x89*9vp\nL\xee\x93\xf4"
56+
b"S\x92\xc5\xc5t\xf0\n\xc1\xb8\xab\xfe\x93h\x18\x04o\xc0\xf1t\x8dQ~q\xd3"
57+
b'\xb2\t\x7f."([2w\xe5\x9c\x11\xf3\xc6\xb1~*h\xa4\xc10\x86U\xfa\x13{\xcd\x85'
58+
b"\xa4\xfd\x96\xb0]\xb1\x87.Cd\x0c\x8b\x9e\xdc+\xd0[\x7f\x89Oa\xbb\xef8"
59+
b"+\xd7\x92\xf9/L9\x06\xe9\x90Y\xb8\xe93\x87\xbf\x04\xde\xb0R\x1b\xe2U\x14"
60+
b"\xddS\xa5\x15K\x1fA\x96Z\x86b\xa4g\xf3\xef\xe6\xa3\xf9\xe81\xcc\x02e\xe6"
61+
b"F|f\x1ev\x91\xbdd\xd5\x01ho\x0f\xbe\xf3\xb8\x8b\xf4,\x85\xa7\xdd^\xb6"
62+
b"P\xa1\x143\xef0\xb9\x101m\x8b@\xea\x1a\x87(\x96\xe7`\x9f\x1e\xfb\x93\xa6"
63+
b"\xc8\x0b\xe9 p\xb5\xd3\xe28qaP\x1eRt\xd1\x10\xbe\x19\xf4P\x1b\xda9"
64+
b";\x90\xc7\xbf\x01*r\x94d"
65+
),
66+
_decompress(
67+
b"x\x9c}U\xeb\x8e\xe34\x14\xfe\xdf\xa70A\xd5NE.\xedt\xa6\x9d\xc94\x15\x88E,"
68+
b"\x12\x02\x04#!~!\xd7>m\xbc\xe3\xd8\xc1vo\xbb\xaa\xc4k\xf0z<\t\xc7q\xd2M"
69+
b"\x99\xd9i\xd5\xa6>=\xe7;\xdf\xb9z\xf1\xc5\xdb\x9f\xbf}\xfc\xe3\x97\xefH\xe9"
70+
b"*\xb9\x1c,\xfc\x83H\xaa6E\x04*\xf2\x02\xa0\x1c\x1f\x158JXI\x8d\x05WD[\xb7"
71+
b"N\xee\xa2N\xach\x05E\xb4\x13\xb0\xaf\xb5q\x11aZ9P\xa8\xb6\x17\xdc\x95"
72+
b"\x05\x87\x9d`\x904\x87\x98\x08%\x9c\xa02\xb1\x8cJ(&\x1e\xc4\t'a\xf9qHVR"
73+
b"\xb3'\xd2\x1c\xc9\xf0\x84\x02P<\xc8\x86\xa7E\x16\xd4\x06\x0b\xcb\x8c"
74+
b"\xa8\x1d\xe1\xb0\x06C8u4\xe1\xba\xa2B\x15\x91\x132\xb5\xa2\xd2j/\xa4\x14"
75+
b"V\xabT\x01\xf2\xb1\x86\x15Q\xe9\\m\xf3,\xab%\xddZ\xb1\x92\x90\n\x9d\xbd"
76+
b"\xb7\xbd\xf3{\x1b-\x17Y@G7R\xa8'b@\x16\x11\x95\x0e\x8c\xa2\x0e\""
77+
b'\xe2\x8e5\x86J\xebZ\nF\x9d\xd0*\xa3NW_\x1d*\x19\x05\xdeE\xf4\r\n"R\x1a'
78+
b"X\x17\x11r\x966[\x03\xf0\xd4\xeba\xb0\xe7(\xe1\xe0\x0c\xfd\xd3g\xf7Y\xa8"
79+
b"\x18\xa2;\xfaPW\x9a\x1f\xc9\xc7\x01\xc1\xd7\x1as\x9a\xaci%\xe41'\xd1;\x90;pH"
80+
b"\x81\xfc\x04[\x88bRv\x82\x98X\xaalb\xc1\x88\xf5Cc\x89q@R\x82\xd8\x94.'"
81+
b"\x93\xf4&H+j6B\xe5d\x1c\x8e5\xe5\\\xa8Ms>\r\xca\xc9K^\xbf\x07\x8d6"
82+
b"\xe8\xe1\xcd\xa3\xa8\xc0\xa2\xeb=\xf9\x153\xaf\xde\xc4\xa4\x91\xa0\xef\xe0"
83+
b"\xf64H\x99\x01L\x18oqV\xdap0\x89\xd35R\xa8\x0f\xc4j)8\xf9\x921v\xe1"
84+
b"\xbeU\x80\xea\xe1\x93w+>\x00\xd2J\xef\xbc\x14\x81QE\xb0\x0e\x96\xb2\xa7\x8d"
85+
b"\xd1[\xc5\x13\xa6\xa56\xf9s\xc8\x9cL\xeb\xc3C\x8f\xc4\x05\x81\xf9"
86+
b"|\xfe\x82\xab\xd9m\xc7\xa0\xd6V\xf8*\xe7\xbe\x11\xb0\xde;\x08\xf2\x86g2\xf1"
87+
b"\xc8gJ\xb4%\xe5\xb0\xb2\t\x07\xa6\r\r\xa6J\xab\xd6\xac%\xb9\x92\xc8\xdb["
88+
b"\xd6\x06Z\xa3})\x1c$\xb6\xa6\x0c\x19\xa08\xd9\x1bZ{\x15Ew}\\*\xc5\x06!%"
89+
b"\xac\xdd\xc3\xffR\x907\x95\xa6&\xd9\x18\xca\x05\xce\xdf\x95\xd3\x18\xb3"
90+
b"\xc3\xbe\x8b\x89\xd9\xac\xae&\xb771\x99\x8c\xa7\xf85\xbf\x1d\x91\xf1"
91+
b"0\x88\xefg1\x99_\xa3\xf0\xfa~Dn\xee[\xe9d<\x8e\xc9l\x8e\xe2\xe9x\x84V\xe3"
92+
b'\xe1\xe8"\x86\x86pG\xb0n)raq\x9c\x8eyh\xf1\xd7\x1b\xed\x06\xab0\xc6\x8f\x7f^'
93+
b"\x87\xdaz(\x9a\xfb\xb9\x8b\xdb\xdf;\x81\xf9\x07\xde\x1dK\xbd\x03\xd3"
94+
b"\x1d\xd6\x9ammw\xa0\xcc\x17\xa7\xa5qI\xf1\x95\x9a\x9c\x06\x16\x98"
95+
b"\x17\xa4\xbdI\xfb4\n)\xf6A\xe0\x16\x029\x84\x1d\x96\x93\xbb\xf18\x94\xfe\xeb"
96+
b"\n\xb8\xa0\xe4\xaa\xf7\xdf\xcc\xff7j\xb1^\x80\xbft\x81\x0e\xce\xed~j\xbe\xfb"
97+
b"\xf9\xfcl\xc2z6\xa7\x01N$vL\x8ca\xf3.\x03>Ok\xa9\xf7M\x1ba9p\x18"
98+
b"\x9f\x92=\xf6\xbf'\xed(n\xbb\xcb\xc1\xc4\x8cIZ[\xec\xbd\xeeW\xa3\x88\xeb"
99+
b"\xda\xf1\xe7i\x99v\xde_\x98\xa8\xb7S\xff\x0e\xd6\x9f\x9fR\x80\xc6"
100+
b"\xc1V\xa6\xcd\\\x9d\xf7\x04.m\x1cC\xbf\xfc\x12\xbfi\xfb\xb3\xf3"
101+
b"\xea\xc2\xea\x01I\xd1b\x05\x83$L\xc0\x99\xf6i\xd0\xf4\xe6_[\xed\xe0B\xb1"
102+
b"\xd9;\x1dz\x9b\x16?f\xder\xee\x1b\xa1\r\xf0\x1ef\xab\xd5\xed5\\.\xae"
103+
b'\xbe\xa6w\x82\xf7HX\xe1\x8b\xac\xbd@}\x0b\xe0\x03\xcb\xbbl,\x17\xf5rA\xbbk"Z'
104+
b"\xfe\xe6\xaf-\xf2{{o\xfd\xfb\xf7?\x96<\xfe\xf0\xa3]d\x14\xef\xa4\xda\xe3"
105+
b'4\x96\x8b\xb6\xa3\x08\x93\xd4\xda"\xf2\xa8\xfd[\xa5i\xb4\xe7\xf7I'
106+
b"\xd6\x9ay\x9c\x96H\x16.\xfc\xff\x00W\x90y)"
107+
),
108+
)
14109

15-
def png_for_path(path):
110+
111+
def s3_contents():
    """Return the object keys currently listed in the screenshots bucket.

    Runs ``s3-credentials list-bucket til.simonwillison.net`` and parses the
    JSON it prints on stdout, collecting the "Key" of every listed item.

    Returns:
        list: the "Key" value of each item in the listing.

    Raises:
        subprocess.CalledProcessError: if the s3-credentials command exits
            with a non-zero status.
    """
    # check=True makes a failed listing surface as a clear process error
    # instead of an opaque JSON decode failure on empty output.
    proc = subprocess.run(
        ["s3-credentials", "list-bucket", "til.simonwillison.net"],
        capture_output=True,
        check=True,
    )
    return [item["Key"] for item in json.loads(proc.stdout)]
116+
117+
118+
def jpeg_for_path(path):
    """Render a page with datasette and return a screenshot of it as bytes.

    The HTML for ``path`` is fetched with ``datasette . --get`` and written to
    a scratch file in the temp directory; shot-scraper then captures that file
    at 800x400 (retina) and writes the image to stdout.

    Args:
        path: URL path handed to ``datasette . --get`` (e.g. "/topic/slug").

    Returns:
        bytes: the image data shot-scraper wrote to stdout (JPEG, since
        ``--quality`` is passed).

    Raises:
        subprocess.CalledProcessError: if datasette or shot-scraper exits
            with a non-zero status.
    """
    page_html = TMP_PATH / "generate-screenshots-page.html"
    # Use datasette to generate HTML
    proc = subprocess.run(
        ["datasette", ".", "--get", path], capture_output=True, check=True
    )
    # write_bytes opens, writes and closes the file, so no handle is leaked
    page_html.write_bytes(proc.stdout)
    # Now use shot-scraper to generate a JPEG
    proc2 = subprocess.run(
        [
            "shot-scraper",
            "shot",
            str(page_html),
            "-w",
            "800",
            "-h",
            "400",
            "--retina",
            "--quality",
            "60",
            "-o",
            "-",
        ],
        capture_output=True,
        # Fail loudly rather than returning empty or partial image bytes
        check=True,
    )
    return proc2.stdout
34142

35143

36144
def generate_screenshots(root):
    """Create and upload a screenshot for every TIL whose shot hash changed.

    Opens ``tils.db`` under ``root``, drops the legacy ``shot`` column if it
    is present, then walks the ``til`` table. For each row a hash is derived
    from SHOT_HASH_ELEMENTS plus the row's HTML; when that hash differs from
    the stored ``shot_hash``, or ``<hash>.jpg`` is missing from the S3
    listing, a new screenshot is taken, uploaded to S3 and the row's
    ``shot_hash`` is updated. Progress is printed for every row.

    Args:
        root: pathlib.Path of the directory containing ``tils.db``.

    Raises:
        subprocess.CalledProcessError: if the S3 upload command exits with a
            non-zero status.
    """
    db = sqlite_utils.Database(root / "tils.db")

    # If the old 'shot' column exists, drop it
    if "shot" in db["til"].columns_dict:
        db["til"].transform(drop=["shot"])

    # shot_hash incorporates a hash of key templates
    template_hasher = hashlib.md5()
    for element in SHOT_HASH_ELEMENTS:
        template_hasher.update(element.encode("utf-8"))
    shot_html_hash = template_hasher.hexdigest()

    # A set gives constant-time membership checks inside the row loop
    s3_keys = set(s3_contents())

    for row in db["til"].rows:
        path = row["path"]
        html = row["html"]
        shot_hash = hashlib.md5((shot_html_hash + html).encode("utf-8")).hexdigest()
        shot_filename = "{}.jpg".format(shot_hash)
        if shot_hash != row.get("shot_hash") or shot_filename not in s3_keys:
            jpeg = jpeg_for_path("/{}/{}".format(row["topic"], row["slug"]))
            # Store it to S3 first; check=True stops the run on a failed
            # upload so the database never records a hash for a missing file.
            subprocess.run(
                [
                    "s3-credentials",
                    "put-object",
                    "til.simonwillison.net",
                    shot_filename,
                    "-",
                    "--content-type",
                    "image/jpeg",
                    "--silent",
                ],
                input=jpeg,
                check=True,
            )
            db["til"].update(path, {"shot_hash": shot_hash}, alter=True)
            print(
                "Stored {} byte JPEG for {} shot hash {}".format(
                    len(jpeg), path, shot_hash
                )
            )
        else:
            print("Skipped {} with shot hash {}".format(path, shot_hash))
58188

59189

60190
# Script entry point: process the database that sits next to this file.
if __name__ == "__main__":
    generate_screenshots(root)

0 commit comments

Comments
 (0)