-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathplaywright_serp.py
More file actions
81 lines (63 loc) · 2.25 KB
/
playwright_serp.py
File metadata and controls
81 lines (63 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import csv
import json
import os
from random import randint
from time import sleep
from urllib.parse import quote_plus, urlsplit

from playwright.sync_api import Playwright, sync_playwright, expect
# --- Inputs: define `query_list` (the search queries to run) in the scratchpad module ---
from scratchpad import query_list
# Queries that run() iterates over; one search is performed per entry.
queries = query_list
# Directory where run() writes one CSV file per query.
output_dir = "scratch"
def act_natural(min_seconds=1, max_seconds=5):
    """Pause for a random whole number of seconds.

    Used between browser actions so they are not fired back to back.

    Args:
        min_seconds: Shortest pause in seconds, inclusive. Defaults to 1.
        max_seconds: Longest pause in seconds, inclusive. Defaults to 5.

    Raises:
        ValueError: If ``min_seconds`` is greater than ``max_seconds``, or
            if the chosen delay is negative.
    """
    sleep(randint(min_seconds, max_seconds))
def _site_of(url):
    """Return the host of *url* without a leading ``www.`` ("" if it has no host)."""
    host = urlsplit(url).netloc
    if host.startswith("www."):
        host = host[len("www."):]
    return host


def _first_url_per_site(links):
    """Map each site to the first URL seen for it, keeping result order."""
    sites = {}
    for link in links:
        site = _site_of(link)
        # Relative or scheme-less hrefs have no host to group under; skip them.
        if site and site not in sites:
            sites[site] = link
    return sites


def _load_more_results(page, clicks=5):
    """Click the "More results" link up to *clicks* times, pausing after each click."""
    try:
        for _ in range(clicks):
            page.get_by_role("link", name="More results").click()
            act_natural()
    except Exception as ex:
        # Best effort: a failed click (presumably because the link is gone once
        # results run out) must not discard the results already loaded.
        print(ex)


def _collect_links(page):
    """Return the href of every result title on *page*, skipping DuckDuckGo links."""
    results = page.get_by_test_id("result-title-a")
    links = []
    for index in range(results.count()):
        href = results.nth(index).get_attribute("href")
        # get_attribute() yields None when the attribute is missing.
        # NOTE(review): the substring test also drops third-party URLs that
        # merely contain "duckduckgo"; kept as-is to preserve the result set.
        if href and "duckduckgo" not in href:
            links.append(href)
    return links


def run(playwright: Playwright) -> None:
    """Search DuckDuckGo for every entry in ``queries`` and save the result links.

    For each query, a ``<query with spaces as underscores>.csv`` file is written
    to ``output_dir`` with a ``site,url`` header and one row per distinct site
    (the first URL seen for that site). After all queries have been processed,
    ``search_results.json`` is written to the working directory, mapping each
    successfully scraped query to the full list of its result links.

    A failure while handling one query is printed and that query is skipped;
    the remaining queries still run in the same browser session.

    Args:
        playwright: The Playwright instance used to launch Chromium.
    """
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    try:
        page = context.new_page()
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        all_links = {}
        for next_query in queries:
            try:
                output_file = os.path.join(
                    output_dir, f'{next_query.replace(" ", "_")}.csv'
                )
                # Truncate first so a failed query leaves a header-only file
                # instead of stale rows from an earlier run.
                with open(output_file, "w", newline="", encoding="utf-8") as f:
                    f.write("site,url\n")

                # quote_plus turns spaces into "+" and escapes characters such
                # as "&" or "#" that would otherwise corrupt the query string.
                q = quote_plus(next_query)
                page.goto(f"https://duckduckgo.com/?q={q}&hps=1&start=1&ia=web")
                _load_more_results(page)
                links = _collect_links(page)

                # csv.writer quotes fields containing commas, which do occur in URLs.
                with open(output_file, "a", newline="", encoding="utf-8") as f_out:
                    writer = csv.writer(f_out, lineterminator="\n")
                    writer.writerows(_first_url_per_site(links).items())
                all_links[next_query] = links
            except Exception as ex:
                # Keep the browser open: closing it here would make every
                # remaining query fail as well.
                print(ex)
            # Longer pause between queries than between clicks.
            sleep(randint(10, 30))
        with open('search_results.json', 'w') as f:
            json.dump(all_links, f)
    finally:
        # Single cleanup point, reached on success, error, or Ctrl-C.
        context.close()
        browser.close()
# Script entry point: start Playwright, run the scrape, and let the context
# manager shut Playwright down when run() returns or raises.
with sync_playwright() as pw:
    run(pw)