-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbook_info.py
51 lines (43 loc) · 1.79 KB
/
book_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Scrape the book information on each page: book title, Amazon purchase link, image link, and category
import sqlite3
import requests
from bs4 import BeautifulSoup
import time
import random
# Scrape every page listed in the `link` table of book_page.db and store one
# row per book (title, Amazon link, image link, category) in the `books` table.
#
# Each page is handled as a unit: its rows are parsed first and written in a
# single transaction, so a page that fails half-way leaves nothing behind.
# Failures are reported and the run moves on to the next page.
conn = sqlite3.connect('book_page.db')
try:
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS books
    (book_name TEXT, amazon_link TEXT, pic_link TEXT, cate TEXT)''')
    # Persist the schema before scraping starts.
    conn.commit()

    # Column 1 of each `link` row is used as the page URL.
    links = cursor.execute("SELECT * FROM link").fetchall()
    # Number of pages still to process; no second COUNT(*) query is needed.
    total = len(links)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    start = time.time()
    for link in links:
        url = link[1]
        # A page counts as processed whether it succeeds or fails, so the
        # "remaining" figure printed below stays accurate in both cases.
        total -= 1
        try:
            # Without a timeout a single stalled server would hang the run.
            req = requests.get(url=url, headers=headers, timeout=30)
            # Random pause of up to one second between requests.
            time.sleep(random.random())
            # Do not parse HTTP error pages as if they were book listings.
            req.raise_for_status()

            # The sample URLs look like https://www.doradolist.com/<category>.html:
            # dropping the first 27 characters (the length of
            # "https://www.doradolist.com/") and the last 5 (".html") leaves
            # the category slug. NOTE(review): assumes every URL in `link`
            # has exactly that prefix and suffix - confirm.
            cate = url[27:-5]
            req.encoding = 'utf-8'
            html = req.text.replace('<br>', ' ')
            soup = BeautifulSoup(html, 'lxml')

            # Build all rows before touching the database so a malformed
            # element cannot leave a partially inserted page.
            rows = [
                (el.text.lstrip(), el.myframe.a['href'], el.myframe.a.img['data-src'], cate)
                for el in soup.find_all('div1')
            ]
            cursor.executemany('''INSERT INTO books VALUES (?, ?, ?, ?)''', rows)
            conn.commit()  # commit once per page
            print(f"{url} finished, 剩下 {total} 项")
        except Exception as e:
            # Best-effort scrape: discard anything pending for this page,
            # report the failure instead of hiding it, and keep going.
            conn.rollback()
            print(f"{url} failed: {e!r}, 剩下 {total} 项")

    # Sample output from a previous run ("剩下 N 项" = "N items remaining",
    # "总用时" = "total elapsed time", in seconds):
    # ...
    # https://www.doradolist.com/technology-and-society.html finished, 剩下 3 项
    # https://www.doradolist.com/wireless.html finished, 剩下 2 项
    # https://www.doradolist.com/others.html finished, 剩下 1 项
    # https://www.doradolist.com/nontechnical.html finished, 剩下 0 项
    # 总用时:869.1676423549652
    end = time.time()
    print(f"总用时:{end - start}")
finally:
    # Always release the database handle, even if the run aborts early.
    conn.close()