-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspider.py
More file actions
78 lines (77 loc) · 2.7 KB
/
spider.py
File metadata and controls
78 lines (77 loc) · 2.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import re
import requests
import xlsxwriter
import string
import threading
import time
from bs4 import BeautifulSoup
from xlrd import open_workbook
# Module-level counter initialised to zero; none of the code visible in this
# part of the file reads or updates it.
times = 0
def error(i):
    """Append a record for a failed question to the error log.

    The record is appended to ``/home/dawson_chen/error.txt``.

    Args:
        i: Identifier of the question that failed; converted with ``str()``.

    Raises:
        OSError: If the log file cannot be opened or written.
    """
    # The ``with`` block closes the file on exit, so no explicit close() call
    # is needed.
    with open('/home/dawson_chen/error.txt', 'a') as f:
        # Terminate each record with a newline; without it successive entries
        # run together on one line and the log is hard to read.
        f.write("error question:" + str(i) + "\n")
def writefile(info):
    """Write the collected rows to ``/home/dawson_chen/spider.xlsx``.

    A new workbook with a single worksheet is created. For every element of
    ``info`` the items at index 0, 1 and 2 are written to columns 1, 2 and 3
    (as passed to ``worksheet.write``) of one row; rows are numbered from 1
    upwards in iteration order. The workbook is closed once all rows are
    written.

    Args:
        info: Iterable of indexable records with at least three items each.
    """
    book = xlsxwriter.Workbook('/home/dawson_chen/spider.xlsx')
    sheet = book.add_worksheet()
    for row_index, record in enumerate(info, start=1):
        # Field k of the record lands in column k + 1.
        for field in range(3):
            sheet.write(row_index, field + 1, record[field])
    book.close()
# Shared accumulator for scraped rows: do_something() appends one
# [question, answer, time] list per page and then passes the list to writefile().
info = []
# Patterns are compiled once here rather than being looked up for every page.
# NOTE(review): in _DATE_RE the '.' before '0' is unescaped, so it matches any
# character, not only a literal dot. Left unchanged because the page's
# timestamp format is not visible from this file - confirm before tightening.
_DATE_RE = re.compile(r'<span class="adop-date">(.*).0</span>')
_SPAN_TEXT_RE = re.compile(r'>(.*?)</span>')
_P_TEXT_RE = re.compile(r'>[ ]{0,}(.*?)[ ]{0,}</p>')
_TITLE_RE = re.compile(r'<title>(.*?)- 点点问税</title>')


def _tags_as_text(soup, div_class, tag_name):
    """Concatenate str() of the ``tag_name`` result list of every div matching ``div_class``."""
    return ''.join(
        str(div.find_all(tag_name))
        for div in soup.find_all('div', attrs=div_class)
    )


def _match_answer(markup):
    """Return the ``</span>``-terminated fragments of ``markup``, or the ``</p>`` ones if there are none."""
    return _SPAN_TEXT_RE.findall(markup) or _P_TEXT_RE.findall(markup)


def do_something(start=1, stop=18091):
    """Scrape a range of question pages and save the results to the spreadsheet.

    For every id in ``range(start, stop)`` the page
    ``https://www.diandianwen.com/ask/initAskDetail/<id>`` is fetched and three
    strings are extracted from it with regular expressions: the question (from
    ``<title>``), the answer text and the "adop-date" timestamp. Each result is
    printed and appended to the module-level ``info`` list as
    ``[question, answer, time]``. Any exception raised while handling a page is
    printed, reported through ``error(i)``, and the loop moves on to the next
    id. After the last id, ``writefile(info)`` is called once.

    NOTE(review): the copy of this file under review had lost its indentation.
    The nesting used here (per-page try/except inside the loop, a single
    ``writefile`` call after the loop) is a reconstruction - confirm it against
    the original file.

    Args:
        start: First question id to fetch (inclusive). Defaults to 1.
        stop: Id at which to stop (exclusive). Defaults to 18091.
    """
    url = 'https://www.diandianwen.com/ask/initAskDetail/'
    headers = {'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'}
    for i in range(start, stop):
        try:
            r = requests.get(url + str(i), timeout=10, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')

            date_markup = str(soup.find_all('span', attrs="adop-date"))
            title_markup = str(soup.find_all('title'))

            # First attempt: <span>s of the adopted answer plus <p>s of the
            # other answers. The original also had a `len(b) == 0` fallback
            # directly after `b += str(...)`; str() of a result list is at
            # least '[]', so (assuming it was nested in the loop) that branch
            # could never run and has been dropped.
            answer = _match_answer(
                _tags_as_text(soup, "adop-answer", 'span')
                + _tags_as_text(soup, "answ-text", 'p')
            )
            if not answer:
                # Second attempt: only the <p>s of the adopted answer.
                answer = _match_answer(_tags_as_text(soup, "adop-answer", 'p'))

            tim = ''.join(_DATE_RE.findall(date_markup))
            answer = ''.join(answer)
            question = ''.join(_TITLE_RE.findall(title_markup))

            print("问题:"+str(i))
            print(tim)
            print(question)
            print(answer)
            # Appending mutates the module-level list in place, so no
            # `global` declaration is required.
            info.append([question, answer, tim])
        except Exception as e:
            # Deliberate best-effort: a failing page is logged and skipped so
            # one bad page does not abort the whole crawl.
            print(e)
            print("error question:"+str(i))
            error(i)
    writefile(info)
# Module-level call: the crawl starts as soon as this file is run or imported.
do_something()