forked from RimoChan/sese-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path文.py
67 lines (58 loc) · 1.75 KB
/
文.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
import posixpath
from urllib.parse import urlparse
from typing import Tuple, List
import lxml.html
from 虫 import 爬
import tldextract
def 缩(url):
    """Return ``"<domain>.<suffix>"`` for *url*.

    The URL is split with ``tldextract.extract``; the ``domain`` and
    ``suffix`` fields of the result are joined with a single dot, so any
    subdomain part is left out.
    """
    parts = tldextract.extract(url)
    return '.'.join((parts.domain, parts.suffix))
# Compiled once at import time; matches any run of whitespace.
_WHITESPACE_RUN = re.compile(r'\s+')


def 摘要(url: str, **d) -> Tuple[str, str, str, List[str]]:
    """Fetch *url* and extract its title, description, text and links.

    Args:
        url: Address of the page to fetch. It is also the base against
            which relative links found in the page are resolved.
        **d: Forwarded unchanged to ``爬``.

    Returns:
        A tuple ``(title, description, text, href)``:

        * ``title`` - whitespace-normalised text of the last ``<title>``
          element encountered, or ``''``.
        * ``description`` - ``content`` of the last
          ``<meta name="description">`` encountered, or ``''``.
        * ``text`` - all collected text fragments joined by single spaces,
          in document order; ``<script>`` and ``<style>`` content is
          excluded.
        * ``href`` - absolute link targets of ``<a>`` elements, fragments
          removed, in document order (duplicates are kept).

        All four values are empty when ``爬`` returns a falsy result.
    """
    # Function-scope import: the module header only imports ``urlparse``.
    from urllib.parse import urljoin

    raw = 爬(url, **d)
    if not raw:
        return '', '', '', []
    root = lxml.html.document_fromstring(raw)
    text = []
    href = []
    title = ''
    description = ''

    def squash(s):
        """Collapse whitespace runs and trim; ``''`` for ``None`` or blank."""
        if not s:
            return ''
        return _WHITESPACE_RUN.sub(' ', s).strip()

    def dfs(r):
        nonlocal title, description
        tag = r.tag
        if not isinstance(tag, str):
            # Comments and processing instructions expose a non-string tag;
            # they contribute neither text nor children.
            return
        if tag in ('script', 'style'):
            return
        if tag == 'meta' and r.attrib.get('name', '').lower() == 'description':
            description = r.attrib.get('content', '')
        if tag == 'a':
            # Drop the fragment, then surrounding whitespace.
            target = (r.attrib.get('href') or '').split('#')[0].strip()
            if target:
                try:
                    scheme = urlparse(target).scheme
                    if scheme == '':
                        # urljoin handles empty and directory base paths,
                        # protocol-relative links and query strings.
                        target = urljoin(url, target)
                except ValueError:
                    # A malformed href must not abort the whole page;
                    # skip the link but still collect the anchor's text.
                    pass
                else:
                    if scheme not in ('', 'http', 'https'):
                        # Existing behaviour kept: anchors such as mailto:
                        # or javascript: are skipped together with their
                        # own text and subtree.
                        return
                    href.append(target)
        own = squash(r.text)
        if own:
            if tag == 'title':
                title = own
            else:
                text.append(own)
        for x in r:
            dfs(x)
            # Text that follows a child's closing tag is stored on the
            # child as ``tail``; collect it here so it is not lost, even
            # when the child itself was skipped.
            trailing = squash(x.tail)
            if trailing:
                text.append(trailing)

    dfs(root)
    return title, description, ' '.join(text), href