#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import re
import htmlentitydefs
import calendar
import urllib2
import shutil
from BeautifulSoup import BeautifulSoup as BS
from BeautifulSoup import BeautifulStoneSoup as BSS
import utils
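
# NOTE: this is a Python 2 script; urllib2, htmlentitydefs, unichr and the
# print statement used below have no direct Python 3 equivalents under
# these names.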
# Abbreviations on Silvestrini's website are in Vatican style
##
# Removes HTML or XML character references and entities from a text string.
# Taken from http://effbot.org/zone/re-sub.htm#unescape-html
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def unescape(text):
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
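
# Quick sanity check: unescape(u'&egrave; &#8217;') yields u'\xe8 \u2019'
# (a named entity followed by a decimal character reference).

# my_generator() walks a tag's direct children, transparently descending
# into nested <p> tags, so callers see one flat stream of leaf nodes.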
def my_generator(piece):
    for thing in piece.childGenerator():
        if 'name' in thing.__dict__ and thing.name == 'p':
            for thing2 in my_generator(thing):
                yield thing2
        else:
            yield thing
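
# interpret_div() extracts one reading from a page section: the scripture
# reference comes from the first <em> tag, while the body text starts at the
# <h3> whose title matches h3_title and stops at the terminator line (e.g.
# 'C: Parola di Dio'). With skip_heading=True the introductory 'Da...'
# heading line is dropped.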
def interpret_div(div, h3_title, terminator, skip_heading=False):
    quote = None
    text = None
    for thing in my_generator(div):
        if 'name' in thing.__dict__:
            if thing.name == 'h3':
                if thing.text.upper().startswith(h3_title):
                    text = ''
            elif thing.name == 'em':
                if quote is None:
                    quote = thing.text
            elif thing.name == 'br':
                if text is not None and text != '':
                    text += '\n\n'
            elif thing.name in ['strong', 'div']:
                pass
            else:
                assert False, "Unknown tag %s" % (thing.name)
        else:
            if terminator is not None and thing.strip().startswith(terminator):
                break
            elif text is not None and thing.strip() != '':
                if text == '' and skip_heading and thing.strip().startswith('Da'):
                    pass
                else:
                    text += thing.strip()
    return quote, text
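
# scrape_file() parses one downloaded day page and returns a list of
# (quote, text) pairs: first reading, responsorial psalm, the second
# reading when present, and the gospel.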
def scrape_file(fhtml):
    text = fhtml.read()
    soup = BS(text, convertEntities=BSS.HTML_ENTITIES)
    div_prima = soup.find('div', id='div_prima_lettura')
    div_salmo = soup.find('div', id='div_salmo')
    div_seconda = soup.find('div', id='div_seconda_lettura')
    div_vangelo = soup.find('div', id='div_vangelo')
    res = []
    res.append(interpret_div(div_prima, 'PRIMA LETTURA', 'C: Parola di Dio', skip_heading=True))
    res.append(interpret_div(div_salmo, 'SALMO RESPONSORIALE', None))
    second = interpret_div(div_seconda, 'SECONDA LETTURA', 'C: Parola di Dio', skip_heading=True)
    if second != (None, None):
        res.append(second)
    res.append(interpret_div(div_vangelo, 'VANGELO', 'C: Parola del Signore'))
    # Some substitutions for known bugs in the input files
    text_substs = [(u'\x96', u'-'),
                   (u'\u2019', u'\''),
                   (u'\xa0', u' '),  # Non-breaking space
                   (u'\x87', u'‡'),
                   (u'á', u'à')]
    quote_substs = [(u'Sal.', u'Sal '),
                    (u'–', u'-')]
    for i in xrange(len(res)):
        quote, text = res[i]
        if quote is not None:
            for tag, value in quote_substs:
                quote = quote.replace(tag, value)
        if text is not None:
            for tag, value in text_substs:
                text = text.replace(tag, value)
            # Strip surrounding whitespace, leaving only a newline at the end
            text = text.strip() + "\n"
        res[i] = quote, text
    return res

DOWNLOAD_TEMPLATE = 'http://liturgia.silvestrini.org/letture/%04d-%02d-%02d.html'
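
# download_month() fetches the page for every day of the given month into a
# local scrape/ directory, skipping files that were already downloaded.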
def download_month(year, month):
    try:
        os.mkdir('scrape')
    except OSError:
        pass
    os.chdir('scrape')
    for day in utils.real_itermonthdays(year, month):
        if day == 0:
            continue
        filename = '%04d-%02d-%02d.html' % (year, month, day)
        # Skip already downloaded files
        if os.path.exists(filename):
            continue
        url = DOWNLOAD_TEMPLATE % (year, month, day)
        res = urllib2.urlopen(url)
        assert res is not None
        # First retrieve the whole page, then save it
        data = res.read()
        with open(filename, 'w') as fout:
            fout.write(data)
        print >> sys.stderr, "Created file %s" % filename
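
# Command-line entry point: expects year and month as arguments, e.g.
#   ./scrape.py 2013 1
# downloads all of January 2013 into scrape/.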
if __name__ == '__main__':
    #with open('test/2013-01-01.html') as fhtml:
    #    scrape_file(fhtml)
    download_month(*map(int, sys.argv[1:]))