-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrape-erowid.py
executable file
·87 lines (66 loc) · 2.42 KB
/
scrape-erowid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# Fetch random Erowid experience reports and save them into the "text/" folder.
# Runs forever until you hit control-C to quit it
from __future__ import division
import os, sys, time, urllib2, random, re, glob
#-------------------------------------------------------------------------------------------
# HELPERS AND SETUP
def writeFile(fn, data):
    """Write data to the file at path fn, replacing any existing content.

    fn   -- path of the file to create or overwrite
    data -- string to write

    The file is opened with the built-in open() inside a ``with`` block so the
    handle is closed even when the write raises.
    """
    with open(fn, 'w') as f:
        f.write(data)
def removeHTML(s):
    """Return s with short HTML-looking tags deleted.

    A "tag" is a '<', an optional '/', then 1 to 30 characters (none of them a
    newline) up to the nearest '>'.  Anything longer, or spanning a line break,
    is left in place.
    """
    # One pattern today; kept as a tuple so more can be added later.
    tagPatterns = ('</?.{1,30}?>',)
    cleaned = s
    for pattern in tagPatterns:
        cleaned = re.compile(pattern).sub('', cleaned)
    return cleaned
# Folder the reports are saved into.  The trailing slash matters: filenames
# are built by plain string concatenation onto this value.
dir = 'text/'

# Upper bound (inclusive) of the report IDs that get sampled at random.
maxID = 67000

# URL template for a single report; %s is filled in with the report ID.
baseurl = 'http://www.erowid.org/experiences/exp.php?ID=%s'

# Create the output folder the first time the script runs.
if not os.path.exists(dir):
    os.mkdir(dir)
#-------------------------------------------------------------------------------------------
# PAGE-PARSING HELPERS
def _section(text, startMarker, endMarker=None):
    """Return the chunk of text that follows the first startMarker.

    The chunk ends at the next startMarker (same as ``text.split(startMarker)[1]``)
    and, when endMarker is given, is cut again at the first endMarker.
    Returns None when startMarker does not occur, so the caller can skip the
    page instead of dying on an IndexError.
    """
    pieces = text.split(startMarker)
    if len(pieces) < 2:
        return None
    chunk = pieces[1]
    if endMarker is not None:
        chunk = chunk.split(endMarker)[0]
    return chunk


def _substanceSlugs(dosechart):
    """Return the sorted, de-duplicated substance names found in a dose chart.

    Each chart line containing ``<td><a `` and an ``href='...'`` contributes
    the second-to-last '/'-separated piece of its link target.  Lines without
    a usable link are ignored.
    """
    slugs = set()
    for line in dosechart.splitlines():
        if '<td><a ' not in line or "href='" not in line:
            continue
        href = line.split("href='")[1].split("'")[0]
        parts = href.split('/')
        # Need at least ".../<slug>/<something>" to have a second-to-last piece.
        if len(parts) >= 2 and parts[-2]:
            slugs.add(parts[-2])
    return sorted(slugs)


# Byte sequences replaced in the report body before the HTML is stripped:
# carriage returns are dropped and Windows-1252 "smart" punctuation bytes are
# mapped to plain ASCII.
_BODY_REPLACEMENTS = (
    ('\r', ''),
    ('\x92', "'"),
    ('\x93', '"'),
    ('\x94', '"'),
    ('\x97', ' -- '),
)

# Seconds to wait on the server before giving up on one report.
_FETCH_TIMEOUT = 30

#-------------------------------------------------------------------------------------------
# MAIN
# NOTE: every print below is a single-argument print(...), which behaves exactly
# like the print statement under Python 2 (the version this script targets).
print('---------------------------------------------------------------------------------\\')
print('')
print('Downloading random Erowid experience reports')
print('')
print('Hit control-C to quit')
print('')
while True:
    print('-----')
    # Be polite to the server: at most one request per second.
    time.sleep(1)
    reportId = random.randint(1, maxID)
    url = baseurl % reportId

    # Saved files are named "<dir>erowid <id>" optionally followed by
    # " <substance> ...", so look for exactly that.  The space before the
    # wildcard keeps ID 12 from matching a saved report 123.
    savedPrefix = '%serowid %s' % (dir, reportId)
    if os.path.exists(savedPrefix) or glob.glob(savedPrefix + ' *'):
        print('We already downloaded that one. Skipping.')
        continue

    print('Fetching url: %s' % url)
    try:
        response = urllib2.urlopen(url, timeout=_FETCH_TIMEOUT)
        try:
            page = response.read()
        finally:
            response.close()
    except IOError as err:
        # urllib2.URLError/HTTPError and socket errors/timeouts all derive from
        # IOError in Python 2; one bad request must not end the whole run.
        print('Download failed (%s). Skipping.' % err)
        continue

    if 'Unable to read experience ' in page:
        print('No report at that ID number.')
        continue

    title = _section(page, 'class="title">', '</div>')
    body = _section(page, 'Start Body -->', '<!-- End Body')
    if title is None or body is None:
        print('Unexpected page layout. Skipping.')
        continue

    # find the list of substances (a page without a dose chart simply has none)
    dosechart = _section(page, 'DoseChart')
    substances = _substanceSlugs(dosechart) if dosechart is not None else []

    fn = dir + 'erowid ' + ' '.join([str(reportId)] + substances)
    print('Substances: %s' % substances)
    print('Filename: %s' % fn)
    print('Title: %s' % title)

    # get main text, normalise the punctuation bytes, then strip the markup
    for old, new in _BODY_REPLACEMENTS:
        body = body.replace(old, new)
    body = removeHTML(body).strip()
    writeFile(fn, title + '\n\n' + body + '\n')