TabDownloader.py
import hashlib
import urllib2
import requests
from bs4 import BeautifulSoup
from os.path import join
import re
import ast
import time
from TabSpider import TabSpider
from TabData import TabData
from User import User
from Comment import Comment


class TabDownloader:
    def __init__(self, domain, tab_page, salt, delay, tab_dir=""):
        self.domain = domain
        self.tab_page = tab_page
        self.salt = salt
        self.tab_dir = tab_dir
        self.last_time = 0
        self.delay = delay

    def process_comment(self, comment):
        result = Comment()
        # Get content
        result.content = comment.find("div", attrs={"class": "comment_content"}).text.strip()
        # Get rating
        result.rating = int(comment.find("ins").text.replace("+", ""))
        # Get author
        result.author = comment.find("a", attrs={"href": re.compile("profile")}).text.strip()
        # Get date
        result.date = comment.find("span", attrs={"class": "gray1"}).text.split("wrote on ")[-1].strip()
        return result

    def tab_download(self, request_url, get_comments=False):
        print "Getting info on tab " + request_url[31:]
        # TabData object that will be returned at the end
        tab_data = TabData()
        # Load page
        self.wait()
        request = requests.get(request_url)
        soup = BeautifulSoup(request.text)
        # Check if the tab has been removed
        if len(soup.find_all(class_="t_blocked")) > 0:
            return None
        # Find session ID
        session_id = soup.find("input", attrs={"name": "session_id"})
        if session_id:
            session = session_id.get("value")
        else:
            return None
        # Find tab ID
        tab_id = soup.find("input", attrs={"name": "tab_id"}).get("value")
        # Get the token hash
        sha1 = hashlib.sha1(self.salt + tab_id).hexdigest()
        try:
            # Get the TAB file
            file_url = "http://www." + self.domain + "/" + self.tab_page + "?tab_id=%s&session_id=%s&token=%s" % (tab_id, session, sha1)
            tab_file = urllib2.urlopen(file_url)
            # The URL does not contain the filename, so read it from the response headers
            filename = tab_file.headers['Content-Disposition'].split("=")[-1]
            # Write it to disk
            local_file = open(join(self.tab_dir, filename), "wb")
            local_file.write(tab_file.read())
            local_file.close()
            tab_data.tab_file = join(self.tab_dir, filename)
        except urllib2.HTTPError, e:
            print "HTTP Error:", e.code, file_url
        except urllib2.URLError, e:
            print "URL Error:", e.reason, file_url
        # Get title and version
        tab_info = ast.literal_eval("{" + soup(text=re.compile("'name': '.+?',"))[0].split("= {")[-1].split(";")[0])
        tab_data.title = tab_info["name"]
        tab_data.version = tab_info["version"]
        # Get the tab contributor; if there is none, the instruments field shifts down by one index
        shift = 1
        tab_data.tabber = None
        t_dtde = soup.find_all("div", attrs={"class": "t_dtde"})
        if len(t_dtde) == 3:
            shift = 0
            tab_data.tabber = t_dtde[0].text.strip()
        if len(t_dtde) > 1:
            # Get instruments
            tab_data.instruments = t_dtde[2 - shift].text.strip().split(",")
        # Get rating
        tab_data.rating = len(soup.find_all("a", attrs={"class": "cur"}))
        # Get number of ratings (empty string if there are none)
        tab_data.num_ratings = soup.find("div", attrs={"class": "v_c"}).text.strip().split("x ")[-1]
        if not tab_data.num_ratings:
            tab_data.num_ratings = 0
        # Get number of views
        tab_data.num_views = soup.find("div", attrs={"class": "stats"}).text.strip().split(" views")[0].replace(",", "")
        # Get number of comments ("no" when there are none)
        tab_data.num_comments = soup.find("span", attrs={"class": "t_cbl"}).text.strip().split(" comments")[0]
        if tab_data.num_comments == "no":
            tab_data.num_comments = 0
        # Get comment content
        if get_comments:
            comment_url = "http://tabs.%s/comment/?tabid=%s" % (self.domain, tab_id)
            self.wait()
            request = requests.get(comment_url)
            soup2 = BeautifulSoup(request.text.strip())
            # Comments are flat in the HTML, but nested according to the "level" encoded in
            # their class. A stack keeps track of parent levels so subcomments are attached
            # to the right parent; recursion would be natural, but the flat structure rules it out.
            comment_level = 0
            current_level_comments = []
            stack = []
            for comment in soup2.find_all("div", class_=re.compile("lev")):
                new_level = int(comment.get("class")[2].split("lev")[-1])
                if new_level > comment_level:
                    # Deeper level: push the current list and start a child list
                    stack.append(current_level_comments)
                    current_level_comments = []
                    current_level_comments.append(self.process_comment(comment))
                    comment_level = new_level
                elif new_level < comment_level:
                    # Shallower level: popping may drop multiple levels at once
                    while comment_level != new_level:
                        popped = stack.pop()
                        popped[-1].child_comments = current_level_comments
                        current_level_comments = popped
                        comment_level -= 1
                    current_level_comments.append(self.process_comment(comment))
                else:
                    current_level_comments.append(self.process_comment(comment))
            # Attach any still-open levels until the stack is empty
            while stack:
                popped = stack.pop()
                popped[-1].child_comments = current_level_comments
                current_level_comments = popped
            tab_data.comments = current_level_comments
        return tab_data

    def wait(self):
        # Busy-wait until the configured delay (in milliseconds) has elapsed since the last request
        while self.last_time > time.time():
            pass
        self.last_time = time.time() + (self.delay / 1000.0)
        return
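

# A minimal usage sketch, not part of the original module: the domain, download endpoint,
# salt, delay, and tab URL below are placeholder assumptions, since the real values depend
# on the site being scraped and on how its download token is salted.
if __name__ == "__main__":
    downloader = TabDownloader(domain="example.com",         # hypothetical tab site
                               tab_page="tab_download.php",  # hypothetical download endpoint
                               salt="SECRET_SALT",           # hypothetical token salt
                               delay=500,                    # wait 500 ms between requests
                               tab_dir="tabs/")              # directory for downloaded tab files
    # Fetch one tab page (placeholder URL) along with its comment tree
    tab = downloader.tab_download("http://tabs.example.com/tab/example-song-12345", get_comments=True)
    if tab is not None:
        print tab.title, tab.version, tab.num_views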