-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathcrawlUsers.py
More file actions
66 lines (50 loc) · 2.01 KB
/
crawlUsers.py
File metadata and controls
66 lines (50 loc) · 2.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from seleniumCrawler import SeleniumCrawler
import json
import sys
import os
from pyvirtualdisplay import Display
import time
# Number of usernames handed to a single browser session.
BATCH_SIZE = 200
# Directory holding one JSON fragment per completed batch.
CRAWL_DIR = "UserCrawlDir"
# Cache of the username -> occurrence-count map extracted from a link file.
USERNAME_CACHE = "usernames.json"


def count_usernames(lines):
    """Count how often each username occurs in the lines of a link file.

    The first line is skipped (treated as a header).  Every other line is
    split on '|' and its fifth field is taken as the username.  Lines with
    fewer than five fields (e.g. a trailing blank line) are ignored instead
    of raising IndexError.

    Args:
        lines: list of text lines, header first.

    Returns:
        dict mapping username -> number of occurrences.
    """
    counts = {}
    for line in lines[1:]:
        fields = line.split('|')
        if len(fields) < 5:
            continue
        name = fields[4]
        counts[name] = counts.get(name, 0) + 1
    return counts


def batches(items, size=BATCH_SIZE):
    """Yield consecutive slices of `items`, each at most `size` long.

    Unlike zipping two stepped ranges, this also yields the final
    (possibly shorter) slice, so no item is ever left out.
    """
    for start in range(0, len(items), size):
        yield items[start:start + size]


def load_crawled_users(crawl_dir=CRAWL_DIR):
    """Return the set of usernames found in the fragments under `crawl_dir`.

    Every file in the directory is parsed as JSON and its top-level keys are
    collected (presumably usernames, since the fragments are dumps of
    getUserInfo() results -- confirm against SeleniumCrawler).  A missing
    directory yields an empty set.
    """
    crawled = set()
    if not os.path.isdir(crawl_dir):
        return crawled
    for fragment in os.listdir(crawl_dir):
        with open(os.path.join(crawl_dir, fragment), 'r') as f:
            crawled.update(json.load(f).keys())
    return crawled


def load_usernames(argv):
    """Build the username map from the link file, or fall back to the cache.

    When exactly one command-line argument is given and it names an existing
    file, usernames are extracted from it and written to USERNAME_CACHE.
    In every other case the previously written USERNAME_CACHE is loaded, so
    the result is always defined (or an IOError reports the missing cache).
    """
    linkfile = argv[1] if len(argv) == 2 else None
    if linkfile is not None:
        print("Extracting userdata for " + linkfile)
    if linkfile is not None and os.path.exists(linkfile):
        print("extracting ... ")
        with open(linkfile, 'r') as f:
            usernames = count_usernames(f.readlines())
        with open(USERNAME_CACHE, 'w') as f:
            json.dump(usernames, f)
        return usernames
    with open(USERNAME_CACHE, 'r') as f:
        return json.load(f)


def main(argv=None):
    """Crawl user info for every username that has not been crawled yet.

    Names already present in a fragment under CRAWL_DIR are skipped.  The
    remaining names are processed in batches of BATCH_SIZE; each batch gets
    its own virtual display and SeleniumCrawler, and its result is saved as
    a timestamped JSON fragment in CRAWL_DIR.

    Args:
        argv: argument vector; defaults to sys.argv.
    """
    if argv is None:
        argv = sys.argv

    usernames = load_usernames(argv)
    print("found %d usernames to look for" % len(usernames))

    # For a quick manual run, `pending` can be replaced by a short list such
    # as ['sagarjoglekar', 'realdonaldtrump', 'mad_astronaut', 'billnye', 'iamsrk'].
    crawled = load_crawled_users()
    pending = [name for name in usernames if name not in crawled]
    print("Starting crawl of %d usernames" % len(pending))

    # Fragments are written below, so the directory must exist on a first run.
    if pending and not os.path.isdir(CRAWL_DIR):
        os.makedirs(CRAWL_DIR)

    for batch in batches(pending):
        print("Creating selenium object")
        display = Display(visible=0, size=(800, 600))
        display.start()
        try:
            searchObj = SeleniumCrawler("sagarConfig.config")
            try:
                data = searchObj.getUserInfo(batch)
            finally:
                # Always release the browser, even if the crawl failed.
                print("Killing chrome driver and selenium")
                searchObj.killBrowser()
        finally:
            display.stop()

        fragment = os.path.join(
            CRAWL_DIR, "fragment_" + str(int(time.time())) + "_userstats.json")
        with open(fragment, 'w') as f:
            json.dump(data, f)
        print("Saved User Stats Fragment!!")


if __name__ == "__main__":
    main()