-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcommon.py
79 lines (72 loc) · 2.1 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Create the text files containing training and test image
# names and their classes.
import fileinput
import urllib
import os
import bs4
import re
def get_avg(cells):
    """Return the weighted mean encoded in one split row.

    Fields ``cells[2]`` .. ``cells[11]`` are parsed as floats and treated as
    counts for the scores 1 .. 10 respectively (field ``i`` carries weight
    ``i - 1``).  Presumably these are the per-score vote counts of an
    AVA.txt row -- confirm against the dataset description.

    Args:
        cells: sequence of at least 12 string (or numeric) fields.

    Returns:
        ``sum(score * count) / sum(count)`` as a float.

    Raises:
        IndexError: if ``cells`` has fewer than 12 fields.
        ValueError: if one of the ten count fields is not numeric.
        ZeroDivisionError: if all ten counts are zero.
    """
    # Index explicitly (rather than slicing) so a short row still fails
    # loudly with IndexError instead of being averaged over fewer fields.
    counts = [float(cells[i]) for i in range(2, 12)]
    weighted_total = sum(score * n for score, n in enumerate(counts, 1))
    return weighted_total / sum(counts)
def get_high_list(percent, path="AVA_dataset/AVA.txt"):
    """Return the highest-scoring fraction of the rows in the AVA file.

    Every non-blank line of ``path`` is split on whitespace and scored with
    ``get_avg``; the rows are then sorted by that score.

    Args:
        percent: fraction of the rows to keep (e.g. ``0.1`` keeps
            ``int(0.1 * number_of_rows)`` rows).
        path: file to read; defaults to the location the function has
            always used.

    Returns:
        A list of ``[line, avg]`` pairs in ascending ``avg`` order, where
        ``line`` is the unmodified text line (trailing newline included).
        The list is empty when the requested fraction rounds down to zero
        rows or ``percent`` is not positive.
    """
    # ``with`` closes the file even if a malformed row makes get_avg raise.
    with open(path) as f_in:
        # Blank lines carry no row and would make get_avg fail on cells[2].
        rows = [[line, get_avg(line.split())] for line in f_in if line.strip()]
    rows.sort(key=lambda row: row[1])

    keep = int(percent * len(rows))
    if keep <= 0:
        # rows[-0:] is the *whole* list, so the zero case must be handled
        # explicitly or a tiny percent would return every row.
        return []
    return rows[-keep:]
def get_low_list(percent, path="AVA_dataset/AVA.txt"):
    """Return the lowest-scoring fraction of the rows in the AVA file.

    Every non-blank line of ``path`` is split on whitespace and scored with
    ``get_avg``; the rows are then sorted by that score.

    Args:
        percent: fraction of the rows to keep (e.g. ``0.1`` keeps
            ``int(0.1 * number_of_rows)`` rows).
        path: file to read; defaults to the location the function has
            always used.

    Returns:
        A list of ``[line, avg]`` pairs in ascending ``avg`` order, where
        ``line`` is the unmodified text line (trailing newline included).
        The list is empty when the requested fraction rounds down to zero
        rows or ``percent`` is not positive.
    """
    # ``with`` closes the file even if a malformed row makes get_avg raise.
    with open(path) as f_in:
        # Blank lines carry no row and would make get_avg fail on cells[2].
        rows = [[line, get_avg(line.split())] for line in f_in if line.strip()]
    rows.sort(key=lambda row: row[1])

    # Clamp at zero: a negative slice bound would mean "all but the last k",
    # which is never what a negative fraction should select.
    keep = max(0, int(percent * len(rows)))
    return rows[:keep]
def download_image(line):
    """Download the photo referenced by one row of the AVA file.

    The first two whitespace-separated fields of ``line`` are used as
    ``<index>`` and ``<image id>``.  The image page for that id is fetched
    from dpchallenge.com, and for each ``<td>`` whose first CSS class is
    ``page-image`` the ``src`` of its second ``<img>`` is saved to
    ``../ava/<index>-<image id>.jpg``.

    Nothing is fetched when the target file already exists.  If the page
    contains no matching image the function returns without saving anything.

    Args:
        line: one text row; only its first two fields are read.

    Returns:
        None.

    Raises:
        IndexError: if ``line`` has fewer than two fields.
    """
    # urlopen/urlretrieve live in urllib.request on Python 3 and directly in
    # urllib on Python 2; import locally so either interpreter works.
    try:
        from urllib.request import urlopen, urlretrieve
    except ImportError:
        from urllib import urlopen, urlretrieve

    cells = line.split()
    index, image_id = cells[0], cells[1]
    target = "../ava/" + index + "-" + image_id + ".jpg"
    if os.path.exists(target):
        print(image_id + " already downloaded")
        return

    url = "http://www.dpchallenge.com/image.php?IMAGE_ID=" + image_id
    response = urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(page, "html.parser")
    for cell in soup.find_all('td'):
        classes = cell.get('class')
        if not classes or classes[0] != "page-image":
            continue
        imgs = cell.find_all('img')
        if len(imgs) < 2:
            # The photo is expected to be the second <img>; skip cells that
            # do not have one instead of failing with IndexError.
            continue
        src = imgs[1].get('src')
        if not src:
            continue
        print("Current src is " + src)
        urlretrieve(src, target)
        print(index + ": Photo " + image_id + " downloaded")
def download_image_by_filename(fileName):
    """Download the photo that belongs to a ``../ava/`` file name.

    The second run of digits in ``fileName`` is used as the image id (the
    first run is only echoed in the progress message).  The image page for
    that id is fetched from dpchallenge.com, and every ``<img ... src="...">``
    URL that contains both ``Copyrighted`` and the image id is saved to
    ``../ava/<fileName>``.

    Nothing is fetched when the target file already exists.

    Args:
        fileName: name of the target file inside ``../ava/``; must contain
            at least two digit runs.

    Returns:
        None.

    Raises:
        IndexError: if ``fileName`` contains fewer than two digit runs and
            the target file does not exist yet.
    """
    # urlopen/urlretrieve live in urllib.request on Python 3 and directly in
    # urllib on Python 2; import locally so either interpreter works.
    try:
        from urllib.request import urlopen, urlretrieve
    except ImportError:
        from urllib import urlopen, urlretrieve

    numbers = re.findall(r"\d+", fileName)
    target = "../ava/" + fileName
    if os.path.exists(target):
        print(fileName + " already downloaded")
        return

    image_id = numbers[1]
    url = "http://www.dpchallenge.com/image.php?IMAGE_ID=" + image_id
    print("url is " + url)
    response = urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()
    if not isinstance(page, str):
        # Python 3 returns bytes; the str regex below needs text.  The URLs
        # being searched for are ASCII, so undecodable bytes can be replaced.
        page = page.decode("utf-8", "replace")

    imgurls = re.findall(r'img .*?src="(.*?)"', page)
    print(imgurls)
    for imgurl in imgurls:
        if "Copyrighted" in imgurl and image_id in imgurl:
            urlretrieve(imgurl, target)
            print(numbers[0] + ": Photo " + image_id + " downloaded")