-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathParseImageList.py
More file actions
119 lines (105 loc) · 4.42 KB
/
ParseImageList.py
File metadata and controls
119 lines (105 loc) · 4.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python3
from Config import Config
from Download import Download
from datetime import datetime
from lxml import html
from lxml import etree
import re
class ParseImageList:
    """
    Parse the image list of one mission from the server.

    The listing is requested from ``mrf.pl``. Every result row is stored
    through the ``db`` object, together with a per-mission marker naming the
    last stored image, so that an interrupted listing can be continued after
    that image instead of being stored again from the top.
    """

    # Cells 0..7 of a result row are read when an image is stored.
    _MIN_CELLS = 8

    def __init__(self, mission_id=None, scope="both", use_range=1,
                 config=None, db=None):
        """
        Prepare the request for one mission; no network or database access.

        mission_id -- sent to the server as the "MRFList" form field.
        scope      -- sent as the "scope" form field.
        use_range  -- sent as the "UseRanges" form field.
        config     -- configuration object; a new Config() is created when
                      omitted (built per instance, not shared between them).
        db         -- database object. The methods of this class call
                      mission_image_status, get_all_images, find_rest_images,
                      insert_image, update_mission_image_id,
                      update_image_downloaded and update_image_uploaded on it.
        """
        self.mission_id = mission_id
        self.scope = scope
        self.use_range = use_range
        self.images = []
        self.config = Config() if config is None else config
        self.db = db
        # True while there is no resume marker: every row may be stored.
        self.no_need = True
        # Name of the image to resume after, or False when starting fresh.
        self.new_start = False
        self.json_obj = None
        self.base_url = "http://eol.jsc.nasa.gov/SearchPhotos/"
        self.url = self.base_url + "mrf.pl"
        self.post_dict = {"MRFList": self.mission_id,
                          "scope": self.scope,
                          "UseRanges": self.use_range}

    def start_over(self):
        """
        Start from the last position if the image listing was broken in
        between.

        Reads the first row returned by db.mission_image_status(); when there
        is one, its column 8 becomes the resume marker and the "store
        everything" flag is cleared.
        """
        # Query once; the result may be a cursor that is costly to re-run.
        status_rows = list(self.db.mission_image_status())
        if status_rows:
            self.new_start = status_rows[0][8]
            self.no_need = False

    def load(self):
        """
        Load the image list from parsing or loading from database.

        The server is parsed when the database has no images for the mission
        or when a resume marker was found; afterwards self.images is always
        refreshed from db.find_rest_images().
        """
        self.start_over()
        stored = list(self.db.get_all_images(self.mission_id))
        if not stored or self.new_start:
            self.parse_web()
        self.images = self.db.find_rest_images(self.mission_id)

    def parse_date(self, date):
        """
        Parse text date to database date object.

        Accepts "YYYYMMDD"; any "__" placeholder in the text is replaced by
        "01" before parsing (e.g. "201005__" -> 2010-05-01). Anything else,
        including None or text that cannot be parsed, yields the current UTC
        time as a naive datetime.
        """
        from datetime import timezone

        if date is not None:
            text = date.strip()
            candidate = None
            if re.search("^-?[0-9]+$", text):
                candidate = text
            elif re.search(".*__", text):
                candidate = text.replace("__", "01")
            if candidate is not None:
                try:
                    return datetime.strptime(candidate, '%Y%m%d')
                except ValueError:
                    # Not a real YYYYMMDD value: use the default below.
                    pass
        # Same value as the deprecated datetime.utcnow(): naive, in UTC.
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def save_progress_downloaded(self, image_id):
        """Forward to db.update_image_downloaded() and return its result."""
        return self.db.update_image_downloaded(image_id)

    def save_progress_uploaded(self, image_id):
        """Forward to db.update_image_uploaded() and return its result."""
        return self.db.update_image_uploaded(image_id)

    def get_images(self):
        """Return the image list set by the last load()."""
        return self.images

    @staticmethod
    def _redirect_target(script_text):
        """Return the text after the first double quote, split at quotes.

        Returns None when the script has no text or contains no double quote.
        """
        if not script_text:
            return None
        pieces = script_text.split("\"")
        return pieces[1] if len(pieces) > 1 else None

    def parse_web(self):
        """
        Request the listing from the server and store its rows.

        The first response is parsed as XML; each <script> element in it is
        expected to carry a quoted relative URL, which is downloaded and
        searched for the table with id "QueryResults". Rows are stored with
        db.insert_image(); after each stored row the mission marker is set to
        that image's name, and once a page has been walked it is set to "0".

        With a resume marker (self.new_start) rows are skipped up to and
        including the row whose link text equals the marker.

        Raises IndexError when a downloaded page has no "QueryResults" table.
        """
        request = Download(self.url, as_var=True, post_dict=self.post_dict)
        if not request.perform():
            return
        document = etree.fromstring(request.get_result().getvalue())

        # Both flags deliberately survive across pages.
        found_start = False
        can_add = False
        for script in document.iter("script"):
            target = self._redirect_target(script.text)
            if target is None:
                # Script without a quoted target: nothing to follow.
                continue
            listing = Download(self.base_url + target, as_var=True)
            if not listing.perform():
                continue
            page = html.fromstring(listing.get_result().getvalue())
            tables = page.xpath("//table[@id='QueryResults']")
            if not tables:
                raise IndexError(
                    "QueryResults table not found in listing page")
            for row in tables[0].iter("tr"):
                cells = list(row.iter("td"))
                if len(cells) < self._MIN_CELLS:
                    # Header or short row: cells 0..7 are needed below.
                    continue
                link = next(cells[0].iter("a"), None)
                if link is None:
                    continue
                # Evaluated before the marker test so that the marker row
                # itself is not stored a second time.
                if found_start or self.no_need:
                    can_add = True
                if (self.new_start and self.new_start == link.text
                        and not found_start):
                    found_start = True
                if can_add:
                    self.db.insert_image(
                        link.attrib["href"],
                        link.text,
                        self.parse_date(cells[1].text),
                        *(cell.text for cell in cells[2:8]),
                        self.mission_id,
                        False, False)
                    self.db.update_mission_image_id(
                        self.mission_id, link.text)
            # NOTE(review): the position of this call (after the row loop)
            # was reconstructed from a listing without indentation; confirm.
            self.db.update_mission_image_id(self.mission_id, str(0))