|
| 1 | +#! /usr/bin/env python |
| 2 | + |
| 3 | +# Copyright (c) 2011 Victor Terron. All rights reserved. |
| 4 | +# Institute of Astrophysics of Andalusia, IAA-CSIC |
| 5 | +# |
| 6 | +# This file is part of phone. |
| 7 | +# |
| 8 | +# phone is free software: you can redistribute it and/or modify |
| 9 | +# it under the terms of the GNU General Public License as published by |
| 10 | +# the Free Software Foundation, either version 3 of the License, or |
| 11 | +# (at your option) any later version. |
| 12 | +# |
| 13 | +# This program is distributed in the hope that it will be useful, |
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 16 | +# GNU General Public License for more details. |
| 17 | +# |
| 18 | +# You should have received a copy of the GNU General Public License |
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 20 | + |
| 21 | +import re |
| 22 | +import sys |
| 23 | +import urllib |
| 24 | +import optparse |
| 25 | +import HTMLParser |
| 26 | +import unicodedata |
| 27 | +import htmlentitydefs |
| 28 | + |
| 29 | + |
class Scraper(HTMLParser.HTMLParser):
    """HTML parser that prints every link found inside an <h4> element.

    Each reported link is written to standard output as ``text (href)``,
    where ``text`` is the character data collected between the opening
    and the closing <a> tags.
    """

    # Class-level defaults; instances shadow them as tags are parsed.
    in_h4 = False    # True between <h4> and </h4>
    in_link = False  # True between <a href=...> and </a>

    def handle_starttag(self, tag, attrs):
        """Record that an <h4> or an <a> carrying an href has been opened."""
        attributes = dict(attrs)
        if tag == 'h4':
            self.in_h4 = True
        elif tag == 'a' and 'href' in attributes:
            # A new link starts: remember its target and reset its text.
            self.url = attributes['href']
            self.chunks = []
            self.in_link = True

    def handle_data(self, data):
        """Collect the text that appears inside the link being parsed."""
        if not self.in_link:
            return
        self.chunks.append(data)

    def handle_endtag(self, tag):
        """Print the link when an <a> is closed while inside an <h4>."""
        if tag == 'h4':
            self.in_h4 = False
            return
        if tag != 'a':
            return
        if self.in_link and self.in_h4:
            print('%s (%s)' % (''.join(self.chunks), self.url))
        # The link is over whether or not it was reported.
        self.in_link = False
| 53 | + |
| 54 | + |
| 55 | + |
| 56 | +# http://stackoverflow.com/questions/930303/python-string-cleanup-manipulation-accented-characters |
# Latin-1 code point -> 7-bit ASCII replacement. Built once at import time
# instead of on every call to remove_accents().
_LATIN1_TO_ASCII = {
    0xc0:'A', 0xc1:'A', 0xc2:'A', 0xc3:'A', 0xc4:'A', 0xc5:'A',
    0xc6:'Ae', 0xc7:'C',
    0xc8:'E', 0xc9:'E', 0xca:'E', 0xcb:'E',
    0xcc:'I', 0xcd:'I', 0xce:'I', 0xcf:'I',
    0xd0:'Th', 0xd1:'N',
    0xd2:'O', 0xd3:'O', 0xd4:'O', 0xd5:'O', 0xd6:'O', 0xd8:'O',
    0xd9:'U', 0xda:'U', 0xdb:'U', 0xdc:'U',
    0xdd:'Y', 0xde:'th', 0xdf:'ss',
    0xe0:'a', 0xe1:'a', 0xe2:'a', 0xe3:'a', 0xe4:'a', 0xe5:'a',
    0xe6:'ae', 0xe7:'c',
    0xe8:'e', 0xe9:'e', 0xea:'e', 0xeb:'e',
    0xec:'i', 0xed:'i', 0xee:'i', 0xef:'i',
    0xf0:'th', 0xf1:'n',
    0xf2:'o', 0xf3:'o', 0xf4:'o', 0xf5:'o', 0xf6:'o', 0xf8:'o',
    0xf9:'u', 0xfa:'u', 0xfb:'u', 0xfc:'u',
    0xfd:'y', 0xfe:'th', 0xff:'y',
    0xa1:'!', 0xa2:'{cent}', 0xa3:'{pound}', 0xa4:'{currency}',
    0xa5:'{yen}', 0xa6:'|', 0xa7:'{section}', 0xa8:'{umlaut}',
    0xa9:'{C}', 0xaa:'{^a}', 0xab:'<<', 0xac:'{not}',
    0xad:'-', 0xae:'{R}', 0xaf:'_', 0xb0:'{degrees}',
    0xb1:'{+/-}', 0xb2:'{^2}', 0xb3:'{^3}', 0xb4:"'",
    0xb5:'{micro}', 0xb6:'{paragraph}', 0xb7:'*', 0xb8:'{cedilla}',
    0xb9:'{^1}', 0xba:'{^o}', 0xbb:'>>',
    0xbc:'{1/4}', 0xbd:'{1/2}', 0xbe:'{3/4}', 0xbf:'?',
    0xd7:'*', 0xf7:'/',
}


def remove_accents(unicrap):
    """Replace Unicode Latin-1 characters with a 7-bit ASCII equivalent.

    All characters in the standard 7-bit ASCII range are preserved. In the
    8th bit range the Latin-1 accented letters are stripped of their
    accents and most symbol characters are converted to something
    meaningful. Anything else (any code point >= 0x80 that is not in the
    translation table) is deleted.

    Args:
        unicrap: the text to clean up, iterated character by character.

    Returns:
        The translated text; an empty string for empty input.
    """
    pieces = []
    for char in unicrap:
        code = ord(char)
        if code in _LATIN1_TO_ASCII:
            pieces.append(_LATIN1_TO_ASCII[code])
        elif code < 0x80:
            # Plain ASCII is kept untouched.
            pieces.append(char)
        # Unmapped non-ASCII characters are silently dropped.
    return ''.join(pieces)
| 100 | + |
| 101 | +# http://farmdev.com/talks/unicode/ |
def to_unicode_or_bust(obj, encoding='utf-8'):
    """Return *obj* as a unicode object when it is a byte string.

    Byte strings are decoded with *encoding*; unicode strings and
    objects that are not strings at all are returned unchanged.
    """
    needs_decoding = isinstance(obj, basestring) and not isinstance(obj, unicode)
    if needs_decoding:
        return unicode(obj, encoding)
    return obj
| 107 | + |
| 108 | + |
def unescape(s):
    """Replace named HTML entities in *s* with the characters they denote.

    Only the entity names listed in htmlentitydefs.name2codepoint are
    recognised; any other text is left as it is. Recipe taken from
    http://wiki.python.org/moin/EscapingHtml
    """
    codepoints = htmlentitydefs.name2codepoint
    entity_pattern = '&(%s);' % '|'.join(codepoints)

    def _entity_to_char(match):
        # Group 1 holds the entity name, without the '&' and the ';'.
        return unichr(codepoints[match.group(1)])

    return re.sub(entity_pattern, _entity_to_char, s)
| 113 | + |
| 114 | + |
class Person(object):
    """A directory entry: name, department, position, phone and e-mail.

    The constructor receives byte strings, decodes them with *encoding*
    and, for every field but the phone, replaces the named HTML entities
    they contain.
    """

    def __init__(self, name, department, position, phone, email, encoding = 'iso-8859-1'):

        def _text(raw):
            # Decode the raw bytes, then resolve entities such as &aacute;
            return unescape(raw.decode(encoding))

        self.name = _text(name)
        self.department = _text(department)
        self.position = _text(position)
        # The phone is only decoded; it is not passed through unescape().
        self.phone = phone.decode(encoding)
        self.email = _text(email)

    @property
    def name_wo_accents(self):
        """The name with its accents removed by remove_accents()."""
        return remove_accents(self.name)
| 126 | + |
| 127 | + |
class Guest(Person):
    """A visitor: a Person with institution, dates and contact person.

    Guests are created with an empty department and an empty phone.
    """

    def __init__(self, name, institution, dates, position, contact, email, encoding = 'iso-8859-1'):
        # Department and phone are not available for guests.
        super(Guest, self).__init__(name, '', position, '', email, encoding = encoding)

        def _text(raw):
            # Same treatment Person gives to its textual fields.
            return unescape(raw.decode(encoding))

        self.institution = _text(institution)
        self.dates = _text(dates)
        self.contact = _text(contact)
| 135 | + |
def alphabetical_query():
    """Scrape the alphabetical staff listing of the website.

    Downloads http://www.iaa.es/personal/general.html and builds one
    Person for each table row matched by the regular expression below.

    Returns:
        A list of Person objects, in the order in which the rows were
        matched in the page.
    """
    # Close the connection explicitly instead of leaving it open until
    # the response object happens to be garbage-collected.
    response = urllib.urlopen('http://www.iaa.es/personal/general.html')
    try:
        alphabetical = response.read()
    finally:
        response.close()

    # Captured groups, in order: phone, optional opening <a ...> tag, name,
    # optional closing </a> tag, department, position and e-mail.
    regexp = re.compile("<td nowrap> <b>([0-9]*)</b> </td><td nowrap>(<a .*?>)?(.*?)(</a>)?</td>"
                        "<td nowrap>(.*?)</td><td nowrap>(.*?)</td><td nowrap>(.*?) </td></tr>", re.IGNORECASE)

    found_persons = []
    for phone, _, name, _, department, position, email in regexp.findall(alphabetical):
        found_persons.append(Person(name.strip(), department.strip(),
                                    position.strip(), phone.strip(),
                                    email.strip()))
    return found_persons
| 150 | + |
| 151 | + |
def directorio_telefonico():
    """Scrape the telephone directory of the website.

    Downloads http://www.iaa.es/privado/directorio.html and builds one
    Person, with empty department and position, for each table row
    matched by the regular expression below.

    Returns:
        A list of Person objects, in the order in which the rows were
        matched in the page.
    """
    # Close the connection explicitly instead of leaving it open until
    # the response object happens to be garbage-collected.
    response = urllib.urlopen('http://www.iaa.es/privado/directorio.html')
    try:
        directorio = response.read()
    finally:
        response.close()

    # A row in the table will look like this. There are no <a></a> tags right
    # now, but who knows if they may be added in the future.
    # <tr><td nowrap align="center"><b> 639</b></td><td nowrap>
    # Amado Gonzalez, Pedro Jose</td><td nowrap> (e-mail address)</td></tr>
    #
    # Captured groups, in order: phone, optional opening <a ...> tag, name,
    # optional closing </a> tag and e-mail.
    regexp = re.compile('<tr><td nowrap align="center"><b> ([0-9]*)</b></td>'
                        '<td nowrap> (<a .*?>)?(.*?)(</a>)?</td><td nowrap> (.*?)</td></tr>',
                        re.IGNORECASE)

    found_persons = []
    for phone, _, name, _, email in regexp.findall(directorio):
        found_persons.append(Person(name.strip(), '', '', phone.strip(), email.strip()))
    return found_persons
| 171 | + |
| 172 | + |
def personal_vinculado():
    """Scrape the listing of associated staff ("personal vinculado").

    Downloads http://www.iaa.es/privado/personal/pv.html and builds one
    Person for each table row matched by the regular expression below.

    Returns:
        A list of Person objects, in the order in which the rows were
        matched in the page.
    """
    # Close the connection explicitly instead of leaving it open until
    # the response object happens to be garbage-collected.
    response = urllib.urlopen('http://www.iaa.es/privado/personal/pv.html')
    try:
        vinculado = response.read()
    finally:
        response.close()

    # A row in the table will look like this. Note that the <a></a> tags are
    # optional:
    # <tr><td nowrap> <b></b> </td><td nowrap><a href="/~iagudo">Agudo
    # Rodriguez, Juan Ivan</a></td><td nowrap>DREG</td><td nowrap>Colaborador
    # Externo</td><td nowrap>iagudo </td></tr>
    #
    # Captured groups, in order: phone, optional opening <a ...> tag, name,
    # optional closing </a> tag, department, position and e-mail.
    regexp = re.compile("<tr><td nowrap> <b>([0-9]*)</b> </td><td nowrap>(<a .*?>)?(.*?)(</a>)?</td>"
                        "<td nowrap>(.*?)</td><td nowrap>(.*?)</td><td nowrap>(.*?) </td></tr>", re.IGNORECASE)

    found_persons = []
    for phone, _, name, _, department, position, email in regexp.findall(vinculado):
        found_persons.append(Person(name.strip(), department.strip(),
                                    position.strip(), phone.strip(),
                                    email.strip()))
    return found_persons
| 193 | + |
| 194 | + |
def visitantes():
    """Scrape the register of visitors of the website.

    Downloads http://www.iaa.es/privado/visitas/ and builds one Guest for
    each pair of consecutive tables (name label + details) matched by the
    regular expression below.

    Returns:
        A list of Guest objects, in the order in which the entries were
        matched in the page.
    """
    # Close the connection explicitly instead of leaving it open until
    # the response object happens to be garbage-collected.
    response = urllib.urlopen('http://www.iaa.es/privado/visitas/')
    try:
        registro_visitas = response.read()
    finally:
        response.close()

    # The name of the visitor comes in a table of its own:
    # <table width="100%" border="0" cellspacing="0" cellpadding="0" class="nlabel">
    # <tr><td width="12" nowrap background="/img/label/ncl.gif"> </td>
    # <td nowrap background="/img/label/ncc.gif">Josep Maria Masque</td>
    # <td width="12" nowrap background="/img/label/ncr.gif"> </td>
    # <td width="600" background="/img/label/nbg.gif"> </td></tr>
    # </table>
    #
    # This is what an entry in the listing looks like:
    # <table width="600" border="0" cellspacing="0" cellpadding="0" class="head">
    # <tr><td width="12" nowrap> </td><td class="gbold">Stockholm University</td></tr>
    # <tr><td width="12" nowrap> </td><td class="bold">del 04 de Octubre de 2010, al 29 de Octubre de 2010</td></tr>
    # <tr><td width="12" nowrap> </td><td><span class="gbold">Cat.Prof.</span> Predoc<p> </p></td></tr>
    # <tr><td width="12" nowrap> </td><td><span class="bold">Persona contacto:</span> Romero Canizales, Cristina</td></tr>
    # <tr><td width="12" nowrap> </td><td><span class="bold">e-mail:</span> (e-mail address)</td></tr>
    # </table>
    #
    # Captured groups, in order: name, institution, dates, position,
    # contact person and e-mail. Raw strings keep the \s escapes intact
    # without relying on Python passing unknown escapes through.
    regexp = re.compile(
        r'<table width="100%" border="0" cellspacing="0" cellpadding="0" class="nlabel">.*?'
        r'<tr>.*?</td>.*?'
        r'<td .*?>(.*?)</td>.*?'
        r'<td .*?</td>.*?'
        r'<td .*?</tr>.*?'
        r'</table>.*?'
        r'<table .*?>[\s]*'
        r'<tr><td .*?</td><td .*?>(.*?)</td></tr>[\s]*'
        r'<tr><td .*?</td><td .*?>(.*?)</td></tr>[\s]*'
        r'<tr><td .*?</td><td><span .*?</span>(.*?)<p>.*?</p></td></tr>[\s]*'
        r'<tr><td .*?</td><td><span .*?</span>(.*?)</td></tr>[\s]*'
        r'<tr><td .*?</td><td><span .*?</span>(.*?)</td></tr>[\s]*'
        r'</table>'
        , re.IGNORECASE | re.DOTALL)

    found_visitors = []
    for name, institution, dates, position, contact, email in regexp.findall(registro_visitas):
        found_visitors.append(Guest(name.strip(), institution.strip(),
                                    dates.strip(), position.strip(),
                                    contact.strip(), email.strip()))
    return found_visitors
| 242 | + |
def remove_duplicate_names(list_of_persons):
    """Return the persons of the list, keeping one entry per name.

    When several persons share the same name only the first one is kept;
    the relative order of the surviving entries is preserved.

    Args:
        list_of_persons: iterable of objects with a hashable ``name``
            attribute.

    Returns:
        A new list; the input is not modified.
    """
    seen_names = set()  # O(1) membership tests instead of rescanning a list
    unique_list = []
    for person in list_of_persons:
        if person.name not in seen_names:
            seen_names.add(person.name)
            unique_list.append(person)
    return unique_list
| 249 | + |
def name_search(target, encoding='utf-8'):
    """Return the persons whose name contains *target*.

    The target is decoded with *encoding* and compared, without accents
    and in lower case, against the accent-free lower-case name of every
    entry of the telephone directory, the associated staff and the
    visitors. Entries with a repeated name are reported only once.
    """
    needle = remove_accents(target.decode(encoding)).lower()
    candidates = directorio_telefonico() + personal_vinculado() + visitantes()
    name_matches = [person for person in candidates
                    if needle in person.name_wo_accents.lower()]
    return remove_duplicate_names(name_matches)
| 255 | + |
def search(target, encoding='utf-8'):
    """Return the persons whose name, phone or e-mail contains *target*.

    The target is decoded with *encoding*, stripped of accents and
    lower-cased. It is then looked for in the accent-free lower-case
    name, in the phone number and in the lower-case e-mail address of
    every entry of the telephone directory, the associated staff and
    the visitors.

    Returns:
        The matching entries sorted by name, with at most one entry for
        each name.
    """
    needle = remove_accents(target.decode(encoding)).lower()
    candidates = directorio_telefonico() + personal_vinculado() + visitantes()

    def _is_match(person):
        # The needle is lower case, so the e-mail has to be lower-cased
        # too; otherwise addresses with capital letters are never found.
        return (needle in person.name_wo_accents.lower() or
                needle in person.phone or
                needle in person.email.lower())

    matches = sorted((person for person in candidates if _is_match(person)),
                     key = lambda person: person.name)
    return remove_duplicate_names(matches)
| 265 | + |
| 266 | + |
if __name__ == "__main__":

    # Command-line entry point: look the only argument up and print one
    # "name | phone | e-mail" line for each match.
    # Exit codes: 0 on success (even with no matches), 2 on wrong usage,
    # 3 when the Python interpreter is too old.

    if sys.version_info < (2, 5):
        print("Error: phone requires Python 2.5 or later :-(")
        sys.exit(3)

    description = (
        "phone is intended to serve as the white pages of the Institute of "
        "Astrophysics of Andalusia: a command-line application that queries the "
        "website in order to find the telephone number given a name or surname. "
        "Note that visiting and associate researchers are included in the "
        "search. The reverse telephone directory (grey pages) is also available, "
        "as searches may also be done by the phone number or e-mail address. All "
        "searches are case- and accent-insensitive.")

    # optparse is used only to format the help message.
    parser = optparse.OptionParser(description = description)
    parser.usage = "%prog NAME | PHONE | EMAIL"

    if len(sys.argv) != 2:
        parser.print_help()
        sys.exit(2)

    matches = search(sys.argv[1])

    if not matches:
        sys.exit(0)

    # If the email address is at the @iaa.es domain (and most of the addresses
    # certainly will), only the username is shown
    domain = '@iaa.es'
    for match in matches:
        if match.email.endswith(domain):
            match.email = match.email[:-len(domain)]

    # Determine the max length of each column
    name_column_length = max(len(person.name) for person in matches)
    phone_column_length = max(len(person.phone) for person in matches)
    email_column_length = max(len(person.email) for person in matches)

    for person in matches:
        print(' | '.join([person.name.ljust(name_column_length),
                          person.phone.ljust(phone_column_length),
                          person.email.ljust(email_column_length)]))

    sys.exit(0)
| 313 | + |
0 commit comments