The original script, now at GitHub. Fiat lux!

Víctor Terrón · Víctor Terrón · commit 46b862c46c72 · 2012-03-29T19:11:43.000+02:00
I never really thought that anybody else could be interested in using
this script, but it seems that a small fraction among my almost two
hundred workmates could want to install it on their systems. And there
are a few features that may be worth implementing.
diff --git a/COPYING b/COPYING
diff --git a/MANIFEST b/MANIFEST
@@ -0,0 +1,5 @@
+COPYING
+MANIFEST
+README
+phone
+setup.py
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,6 @@
+include setup.py
+include phone
+global-exclude *~
+include README
+include COPYING
+include MANIFEST
diff --git a/README.rst b/README.rst
@@ -0,0 +1,8 @@
+IAA-CSIC Phone Book
+===================
+
+This Python script, **iaa-phone**, is intended to serve as the *white pages* of the `Institute of Astrophysics of Andalusia <http://www.iaa.es>`_ (CSIC, Spain). It is a command-line application that queries the website in order to find the telephone number given a name or surname. Visiting and associate researchers are also included in the search.
+
+The reverse telephone directory (*grey pages*) is also available, as searches may also be done by the phone number or e-mail address.
+
+All searches are case- and accent-insensitive.
diff --git a/phone b/phone
@@ -0,0 +1,313 @@
+#! /usr/bin/env python
+
+# Copyright (c) 2011 Victor Terron. All rights reserved.
+# Institute of Astrophysics of Andalusia, IAA-CSIC
+#
+# This file is part of phone.
+#
+# phone is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import re
+import sys
+import urllib
+import optparse
+import HTMLParser
+import unicodedata
+import htmlentitydefs
+
+
+class Scraper(HTMLParser.HTMLParser):
+    """HTML parser that prints the links it finds within <h4> elements.
+
+    A link is reported on standard output as 'text (url)' when its closing
+    </a> tag is reached while the parser is still inside an <h4> element.
+    """
+
+    # Parser state: whether we are currently inside <h4> / inside <a href=...>
+    in_h4 = False
+    in_link = False
+
+    def handle_starttag(self, tag, attrs):
+        """Note the entry into <h4>; start collecting text for <a href=...>."""
+        attributes = dict(attrs)
+        if tag == 'h4':
+            self.in_h4 = True
+        elif tag == 'a' and 'href' in attributes:
+            self.url = attributes['href']
+            self.chunks = []
+            self.in_link = True
+
+    def handle_data(self, data):
+        """Store the text fragments that belong to the link being parsed."""
+        if not self.in_link:
+            return
+        self.chunks.append(data)
+
+    def handle_endtag(self, tag):
+        """Note the exit from <h4>; on </a>, print the link if within <h4>."""
+        if tag == 'h4':
+            self.in_h4 = False
+        elif tag == 'a':
+            if self.in_link and self.in_h4:
+                print '%s (%s)' % (''.join(self.chunks), self.url)
+            self.in_link = False
+
+
+
+# http://stackoverflow.com/questions/930303/python-string-cleanup-manipulation-accented-characters
+# Latin-1 code points (0xA1-0xFF) mapped to their closest 7-bit ASCII
+# spelling. Built once at import time instead of on every call.
+_LATIN1_TO_ASCII = {
+    0xc0:'A', 0xc1:'A', 0xc2:'A', 0xc3:'A', 0xc4:'A', 0xc5:'A',
+    0xc6:'Ae', 0xc7:'C',
+    0xc8:'E', 0xc9:'E', 0xca:'E', 0xcb:'E',
+    0xcc:'I', 0xcd:'I', 0xce:'I', 0xcf:'I',
+    0xd0:'Th', 0xd1:'N',
+    0xd2:'O', 0xd3:'O', 0xd4:'O', 0xd5:'O', 0xd6:'O', 0xd8:'O',
+    0xd9:'U', 0xda:'U', 0xdb:'U', 0xdc:'U',
+    0xdd:'Y', 0xde:'th', 0xdf:'ss',
+    0xe0:'a', 0xe1:'a', 0xe2:'a', 0xe3:'a', 0xe4:'a', 0xe5:'a',
+    0xe6:'ae', 0xe7:'c',
+    0xe8:'e', 0xe9:'e', 0xea:'e', 0xeb:'e',
+    0xec:'i', 0xed:'i', 0xee:'i', 0xef:'i',
+    0xf0:'th', 0xf1:'n',
+    0xf2:'o', 0xf3:'o', 0xf4:'o', 0xf5:'o', 0xf6:'o', 0xf8:'o',
+    0xf9:'u', 0xfa:'u', 0xfb:'u', 0xfc:'u',
+    0xfd:'y', 0xfe:'th', 0xff:'y',
+    0xa1:'!', 0xa2:'{cent}', 0xa3:'{pound}', 0xa4:'{currency}',
+    0xa5:'{yen}', 0xa6:'|', 0xa7:'{section}', 0xa8:'{umlaut}',
+    0xa9:'{C}', 0xaa:'{^a}', 0xab:'<<', 0xac:'{not}',
+    0xad:'-', 0xae:'{R}', 0xaf:'_', 0xb0:'{degrees}',
+    0xb1:'{+/-}', 0xb2:'{^2}', 0xb3:'{^3}', 0xb4:"'",
+    0xb5:'{micro}', 0xb6:'{paragraph}', 0xb7:'*', 0xb8:'{cedilla}',
+    0xb9:'{^1}', 0xba:'{^o}', 0xbb:'>>',
+    0xbc:'{1/4}', 0xbd:'{1/2}', 0xbe:'{3/4}', 0xbf:'?',
+    0xd7:'*', 0xf7:'/',
+}
+
+
+def remove_accents(unicrap):
+    """This replaces UNICODE Latin-1 characters with
+    something equivalent in 7-bit ASCII. All characters in the standard
+    7-bit ASCII range are preserved. In the 8th bit range all the Latin-1
+    accented letters are stripped of their accents. Most symbol characters
+    are converted to something meaningful. Anything not converted is deleted.
+
+    unicrap -- the text to convert, iterated character by character.
+    Returns the converted text; an empty input yields an empty string.
+    """
+    pieces = []
+    for char in unicrap:
+        code = ord(char)
+        if code in _LATIN1_TO_ASCII:
+            pieces.append(_LATIN1_TO_ASCII[code])
+        elif code < 0x80:
+            # Plain 7-bit ASCII is kept as it is
+            pieces.append(char)
+        # Anything else (>= 0x80 and not in the table) is silently dropped
+    return ''.join(pieces)
+
+# http://farmdev.com/talks/unicode/
+def to_unicode_or_bust(obj, encoding='utf-8'):
+    """Return obj decoded to unicode if it is a byte string.
+
+    Unicode strings and objects that are not strings at all are returned
+    unchanged.
+    """
+    is_byte_string = isinstance(obj, basestring) and not isinstance(obj, unicode)
+    return unicode(obj, encoding) if is_byte_string else obj
+
+
+def unescape(s):
+    """Replace the named HTML entities in s (e.g. '&amp;') by their characters.
+
+    Only the names listed in htmlentitydefs.name2codepoint are recognized;
+    any other '&...;' sequence is left untouched.
+    c.f. http://wiki.python.org/moin/EscapingHtml
+    """
+    known_names = '|'.join(htmlentitydefs.name2codepoint)
+    entity_regexp = re.compile('&(%s);' % known_names)
+
+    def to_character(match):
+        return unichr(htmlentitydefs.name2codepoint[match.group(1)])
+
+    return entity_regexp.sub(to_character, s)
+
+
+class Person(object):
+    """An entry of the directory: name, department, position, phone, e-mail.
+
+    The constructor takes the byte strings scraped from the website, decodes
+    them using `encoding` and expands the named HTML entities of every field
+    except the phone number, which is only decoded.
+    """
+
+    def __init__(self, name, department, position, phone, email, encoding = 'iso-8859-1'):
+
+        def as_text(raw):
+            return unescape(raw.decode(encoding))
+
+        self.name = as_text(name)
+        self.department = as_text(department)
+        self.position = as_text(position)
+        self.phone = phone.decode(encoding)
+        self.email = as_text(email)
+
+    @property
+    def name_wo_accents(self):
+        """The name, as converted by remove_accents()."""
+        return remove_accents(self.name)
+
+
+class Guest(Person):
+    """A visitor: a Person plus institution, dates of the visit and contact.
+
+    Guests are created with an empty department and an empty phone number.
+    """
+
+    def __init__(self, name, institution, dates, position, contact, email, encoding = 'iso-8859-1'):
+        super(Guest, self).__init__(name, '', position, '', email, encoding = encoding)
+        # The visitor-only fields get the same treatment as those of Person:
+        # decode the scraped bytes, then expand the named HTML entities.
+        visitor_fields = (('institution', institution),
+                          ('dates', dates),
+                          ('contact', contact))
+        for attribute, raw_value in visitor_fields:
+            setattr(self, attribute, unescape(raw_value.decode(encoding)))
+
+def alphabetical_query():
+    """Scrape http://www.iaa.es/personal/general.html into a list of Person.
+
+    Each table row matched by the regular expression yields one Person with
+    phone, name, department, position and e-mail. Rows that do not match are
+    ignored, so an unexpected page layout results in an empty list.
+    """
+
+    page = urllib.urlopen('http://www.iaa.es/personal/general.html')
+    try:
+        alphabetical = page.read()
+    finally:
+        # Release the connection even if the download fails halfway
+        page.close()
+
+    regexp = re.compile("<td nowrap>&nbsp;<b>([0-9]*)</b>&nbsp;</td><td nowrap>(<a .*?>)?(.*?)(</a>)?</td>"
+                        "<td nowrap>(.*?)</td><td nowrap>(.*?)</td><td nowrap>(.*?)&nbsp;</td></tr>", re.IGNORECASE)
+    found_persons = []
+    for matched_line in regexp.findall(alphabetical):
+        # The second and fourth groups only capture the optional <a></a> tags
+        phone, _, name, _, department, position, email = \
+            [field.strip() for field in matched_line]
+        found_persons.append(Person(name, department, position, phone, email))
+    return found_persons
+    
+
+def directorio_telefonico():
+    """Scrape http://www.iaa.es/privado/directorio.html into a list of Person.
+
+    This listing only provides phone, name and e-mail, so the department and
+    position of the returned persons are empty strings. Rows that do not
+    match the regular expression are ignored.
+    """
+
+    page = urllib.urlopen('http://www.iaa.es/privado/directorio.html')
+    try:
+        directorio = page.read()
+    finally:
+        # Release the connection even if the download fails halfway
+        page.close()
+
+    # A row in the table will look like this. There are no <a></a> tags right now,
+    # but who knows if they may be added in the future.
+    # <tr><td nowrap align="center"><b>&nbsp;639</b></td><td nowrap>&nbsp;
+    # Amado Gonzalez, Pedro Jose</td><td nowrap>&nbsp;pja@iaa.es</td></tr>
+
+    regexp = re.compile("<tr><td nowrap align=\"center\"><b>&nbsp;([0-9]*)</b></td>"
+                        "<td nowrap>&nbsp;(<a .*?>)?(.*?)(</a>)?</td><td nowrap>&nbsp;(.*?)</td></tr>",
+                        re.IGNORECASE)
+
+    found_persons = []
+    for matched_line in regexp.findall(directorio):
+        # The second and fourth groups only capture the optional <a></a> tags
+        phone, _, name, _, email = [field.strip() for field in matched_line]
+        found_persons.append(Person(name, '', '', phone, email))
+    return found_persons
+        
+
+def personal_vinculado():
+    """Scrape http://www.iaa.es/privado/personal/pv.html into a list of Person.
+
+    Each table row matched by the regular expression yields one Person with
+    phone (possibly empty), name, department, position and e-mail. Rows that
+    do not match are ignored.
+    """
+
+    page = urllib.urlopen('http://www.iaa.es/privado/personal/pv.html')
+    try:
+        vinculado = page.read()
+    finally:
+        # Release the connection even if the download fails halfway
+        page.close()
+
+    # A row in the table will look like this. Note that the <a></a> tags are optional
+    # <tr><td nowrap>&nbsp;<b></b>&nbsp;</td><td nowrap><a href="/~iagudo">Agudo
+    # Rodriguez, Juan Ivan</a></td><td nowrap>DREG</td><td nowrap>Colaborador
+    # Externo</td><td nowrap>iagudo&nbsp;</td></tr>
+
+    regexp = re.compile("<tr><td nowrap>&nbsp;<b>([0-9]*)</b>&nbsp;</td><td nowrap>(<a .*?>)?(.*?)(</a>)?</td>"
+                        "<td nowrap>(.*?)</td><td nowrap>(.*?)</td><td nowrap>(.*?)&nbsp;</td></tr>", re.IGNORECASE)
+
+    found_persons = []
+    for matched_line in regexp.findall(vinculado):
+        # The second and fourth groups only capture the optional <a></a> tags
+        phone, _, name, _, department, position, email = \
+            [field.strip() for field in matched_line]
+        found_persons.append(Person(name, department, position, phone, email))
+    return found_persons
+
+
+def visitantes():
+    """Scrape http://www.iaa.es/privado/visitas/ into a list of Guest.
+
+    Each visitor is described by two consecutive tables: a label with the
+    name, followed by the institution, dates, position, contact person and
+    e-mail. Entries that do not match the regular expression are ignored.
+    """
+
+    page = urllib.urlopen('http://www.iaa.es/privado/visitas/')
+    try:
+        registro_visitas = page.read()
+    finally:
+        # Release the connection even if the download fails halfway
+        page.close()
+
+    # This is how the label with the name of the visitor looks like
+    # <table width="100%" border="0" cellspacing="0" cellpadding="0" class="nlabel">
+    # <tr><td width="12" nowrap background="/img/label/ncl.gif">&nbsp;</td>
+    # <td nowrap background="/img/label/ncc.gif">Josep Maria Masque</td>
+    # <td width="12" nowrap background="/img/label/ncr.gif">&nbsp;</td>
+    # <td width="600" background="/img/label/nbg.gif">&nbsp;</td></tr>
+    # </table>
+    #
+    # This is how an entry in the listing looks like
+    # <table width="600" border="0" cellspacing="0" cellpadding="0" class="head">
+    # <tr><td width="12" nowrap>&nbsp;</td><td class="gbold">Stockholm University</td></tr>
+    # <tr><td width="12" nowrap>&nbsp;</td><td class="bold">del 04 de Octubre de 2010, al 29 de Octubre de 2010</td></tr>
+    # <tr><td width="12" nowrap>&nbsp;</td><td><span class="gbold">Cat.Prof.</span> Predoc<p>&nbsp;</p></td></tr>
+    # <tr><td width="12" nowrap>&nbsp;</td><td><span class="bold">Persona contacto:</span> Romero Canizales, Cristina</td></tr>
+    # <tr><td width="12" nowrap>&nbsp;</td><td><span class="bold">e-mail:</span> cromero@iaa.es</td></tr>
+    # </table>
+
+    # Raw strings, so that the backslash of \s reaches the regexp engine
+    # without relying on Python leaving unknown escape sequences untouched.
+    regexp = re.compile(
+        r'<table width="100%" border="0" cellspacing="0" cellpadding="0" class="nlabel">.*?'
+        r'<tr>.*?</td>.*?'
+        r'<td .*?>(.*?)</td>.*?'
+        r'<td .*?</td>.*?'
+        r'<td .*?</tr>.*?'
+        r'</table>.*?'
+        r'<table .*?>[\s]*'
+        r'<tr><td .*?</td><td .*?>(.*?)</td></tr>[\s]*'
+        r'<tr><td .*?</td><td .*?>(.*?)</td></tr>[\s]*'
+        r'<tr><td .*?</td><td><span .*?</span>(.*?)<p>.*?</p></td></tr>[\s]*'
+        r'<tr><td .*?</td><td><span .*?</span>(.*?)</td></tr>[\s]*'
+        r'<tr><td .*?</td><td><span .*?</span>(.*?)</td></tr>[\s]*'
+        r'</table>',
+        re.IGNORECASE | re.DOTALL)
+
+    found_visitors = []
+    for matched_line in regexp.findall(registro_visitas):
+        name, institution, dates, position, contact, email = \
+            [field.strip() for field in matched_line]
+        found_visitors.append(Guest(name, institution, dates, position, contact, email))
+    return found_visitors
+
+def remove_duplicate_names(list_of_persons):
+    """Return the persons whose name has not appeared earlier in the list.
+
+    The first occurrence of each name is kept and the original order is
+    preserved. The input list is not modified.
+    """
+    seen_names = set()
+    unique_list = []
+    for person in list_of_persons:
+        # A set gives constant-time membership tests, instead of rebuilding
+        # and scanning the list of accepted names for every person.
+        if person.name not in seen_names:
+            seen_names.add(person.name)
+            unique_list.append(person)
+    return unique_list
+
+def name_search(target, encoding='utf-8'):
+    """Return the persons whose name contains target, without duplicates.
+
+    target is a byte string, decoded using `encoding`. The comparison is
+    made on lower-cased text from which the accents have been removed.
+    """
+    needle = remove_accents(target.decode(encoding)).lower()
+    everyone = directorio_telefonico() + personal_vinculado() + visitantes()
+    name_matches = [person for person in everyone
+                    if needle in person.name_wo_accents.lower()]
+    return remove_duplicate_names(name_matches)
+
+def search(target, encoding='utf-8'):
+    """Return the persons whose name, phone or e-mail contains target.
+
+    target is a byte string, decoded using `encoding`; its accents are
+    removed and it is lower-cased before comparing. The matches are sorted
+    by name and only the first person with each name is returned.
+    """
+    target = remove_accents(target.decode(encoding)).lower()
+    query = directorio_telefonico() + personal_vinculado() + visitantes()
+
+    def is_match(person):
+        # The target has been lower-cased, so the e-mail must be too:
+        # otherwise an address with capital letters could never be found.
+        # The phone is already text, so there is no need to convert it.
+        return (target in person.name_wo_accents.lower() or
+                target in person.phone or
+                target in person.email.lower())
+
+    matches = sorted((person for person in query if is_match(person)),
+                     key = lambda x: x.name)
+    return remove_duplicate_names(matches)
+
+
+if __name__ == "__main__":
+
+    if sys.version_info < (2, 5):
+        print "Error: phone requires Python 2.5 or later :-("
+        sys.exit(3)
+
+    description = \
+    "phone is intended to serve as the white pages of the Institute of "       \
+    "Astrophysics of Andalusia: a command-line application that queries the "  \
+    "website in order to find the telephone number given a name or surname. "  \
+    "Note that visiting and associate researchers are included in the "        \
+    "search. The reverse telephone directory (grey pages) is also available, " \
+    "as searches may also be done by the phone number or e-mail address. All " \
+    "searches are case- and accent-insensitive."
+
+    # The parser is only used to print the help message: the script takes
+    # exactly one positional argument and no options.
+    parser = optparse.OptionParser(description = description)
+    parser.usage = "%prog NAME | PHONE | EMAIL"
+
+    if len(sys.argv) != 2:
+        parser.print_help()
+        sys.exit(2)
+
+    target = sys.argv[1]
+    matches = search(target)
+
+    # Nothing found: print nothing and exit successfully
+    if not matches:
+        sys.exit(0)
+
+    # If the email address is at the @iaa.es domain (and most of the addresses
+    # certainly will), only the username is shown
+    domain = '@iaa.es'
+    for match in matches:
+        if match.email.endswith(domain):
+            match.email = match.email[:-len(domain)]
+
+    # Determine the max length of each column
+    name_column_length  = max(len(x.name) for x in matches)
+    phone_column_length = max(len(x.phone) for x in matches)
+    email_column_length = max(len(x.email) for x in matches)
+
+    # The fields are unicode strings. When the output is piped or redirected
+    # sys.stdout.encoding is None and print would fall back to ASCII, failing
+    # with UnicodeEncodeError on accented names, so encode them explicitly.
+    output_encoding = sys.stdout.encoding or 'utf-8'
+    for person in matches:
+        row = u' | '.join([person.name.ljust(name_column_length),
+                           person.phone.ljust(phone_column_length),
+                           person.email.ljust(email_column_length)])
+        print row.encode(output_encoding, 'replace')
+
+    sys.exit(0)
+
diff --git a/setup.py b/setup.py
@@ -0,0 +1,30 @@
+#! /usr/bin/env python
+
+# Copyright (c) 2011 Victor Terron. All rights reserved.
+# Institute of Astrophysics of Andalusia, IAA-CSIC
+#
+# This file is part of phone.
+#
+# phone is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import distutils.core
+
+# Package metadata; 'phone' is installed as a stand-alone script.
+distutils.core.setup(
+    name = 'phone',
+    version = '0.9',
+    description = "White pages of the IAA-CSIC",
+    author = 'Victor Terron',
+    author_email = 'vterron@iaa.es',
+    url = 'http://www.iaa.es/~vterron/',
+    license = "GNU General Public License, version 3 (GPLv3)",
+    scripts = ['phone'],
+)

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +COPYING
 +MANIFEST
 +README
 +phone
 +setup.py