Skip to content

Commit 46b862c

Browse files
author
Víctor Terrón
committed
The original script, now at GitHub. Fiat lux!
I never really thought that anybody else could be interested in using this script, but it seems that a small fraction among my almost two hundred workmates could want to install it on their systems. And there are a few features that may be worth implementing.
0 parents  commit 46b862c

File tree

6 files changed

+1036
-0
lines changed

6 files changed

+1036
-0
lines changed

COPYING

+674
Large diffs are not rendered by default.

MANIFEST

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
COPYING
2+
MANIFEST
3+
README
4+
phone
5+
setup.py

MANIFEST.in

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
include setup.py
2+
include phone
3+
global-exclude *~
4+
include README
5+
include COPYING
6+
include MANIFEST

README.rst

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
IAA-CSIC Phone Book
2+
===================
3+
4+
This Python script, **iaa-phone**, is intended to serve as the *white pages* of the `Institute of Astrophysics of Andalusia <http://www.iaa.es>`_ (CSIC, Spain). It is a command-line application that queries the website in order to find the telephone number given a name or surname. Visiting and associate researchers are also included in the search.
5+
6+
The reverse telephone directory (*grey pages*) is also available, as searches may also be done by the phone number or e-mail address.
7+
8+
All searches are case- and accent-insensitive.

phone

+313
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
#! /usr/bin/env python
2+
3+
# Copyright (c) 2011 Victor Terron. All rights reserved.
4+
# Institute of Astrophysics of Andalusia, IAA-CSIC
5+
#
6+
# This file is part of phone.
7+
#
8+
# phone is free software: you can redistribute it and/or modify
9+
# it under the terms of the GNU General Public License as published by
10+
# the Free Software Foundation, either version 3 of the License, or
11+
# (at your option) any later version.
12+
#
13+
# This program is distributed in the hope that it will be useful,
14+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
# GNU General Public License for more details.
17+
#
18+
# You should have received a copy of the GNU General Public License
19+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
20+
21+
import re
22+
import sys
23+
import urllib
24+
import optparse
25+
import HTMLParser
26+
import unicodedata
27+
import htmlentitydefs
28+
29+
30+
class Scraper(HTMLParser.HTMLParser):
    """Print every hyperlink that is closed inside an <h4> heading.

    While a document is fed to the parser, the text found inside each
    <a href="..."> element is collected. When the link is closed and an
    <h4> element is still open, a line of the form 'text (url)' is printed.
    """

    # Parser state: are we currently inside <h4> / inside <a href="...">?
    in_h4 = False
    in_link = False

    def handle_starttag(self, tag, attrs):
        """Keep track of opening <h4> and <a href="..."> tags."""
        attributes = dict(attrs)
        if tag == 'h4':
            self.in_h4 = True
        elif tag == 'a' and 'href' in attributes:
            # A new link starts: forget the previous text, remember the URL
            self.in_link = True
            self.chunks = []
            self.url = attributes['href']

    def handle_data(self, data):
        """Collect the text that appears inside the current link."""
        if not self.in_link:
            return
        self.chunks.append(data)

    def handle_endtag(self, tag):
        """Report the link on </a> (if within <h4>) and update the state."""
        if tag == 'h4':
            self.in_h4 = False
        elif tag == 'a':
            if self.in_link and self.in_h4:
                print('%s (%s)' % (''.join(self.chunks), self.url))
            # Any </a> ends the current link, whether reported or not
            self.in_link = False
53+
54+
55+
56+
# http://stackoverflow.com/questions/930303/python-string-cleanup-manipulation-accented-characters
57+
# Latin-1 code point -> 7-bit ASCII replacement. Built once at import time
# instead of on every call to remove_accents().
_LATIN1_TO_ASCII = {
    0xc0:'A', 0xc1:'A', 0xc2:'A', 0xc3:'A', 0xc4:'A', 0xc5:'A',
    0xc6:'Ae', 0xc7:'C',
    0xc8:'E', 0xc9:'E', 0xca:'E', 0xcb:'E',
    0xcc:'I', 0xcd:'I', 0xce:'I', 0xcf:'I',
    0xd0:'Th', 0xd1:'N',
    0xd2:'O', 0xd3:'O', 0xd4:'O', 0xd5:'O', 0xd6:'O', 0xd8:'O',
    0xd9:'U', 0xda:'U', 0xdb:'U', 0xdc:'U',
    0xdd:'Y', 0xde:'th', 0xdf:'ss',
    0xe0:'a', 0xe1:'a', 0xe2:'a', 0xe3:'a', 0xe4:'a', 0xe5:'a',
    0xe6:'ae', 0xe7:'c',
    0xe8:'e', 0xe9:'e', 0xea:'e', 0xeb:'e',
    0xec:'i', 0xed:'i', 0xee:'i', 0xef:'i',
    0xf0:'th', 0xf1:'n',
    0xf2:'o', 0xf3:'o', 0xf4:'o', 0xf5:'o', 0xf6:'o', 0xf8:'o',
    0xf9:'u', 0xfa:'u', 0xfb:'u', 0xfc:'u',
    0xfd:'y', 0xfe:'th', 0xff:'y',
    0xa1:'!', 0xa2:'{cent}', 0xa3:'{pound}', 0xa4:'{currency}',
    0xa5:'{yen}', 0xa6:'|', 0xa7:'{section}', 0xa8:'{umlaut}',
    0xa9:'{C}', 0xaa:'{^a}', 0xab:'<<', 0xac:'{not}',
    0xad:'-', 0xae:'{R}', 0xaf:'_', 0xb0:'{degrees}',
    0xb1:'{+/-}', 0xb2:'{^2}', 0xb3:'{^3}', 0xb4:"'",
    0xb5:'{micro}', 0xb6:'{paragraph}', 0xb7:'*', 0xb8:'{cedilla}',
    0xb9:'{^1}', 0xba:'{^o}', 0xbb:'>>',
    0xbc:'{1/4}', 0xbd:'{1/2}', 0xbe:'{3/4}', 0xbf:'?',
    0xd7:'*', 0xf7:'/',
}


def remove_accents (unicrap):
    """Replace Unicode Latin-1 characters with a 7-bit ASCII equivalent.

    All characters in the standard 7-bit ASCII range are preserved. In the
    8th bit range all the Latin-1 accented letters are stripped of their
    accents, and most symbol characters are converted to something
    meaningful. Anything else (code point >= 0x80 without an entry in the
    translation table) is deleted.

    unicrap -- the text to convert; it is iterated character by character.

    Returns the converted string.
    """

    pieces = []
    for char in unicrap:
        code = ord(char)
        if code in _LATIN1_TO_ASCII:
            pieces.append(_LATIN1_TO_ASCII[code])
        elif code < 0x80:
            # Plain ASCII is kept untouched
            pieces.append(char)
        # else: untranslatable non-ASCII character, silently dropped
    return ''.join(pieces)
100+
101+
# http://farmdev.com/talks/unicode/
102+
def to_unicode_or_bust(obj, encoding='utf-8'):
    """Return 'obj' as a Unicode string if it is an encoded byte string.

    obj -- any object. Only byte strings are converted; Unicode strings
           and non-string objects are returned untouched.
    encoding -- the encoding used to decode the byte string.

    Raises UnicodeDecodeError if the bytes are not valid in 'encoding'.
    """
    # Resolve the string types at call time, so that the function also
    # works on interpreters in which 'basestring' and 'unicode' are gone.
    try:
        string_types, text_type = basestring, unicode
    except NameError:
        string_types, text_type = (str, bytes), str

    if isinstance(obj, string_types) and not isinstance(obj, text_type):
        obj = text_type(obj, encoding)
    return obj
107+
108+
109+
def unescape(s):
    """Replace the named HTML entities in 's' with the character they denote.

    Only the names listed in htmlentitydefs.name2codepoint are recognised
    (e.g. '&eacute;'); anything else is left as it is.
    c.f. http://wiki.python.org/moin/EscapingHtml
    """
    codepoints = htmlentitydefs.name2codepoint
    pattern = '&(%s);' % '|'.join(codepoints)

    def to_character(match):
        # group(1) is the entity name, without the leading '&' and final ';'
        return unichr(codepoints[match.group(1)])

    return re.sub(pattern, to_character, s)
113+
114+
115+
class Person(object):
    """An entry of the directory: name, department, position, phone, e-mail.

    The constructor receives the fields as byte strings, exactly as they
    were extracted from the web page, and stores them decoded.
    """

    def __init__(self, name, department, position, phone, email, encoding = 'iso-8859-1'):
        """Decode every field with 'encoding'; all of them except the phone
        also get their named HTML entities replaced."""

        def text(raw):
            return unescape(raw.decode(encoding))

        self.name = text(name)
        self.department = text(department)
        self.position = text(position)
        self.phone = phone.decode(encoding)
        self.email = text(email)

    @property
    def name_wo_accents(self):
        """The name of the person, converted to ASCII by remove_accents()."""
        return remove_accents(self.name)
126+
127+
128+
class Guest(Person):
    """A visitor: a Person with an institution, dates and contact person.

    Guests are created with an empty department and an empty phone.
    """

    def __init__(self, name, institution, dates, position, contact, email, encoding = 'iso-8859-1'):
        """Decode the byte-string fields with 'encoding' and replace their
        named HTML entities."""
        super(Guest, self).__init__(name, '', position, '', email, encoding = encoding)

        extra_fields = (('institution', institution),
                        ('dates', dates),
                        ('contact', contact))
        for attribute, raw in extra_fields:
            setattr(self, attribute, unescape(raw.decode(encoding)))
135+
136+
def alphabetical_query():
    """Download the alphabetical staff listing and parse it.

    Returns a list with one Person (name, department, position, phone and
    e-mail) for each table row of the page that matches the expected layout.
    """

    # Close the connection explicitly instead of relying on the garbage
    # collector to do it for us.
    connection = urllib.urlopen('http://www.iaa.es/personal/general.html')
    try:
        alphabetical = connection.read()
    finally:
        connection.close()

    # Groups: phone, <a> (optional), name, </a> (optional), department,
    # position and e-mail.
    regexp = re.compile("<td nowrap>&nbsp;<b>([0-9]*)</b>&nbsp;</td><td nowrap>(<a .*?>)?(.*?)(</a>)?</td>"
                        "<td nowrap>(.*?)</td><td nowrap>(.*?)</td><td nowrap>(.*?)&nbsp;</td></tr>", re.IGNORECASE)

    found_persons = []
    for phone, _, name, _, department, position, email in regexp.findall(alphabetical):
        found_persons.append(Person(name.strip(), department.strip(),
                                    position.strip(), phone.strip(),
                                    email.strip()))
    return found_persons
150+
151+
152+
def directorio_telefonico():
    """Download the telephone directory and parse it.

    Returns a list with one Person (name, phone and e-mail; department and
    position are left empty) for each table row of the page that matches
    the expected layout.
    """

    # Close the connection explicitly instead of relying on the garbage
    # collector to do it for us.
    connection = urllib.urlopen('http://www.iaa.es/privado/directorio.html')
    try:
        directorio = connection.read()
    finally:
        connection.close()

    # A row in the table will look like this. There are no <a></a> tags right
    # now, but who knows if they may be added in the future.
    # <tr><td nowrap align="center"><b>&nbsp;639</b></td><td nowrap>&nbsp;
    # Amado Gonzalez, Pedro Jose</td><td nowrap>&nbsp;EMAIL</td></tr>
    #
    # Groups: phone, <a> (optional), name, </a> (optional) and e-mail.
    regexp = re.compile('<tr><td nowrap align="center"><b>&nbsp;([0-9]*)</b></td>'
                        '<td nowrap>&nbsp;(<a .*?>)?(.*?)(</a>)?</td><td nowrap>&nbsp;(.*?)</td></tr>',
                        re.IGNORECASE)

    found_persons = []
    for phone, _, name, _, email in regexp.findall(directorio):
        found_persons.append(Person(name.strip(), '', '', phone.strip(), email.strip()))
    return found_persons
171+
172+
173+
def personal_vinculado():
    """Download the listing of associated staff and parse it.

    Returns a list with one Person (name, department, position, phone and
    e-mail) for each table row of the page that matches the expected layout.
    """

    # Close the connection explicitly instead of relying on the garbage
    # collector to do it for us.
    connection = urllib.urlopen('http://www.iaa.es/privado/personal/pv.html')
    try:
        vinculado = connection.read()
    finally:
        connection.close()

    # A row in the table will look like this. Note that the <a></a> tags are
    # optional:
    # <tr><td nowrap>&nbsp;<b></b>&nbsp;</td><td nowrap><a href="/~iagudo">Agudo
    # Rodriguez, Juan Ivan</a></td><td nowrap>DREG</td><td nowrap>Colaborador
    # Externo</td><td nowrap>iagudo&nbsp;</td></tr>
    #
    # Groups: phone, <a> (optional), name, </a> (optional), department,
    # position and e-mail.
    regexp = re.compile("<tr><td nowrap>&nbsp;<b>([0-9]*)</b>&nbsp;</td><td nowrap>(<a .*?>)?(.*?)(</a>)?</td>"
                        "<td nowrap>(.*?)</td><td nowrap>(.*?)</td><td nowrap>(.*?)&nbsp;</td></tr>", re.IGNORECASE)

    found_persons = []
    for phone, _, name, _, department, position, email in regexp.findall(vinculado):
        found_persons.append(Person(name.strip(), department.strip(),
                                    position.strip(), phone.strip(),
                                    email.strip()))
    return found_persons
193+
194+
195+
def visitantes():
    """Download the register of visitors and parse it.

    Returns a list with one Guest (name, institution, dates, position,
    contact person and e-mail) for each entry of the page that matches the
    expected layout.
    """

    # Close the connection explicitly instead of relying on the garbage
    # collector to do it for us.
    connection = urllib.urlopen('http://www.iaa.es/privado/visitas/')
    try:
        registro_visitas = connection.read()
    finally:
        connection.close()

    # The name of the visitor is in the second cell of a 'nlabel' table:
    # <table width="100%" border="0" cellspacing="0" cellpadding="0" class="nlabel">
    # <tr><td width="12" nowrap background="/img/label/ncl.gif">&nbsp;</td>
    # <td nowrap background="/img/label/ncc.gif">Josep Maria Masque</td>
    # <td width="12" nowrap background="/img/label/ncr.gif">&nbsp;</td>
    # <td width="600" background="/img/label/nbg.gif">&nbsp;</td></tr>
    # </table>
    #
    # It is followed by a second table with the rest of the fields:
    # <table width="600" border="0" cellspacing="0" cellpadding="0" class="head">
    # <tr><td width="12" nowrap>&nbsp;</td><td class="gbold">Stockholm University</td></tr>
    # <tr><td width="12" nowrap>&nbsp;</td><td class="bold">del 04 de Octubre de 2010, al 29 de Octubre de 2010</td></tr>
    # <tr><td width="12" nowrap>&nbsp;</td><td><span class="gbold">Cat.Prof.</span> Predoc<p>&nbsp;</p></td></tr>
    # <tr><td width="12" nowrap>&nbsp;</td><td><span class="bold">Persona contacto:</span> Romero Canizales, Cristina</td></tr>
    # <tr><td width="12" nowrap>&nbsp;</td><td><span class="bold">e-mail:</span> EMAIL</td></tr>
    # </table>

    # Raw strings, so that '\s' reaches the regular expression engine
    # without going through string-literal escape processing.
    regexp = re.compile(
        r'<table width="100%" border="0" cellspacing="0" cellpadding="0" class="nlabel">.*?'
        r'<tr>.*?</td>.*?'
        r'<td .*?>(.*?)</td>.*?'
        r'<td .*?</td>.*?'
        r'<td .*?</tr>.*?'
        r'</table>.*?'
        r'<table .*?>[\s]*'
        r'<tr><td .*?</td><td .*?>(.*?)</td></tr>[\s]*'
        r'<tr><td .*?</td><td .*?>(.*?)</td></tr>[\s]*'
        r'<tr><td .*?</td><td><span .*?</span>(.*?)<p>.*?</p></td></tr>[\s]*'
        r'<tr><td .*?</td><td><span .*?</span>(.*?)</td></tr>[\s]*'
        r'<tr><td .*?</td><td><span .*?</span>(.*?)</td></tr>[\s]*'
        r'</table>'
        , re.IGNORECASE | re.DOTALL)

    found_visitors = []
    for name, institution, dates, position, contact, email in regexp.findall(registro_visitas):
        found_visitors.append(Guest(name.strip(), institution.strip(),
                                    dates.strip(), position.strip(),
                                    contact.strip(), email.strip()))
    return found_visitors
242+
243+
def remove_duplicate_names(list_of_persons):
    """Return the persons in the list, keeping only the first one per name.

    list_of_persons -- an iterable of objects with a 'name' attribute.

    The relative order of the surviving elements is preserved. Names are
    kept in a set, so the whole operation is linear in the size of the list.
    """
    seen_names = set()
    unique_list = []
    for person in list_of_persons:
        if person.name not in seen_names:
            seen_names.add(person.name)
            unique_list.append(person)
    return unique_list
249+
250+
def name_search(target, encoding='utf-8'):
    """Return the persons whose name contains 'target'.

    target -- the text to look for, as a byte string in 'encoding'.

    The telephone directory, the associated staff and the visitors are
    queried. The comparison ignores case and accents, and only the first
    person found for each name is returned.
    """
    needle = remove_accents(target.decode(encoding)).lower()
    candidates = directorio_telefonico() + personal_vinculado() + visitantes()
    hits = [person for person in candidates
            if needle in person.name_wo_accents.lower()]
    return remove_duplicate_names(hits)
255+
256+
def search(target, encoding='utf-8'):
    """Return the persons whose name, phone or e-mail contains 'target'.

    target -- the text to look for, as a byte string in 'encoding'.

    The telephone directory, the associated staff and the visitors are
    queried. Accents are removed from the target and it is lower-cased, so
    names and e-mail addresses are compared in lower case too. The matches
    are returned sorted by name, keeping only the first person found for
    each name.
    """
    target = remove_accents(target.decode(encoding)).lower()
    query = directorio_telefonico() + personal_vinculado() + visitantes()

    def is_match(person):
        # The target is lower-cased, so the e-mail must be as well;
        # otherwise addresses with capital letters could never be found.
        return (target in person.name_wo_accents.lower() or
                target in person.phone or
                target in person.email.lower())

    matches = sorted((person for person in query if is_match(person)),
                     key = lambda x: x.name)
    return remove_duplicate_names(matches)
265+
266+
267+
if __name__ == "__main__":

    # Command-line entry point: look up the only argument in the directory
    # and print a table with the name, phone and e-mail of each match.
    # Exit codes: 0 on success (even if nothing was found), 2 on a wrong
    # number of arguments and 3 if the Python interpreter is too old.

    if sys.version_info < (2, 5):
        sys.stderr.write("Error: phone requires Python 2.5 or later :-(\n")
        sys.exit(3)

    description = (
        "phone is intended to serve as the white pages of the Institute of "
        "Astrophysics of Andalusia: a command-line application that queries the "
        "website in order to find the telephone number given a name or surname. "
        "Note that visiting and associate researchers are included in the "
        "search. The reverse telephone directory (grey pages) is also available, "
        "as searches may also be done by the phone number or e-mail address. All "
        "searches are case- and accent-insensitive.")

    parser = optparse.OptionParser(description = description)
    parser.usage = "%prog NAME | PHONE | EMAIL"

    # Let optparse process the command line, so that -h and --help show the
    # help message instead of being looked up in the directory.
    args = parser.parse_args()[1]
    if len(args) != 1:
        parser.print_help()
        sys.exit(2)

    target = args[0]
    matches = search(target)

    if not matches:
        sys.exit(0)

    # If the email address is at the @iaa.es domain (and most of the addresses
    # certainly will), only the username is shown
    domain = '@iaa.es'
    for match in matches:
        if match.email.endswith(domain):
            match.email = match.email[:-len(domain)]

    # Determine the max length of each column
    name_column_length = max(len(person.name) for person in matches)
    phone_column_length = max(len(person.phone) for person in matches)
    email_column_length = max(len(person.email) for person in matches)

    for person in matches:
        columns = [person.name.ljust(name_column_length),
                   person.phone.ljust(phone_column_length),
                   person.email.ljust(email_column_length)]
        print(' | '.join(columns))

    sys.exit(0)
313+

setup.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#! /usr/bin/env python
2+
3+
# Copyright (c) 2011 Victor Terron. All rights reserved.
4+
# Institute of Astrophysics of Andalusia, IAA-CSIC
5+
#
6+
# This file is part of phone.
7+
#
8+
# phone is free software: you can redistribute it and/or modify
9+
# it under the terms of the GNU General Public License as published by
10+
# the Free Software Foundation, either version 3 of the License, or
11+
# (at your option) any later version.
12+
#
13+
# This program is distributed in the hope that it will be useful,
14+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
# GNU General Public License for more details.
17+
#
18+
# You should have received a copy of the GNU General Public License
19+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
20+
21+
import distutils.core
22+
23+
# Metadata of the distribution; 'phone' is installed as a script.
_metadata = {
    'name': 'phone',
    'version': '0.9',
    'description': "White pages of the IAA-CSIC",
    'author': 'Victor Terron',
    # NOTE(review): this looks like a redacted placeholder rather than a
    # real address -- confirm and restore the actual e-mail.
    'author_email': '[email protected]',
    'url': 'http://www.iaa.es/~vterron/',
    'license': "GNU General Public License, version 3 (GPLv3)",
    'scripts': ['phone'],
}

distutils.core.setup(**_metadata)

0 commit comments

Comments
 (0)