Convert syntax to python3 and update the scrapping for the new websit… #2

Open. Wants to merge 1 commit into base: master.
20 changes: 6 additions & 14 deletions README
@@ -21,21 +21,13 @@ STAGE 4 files: create nice web-based UI (maybe using Google App Engine?) to make
-------------------------------------------------------------------------------
INSTALLATION
-------------------------------------------------------------------------------
1. Download pubs_nips from the site[*] (https://sites.google.com/site/researchpooler/downloads)
2. Browse around (project is young, no installation needed so far!)
3. Download/Install current Python dependencies:
pip install -r requirements.txt
python nips_download_parse.py
python nips_add_pdftext.py

BeautifulSoup [for easy and robust HTML parsing]
PDFMiner [for parsing PDFs and extracting text]
simplejson [OPTIONAL. for parsing outputs of Google searches using their API]

4. Enjoy the demos

[*] Instead of downloading the database you can also regenerate the pubs_nips database yourself using the two scripts I wrote. Simply run:
$> python nips_download_parse.py
(takes a few seconds) and then
$> python nips_add_pdftext.py
(takes potentially an hour or two because it has to download and parse all papers published at NIPS since 2003)
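A quick way to sanity-check a freshly regenerated database (a sketch, assuming repool_util.loadPubs unpickles the publication list, as the demo scripts do):
$> python -c "from repool_util import loadPubs; print(len(loadPubs('pubs_nips')), 'publications loaded')"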
-------------------------------------------------------------------------------
RUN using 'python demo1.py'
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
EXAMPLE USAGE
-------------------------------------------------------------------------------
93 changes: 49 additions & 44 deletions demo1.py
@@ -1,44 +1,49 @@
"""
Some examples of fun things that can be done using the current 'API'
"""

from repool_util import loadPubs, openPDFs

def demo1():
"""
You wrote an algorithm and benchmarked it on the MNIST dataset. You are
wondering how your results compare with those in the literature:
1. Finds all publications that mention mnist
2. Print out their titles
3. Open the three latest publications that mention it at least twice

Pre-requisites:
- Assumes 'pubs_nips' exists and that pdf text is present.
This can be obtained by running
nips_download_parse.py and then nips_add_pdftext.py, or by downloading it
from site (https://sites.google.com/site/researchpooler/home)

Side-effects:
- will use os call to open a pdf with default program
"""

print "loading the NIPS publications dataset..."
pubs = loadPubs('pubs_nips')

# get all papers that mention mnist
p = [x for x in pubs if 'mnist' in x.get('pdf_text',{})]
print "titles of papers that mention MNIST dataset:"
for x in p:
print x['title']
print "total of %d publications mention MNIST." %(len(p),)

# sort by number of occurences
occ = [(x['year'], x['pdf']) for i,x in enumerate(p) if x['pdf_text']['mnist']>1]
occ.sort(reverse = True)

# open the top 3 latest in browser
print "opening the top 3..."
openPDFs([x for year,x in occ[:3]])

if __name__ == '__main__':
demo1()
import webbrowser
import requests
from bs4 import BeautifulSoup
from repool_util import loadPubs


def lookup(titles):
    for title in titles:
        search_url = f"https://www.google.com/search?q={title}"
        try:
            response = requests.get(search_url)
            response.raise_for_status()

            # Parse the search results page using BeautifulSoup.
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the first search result link and open it in the default web browser.
            search_results = soup.find_all("a")
            for result in search_results:
                if result.get("href", "").startswith("/url?q="):
                    first_result_url = result.get("href")[7:].split('&')[0]  # Extract the URL.
                    webbrowser.open(first_result_url, new=2)
                    break  # Stop after opening the first result.
            else:
                print(f"No search results found for '{title}'")
        except Exception as e:
            print(f"Error opening the browser: {str(e)}")


options = ['venue', 'title', 'authors']
search_in = input("Choose what to search for " + str(options) + ": ").strip().lower()

if search_in not in options:
    print(f"Invalid option. Please choose from {', '.join(options)}")
else:
    word = input("Enter the word to search for: ").strip()

    if word:
        pubs = loadPubs('pubs_nips')
        google_it = 'title'

        # Collect titles of matching publications. 'authors' is stored as a
        # list of names, so join list values before substring matching.
        p = []
        for x in pubs:
            value = x.get(search_in, '')
            if isinstance(value, list):
                value = ' '.join(value)
            if word.lower() in value.lower():
                p.append(x[google_it])

        if p:
            print(f"Number of results found: {len(p)}")
            lookup(p)
        else:
            print('No results found for your search.')
    else:
        print('Please enter a word to search for.')
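For orientation, the search above walks over plain dictionaries; a hypothetical record with the fields the new demo1.py and nips_download_parse.py use looks roughly like this (values invented for illustration):

example_pub = {
    'title': 'An Example NeurIPS Paper',           # matched when searching 'title'
    'authors': ['First Author', 'Second Author'],  # a list, hence the join above
    'venue': 'NeurIPS 2022',                       # matched when searching 'venue'
}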
44 changes: 22 additions & 22 deletions nips_add_pdftext.py
@@ -1,55 +1,55 @@
"""
Standalone helper script.

Load nips pubs_ file, and adds to every paper its word counts under key
Load nips pubs_ file, and adds to every paper its word counts under key
'pdf_text'. The PDF for each paper is downloaded from NIPS site.
"""

from repool_util import loadPubs, savePubs, stringToWordDictionary
from pdf_read import convertPDF

pubs_all = loadPubs('pubs_nips')
print 'loaded pubs with %d entries.' % (len(pubs_all),)
print('loaded pubs with %d entries.' % (len(pubs_all),))

#possibly place restrictions on pubs to process here
# possibly place restrictions on pubs to process here
pubs = pubs_all

for i,p in enumerate(pubs):
#if the pdf url does not exist, in future this could possibly use google
#search to try to look up a link for the pdf first.
if p.has_key('pdf') and not p.has_key('pdf_text'):
for i, p in enumerate(pubs):

# if the pdf url does not exist, in future this could possibly use google
# search to try to look up a link for the pdf first.
if 'pdf' in p and 'pdf_text' not in p:

# try to open the PDF from downloaded location
processed = False
try:
floc = p['pdf'].index('NIPS')
fname = p['pdf'][floc:]
txt = convertPDF('downloads/'+fname)
processed = True
print 'found %s in file!' % (p['title'],)
print('found %s in file!' % (p['title'],))
except:
pass

if not processed:
# download the PDF and convert to text
try:
print 'downloading pdf for [%s] and parsing...' % (p.get('title', 'an un-titled paper'))
print('downloading pdf for [%s] and parsing...' % (p.get('title', 'an un-titled paper')))
txt = convertPDF(p['pdf'])
processed = True
print 'processed from url!'
print('processed from url!')
except:
print 'error: unable to open download the pdf from %s' % (p['pdf'],)
print 'skipping...'
print('error: unable to open download the pdf from %s' % (p['pdf'],))
print('skipping...')

if processed:
# convert to bag of words and store
try:
p['pdf_text'] = stringToWordDictionary(txt)
except:
print 'was unable to convert text to bag of words. Skipped.'
print '%d/%d = %.2f%% done.' % (i+1, len(pubs), 100*(i+1.0)/len(pubs))
savePubs('pubs_nips', pubs_all)
print('was unable to convert text to bag of words. Skipped.')


print('%d/%d = %.2f%% done.' % (i+1, len(pubs), 100*(i+1.0)/len(pubs)))

savePubs('pubs_nips', pubs_all)
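The word counts stored under 'pdf_text' come from repool_util.stringToWordDictionary, which is not part of this diff. A minimal sketch of what that helper presumably does (map each lowercased word of the extracted text to its count, which is the shape demo1's 'mnist' lookup expects):

import re
from collections import Counter

def stringToWordDictionary(txt):
    # split the extracted PDF text on non-letter characters and count each lowercased word
    words = re.findall(r"[a-z]+", txt.lower())
    return dict(Counter(words))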
152 changes: 47 additions & 105 deletions nips_download_parse.py
@@ -1,117 +1,59 @@
"""
Standalone helper script.

Parses NIPS proceedings for years 2003-2010, creates list of dictionaries
that store information about each publication, and saves the result as a
pickle in current directory called pubs_nips.
"""

import urllib
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
import urllib.request
from bs4 import BeautifulSoup
from repool_util import savePubs

pubs = []
warnings = []
for num in range(16, 24):
year = 1987 + num

url = "http://books.nips.cc/nips%d.html" % (num,)
print "downloading proceedings from NIPS year %d..." % (year,)
f = urllib.urlopen(url)
s = f.read()
f.close()

print "done. Parsing..."
soup = BeautifulSoup(s)
soup = soup.find('table', {'width' : '600'}) # find the main table HTML
soup = soup.contents[0].contents[0] # descend down <tr> and then <td>

# iterate over this giant linear dump they have on the proceedings site
venue = 'NIPS %d' % (year,)
new_pub = {}
old_count = len(pubs)
for item in soup.contents:

if isinstance(item, Tag):
if item.name == 'b':

# we stumbled by a new publication entry. If we were processing
# one before this, then commit that one first then continue
if new_pub:
if not new_pub.has_key('authors'):
warnings.append("oh oh no authors for publication... ")

if not new_pub.has_key('title'):
warnings.append("oh oh no title for publication... ")

new_pub['venue'] = venue
new_pub['year']= year
pubs.append(new_pub)

# start new publication dictionary
new_pub = {}
new_title = str(item.contents[0]) # descend down a <b> tag
new_title = new_title.replace('\n', '')
new_pub['title'] = new_title

if item.name == 'a':
modifier = str(item.contents[0]).strip()
if modifier == '[pdf]':
new_pub['pdf'] = str(item.attrs[0][1])
elif modifier == '[bibtex]':
new_pub['bibtex'] = str(item.attrs[0][1])
elif modifier == '[correction]':
new_pub['correction'] = str(item.attrs[0][1])
elif modifier == '[supplemental]':
new_pub['supplemental'] = str(item.attrs[0][1])
elif modifier == '[slide]':
new_pub['slide'] = str(item.attrs[0][1])
elif modifier == '[audio]':
new_pub['audio'] = str(item.attrs[0][1])
elif modifier == '[ps.gz]':
pass # ignore
elif modifier == '[djvu]':
pass # ignore
else:
warnings.append("warning: modifier %s skipped" %(modifier,))

if isinstance(item, NavigableString):
if len(str(item))>3:

# this is probably the line describing authors
author_str = str(item)
author_str = author_str.replace('\n', '') # remove newlines
author_list = author_str.split(',')
if new_pub.has_key('authors'):
warnings.append("we're in trouble... %s, but already have "\
"%s" % (str(item), new_pub['authors']))

new_pub['authors'] = [x.strip() for x in author_list]

# I hate myself a little for this
# TODO LATER_MAYBE: CODE CHUNK DUPLICATION
if not new_pub.has_key('authors'):
warnings.append("oh oh no authors for publication... ")
if not new_pub.has_key('title'):
warnings.append("oh oh no title for publication... ")
new_pub['venue'] = venue
new_pub['year']= year
pubs.append(new_pub)

print "read in %d publications for year %d." % (len(pubs) - old_count, year)


# Loop over years from 2006 to 2022
for year in range(2006, 2023):
    old_count = len(pubs)  # how many publications were collected before this year

    # Construct the URL for the specific year
    url = f"https://proceedings.neurips.cc/paper_files/paper/{year}"

    print(f"downloading proceedings from NIPS year {year}...")
    print(url)

    with urllib.request.urlopen(url) as f:
        s = f.read()

    print("done. Parsing...")
    soup = BeautifulSoup(s, 'html.parser')

    # Find the section containing publication information
    publication_section = soup.find('div', {'class': 'container-fluid'})

    # Iterate over each publication entry
    for publication_entry in publication_section.find_all('li', {'class': 'none'}):
        new_pub = {}

        # Extract title
        title_tag = publication_entry.find('a', {'title': 'paper title'})
        if title_tag:
            new_pub['title'] = title_tag.text.strip()

        # Extract authors
        authors_tag = publication_entry.find('i')
        if authors_tag:
            authors = authors_tag.text.strip().split(',')
            new_pub['authors'] = [author.strip() for author in authors]

        # Add publication to the list
        if new_pub:
            new_pub['venue'] = f'NeurIPS {year}'
            new_pub['year'] = year  # keep the 'year' field the old format provided
            pubs.append(new_pub)

    print(f"Read in {len(pubs) - old_count} publications for year {year}.")

# show warnings, if any were generated
if len(warnings)>0:
print "%d warnings:" % (len(warnings),)
if len(warnings) > 0:
print(f"{len(warnings)} warnings:")
for x in warnings:
print x
print(x)
else:
print "No warnings generated."
print("No warnings generated.")

# finally, save pickle as output
print "read in a total of %d publications." % (len(pubs),)
fname = "pubs_nips"
print "saving pickle in %s" % (fname,)
print("saving pickle in %s" % fname)
savePubs(fname, pubs)
print "all done."
print("all done.")
Binary file added pubs_nips
Binary file not shown.
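pubs_nips itself is just a pickled list of publication dictionaries (see the nips_download_parse.py docstring). If repool_util is not at hand, reading and writing it by hand should look roughly like this sketch; the real loadPubs/savePubs may differ in details:

import pickle

def loadPubs(fname):
    # read the pickled list of publication dictionaries
    with open(fname, 'rb') as f:
        return pickle.load(f)

def savePubs(fname, pubs):
    # write the list of publication dictionaries back to disk
    with open(fname, 'wb') as f:
        pickle.dump(pubs, f)

If the shipped pubs_nips was produced under Python 2, pickle.load may additionally need encoding='latin1' to decode its strings.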