Convert syntax to python3 and update the scrapping for the new websit… #2

Open. Wants to merge 1 commit into base: master.
20 changes: 6 additions & 14 deletions README
@@ -21,21 +21,13 @@ STAGE 4 files: create nice web-based UI (maybe using Google App Engine?) to make
-------------------------------------------------------------------------------
INSTALLATION
-------------------------------------------------------------------------------
1. Download pubs_nips from the site[*] (https://sites.google.com/site/researchpooler/downloads)
2. Browse around (project is young, no installation needed so far!)
3. Download/Install current Python dependencies:
pip install -r requirements.txt
python nips_download_parse.py
python nips_add_pdftext.py

BeautifulSoup [for easy and robust HTML parsing]
PDFMiner [for parsing PDFs and extracting text]
simplejson [OPTIONAL. for parsing outputs of Google searches using their API]

4. Enjoy the demos

[*] Instead of downloading the database you can also regenerate the pubs_nips database yourself using the two scripts I wrote. Simply run:
$> python nips_download_parse.py
(takes a few seconds) and then
$> python nips_add_pdftext.py
(takes potentially an hour or two because it has to download and parse all papers published at NIPS since 2003)
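A quick way to sanity-check a freshly regenerated database (a sketch, assuming repool_util.loadPubs unpickles the publication list, as the demo scripts do):
$> python -c "from repool_util import loadPubs; print(len(loadPubs('pubs_nips')), 'publications loaded')"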
-------------------------------------------------------------------------------
RUN using 'python demo1.py'
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
EXAMPLE USAGE
-------------------------------------------------------------------------------
93 changes: 49 additions & 44 deletions demo1.py
@@ -1,44 +1,49 @@
"""
Some examples of fun things that can be done using the current 'API'
"""

from repool_util import loadPubs, openPDFs

def demo1():
"""
You wrote an algorithm and benchmarked it on the MNIST dataset. You are
wondering how your results compare with those in the literature:
1. Finds all publications that mention mnist
2. Print out their titles
3. Open the three latest publications that mention it at least twice

Pre-requisites:
- Assumes 'pubs_nips' exists and that pdf text is present.
This can be obtained by running
nips_download_parse.py and then nips_add_pdftext.py, or by downloading it
from site (https://sites.google.com/site/researchpooler/home)

Side-effects:
- will use os call to open a pdf with default program
"""

print "loading the NIPS publications dataset..."
pubs = loadPubs('pubs_nips')

# get all papers that mention mnist
p = [x for x in pubs if 'mnist' in x.get('pdf_text',{})]
print "titles of papers that mention MNIST dataset:"
for x in p:
print x['title']
print "total of %d publications mention MNIST." %(len(p),)

# sort by number of occurences
occ = [(x['year'], x['pdf']) for i,x in enumerate(p) if x['pdf_text']['mnist']>1]
occ.sort(reverse = True)

# open the top 3 latest in browser
print "opening the top 3..."
openPDFs([x for year,x in occ[:3]])

if __name__ == '__main__':
demo1()
import webbrowser
import requests
from bs4 import BeautifulSoup
from repool_util import loadPubs


def lookup(titles):
    for title in titles:
        search_url = f"https://www.google.com/search?q={title}"
        try:
            response = requests.get(search_url)
            response.raise_for_status()

            # Parse the search results page using BeautifulSoup.
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the first search result link and open it in the default web browser.
            search_results = soup.find_all("a")
            for result in search_results:
                if result.get("href", "").startswith("/url?q="):
                    first_result_url = result.get("href")[7:].split('&')[0]  # Extract the URL.
                    webbrowser.open(first_result_url, new=2)
                    break  # Stop after opening the first result.
            else:
                print(f"No search results found for '{title}'")
        except Exception as e:
            print(f"Error opening the browser: {str(e)}")


options = ['venue', 'title', 'authors']
search_in = input("Choose what to search for " + str(options) + ": ").strip().lower()

if search_in not in options:
    print(f"Invalid option. Please choose from {', '.join(options)}")
else:
    word = input("Enter the word to search for: ").strip()

    if word:
        pubs = loadPubs('pubs_nips')
        google_it = 'title'

        # Collect titles of matching publications. 'authors' is stored as a
        # list of names, so join list values before substring matching.
        p = []
        for x in pubs:
            value = x.get(search_in, '')
            if isinstance(value, list):
                value = ' '.join(value)
            if word.lower() in value.lower():
                p.append(x[google_it])

        if p:
            print(f"Number of results found: {len(p)}")
            lookup(p)
        else:
            print('No results found for your search.')
    else:
        print('Please enter a word to search for.')
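For orientation, the search above walks over plain dictionaries; a hypothetical record with the fields the new demo1.py and nips_download_parse.py use looks roughly like this (values invented for illustration):

example_pub = {
    'title': 'An Example NeurIPS Paper',           # matched when searching 'title'
    'authors': ['First Author', 'Second Author'],  # a list, hence the join above
    'venue': 'NeurIPS 2022',                       # matched when searching 'venue'
}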
44 changes: 22 additions & 22 deletions nips_add_pdftext.py
@@ -1,55 +1,55 @@
"""
Standalone helper script.

Load nips pubs_ file, and adds to every paper its word counts under key
Load nips pubs_ file, and adds to every paper its word counts under key
'pdf_text'. The PDF for each paper is downloaded from NIPS site.
"""

from repool_util import loadPubs, savePubs, stringToWordDictionary
from pdf_read import convertPDF

pubs_all = loadPubs('pubs_nips')
print 'loaded pubs with %d entries.' % (len(pubs_all),)
print('loaded pubs with %d entries.' % (len(pubs_all),))

#possibly place restrictions on pubs to process here
# possibly place restrictions on pubs to process here
pubs = pubs_all

for i,p in enumerate(pubs):
#if the pdf url does not exist, in future this could possibly use google
#search to try to look up a link for the pdf first.
if p.has_key('pdf') and not p.has_key('pdf_text'):
for i, p in enumerate(pubs):

# if the pdf url does not exist, in future this could possibly use google
# search to try to look up a link for the pdf first.
if 'pdf' in p and 'pdf_text' not in p:

# try to open the PDF from downloaded location
processed = False
try:
floc = p['pdf'].index('NIPS')
fname = p['pdf'][floc:]
txt = convertPDF('downloads/'+fname)
processed = True
print 'found %s in file!' % (p['title'],)
print('found %s in file!' % (p['title'],))
except:
pass

if not processed:
# download the PDF and convert to text
try:
print 'downloading pdf for [%s] and parsing...' % (p.get('title', 'an un-titled paper'))
print('downloading pdf for [%s] and parsing...' % (p.get('title', 'an un-titled paper')))
txt = convertPDF(p['pdf'])
processed = True
print 'processed from url!'
print('processed from url!')
except:
print 'error: unable to open download the pdf from %s' % (p['pdf'],)
print 'skipping...'
print('error: unable to open download the pdf from %s' % (p['pdf'],))
print('skipping...')

if processed:
# convert to bag of words and store
try:
p['pdf_text'] = stringToWordDictionary(txt)
except:
print 'was unable to convert text to bag of words. Skipped.'
print '%d/%d = %.2f%% done.' % (i+1, len(pubs), 100*(i+1.0)/len(pubs))
savePubs('pubs_nips', pubs_all)
print('was unable to convert text to bag of words. Skipped.')


print('%d/%d = %.2f%% done.' % (i+1, len(pubs), 100*(i+1.0)/len(pubs)))

savePubs('pubs_nips', pubs_all)
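The word counts stored under 'pdf_text' come from repool_util.stringToWordDictionary, which is not part of this diff. A minimal sketch of what that helper presumably does (map each lowercased word of the extracted text to its count, which is the shape demo1's 'mnist' lookup expects):

import re
from collections import Counter

def stringToWordDictionary(txt):
    # split the extracted PDF text on non-letter characters and count each lowercased word
    words = re.findall(r"[a-z]+", txt.lower())
    return dict(Counter(words))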
152 changes: 47 additions & 105 deletions nips_download_parse.py
@@ -1,117 +1,59 @@
"""
Standalone helper script.

Parses NIPS proceedings for years 2003-2010, creates list of dictionaries
that store information about each publication, and saves the result as a
pickle in current directory called pubs_nips.
"""

import urllib
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
import urllib.request
from bs4 import BeautifulSoup
from repool_util import savePubs

pubs = []
warnings = []
for num in range(16, 24):
year = 1987 + num

url = "http://books.nips.cc/nips%d.html" % (num,)
print "downloading proceedings from NIPS year %d..." % (year,)
f = urllib.urlopen(url)
s = f.read()
f.close()

print "done. Parsing..."
soup = BeautifulSoup(s)
soup = soup.find('table', {'width' : '600'}) # find the main table HTML
soup = soup.contents[0].contents[0] # descend down <tr> and then <td>

# iterate over this giant linear dump they have on the proceedings site
venue = 'NIPS %d' % (year,)
new_pub = {}
old_count = len(pubs)
for item in soup.contents:

if isinstance(item, Tag):
if item.name == 'b':

# we stumbled by a new publication entry. If we were processing
# one before this, then commit that one first then continue
if new_pub:
if not new_pub.has_key('authors'):
warnings.append("oh oh no authors for publication... ")

if not new_pub.has_key('title'):
warnings.append("oh oh no title for publication... ")

new_pub['venue'] = venue
new_pub['year']= year
pubs.append(new_pub)

# start new publication dictionary
new_pub = {}
new_title = str(item.contents[0]) # descend down a <b> tag
new_title = new_title.replace('\n', '')
new_pub['title'] = new_title

if item.name == 'a':
modifier = str(item.contents[0]).strip()
if modifier == '[pdf]':
new_pub['pdf'] = str(item.attrs[0][1])
elif modifier == '[bibtex]':
new_pub['bibtex'] = str(item.attrs[0][1])
elif modifier == '[correction]':
new_pub['correction'] = str(item.attrs[0][1])
elif modifier == '[supplemental]':
new_pub['supplemental'] = str(item.attrs[0][1])
elif modifier == '[slide]':
new_pub['slide'] = str(item.attrs[0][1])
elif modifier == '[audio]':
new_pub['audio'] = str(item.attrs[0][1])
elif modifier == '[ps.gz]':
pass # ignore
elif modifier == '[djvu]':
pass # ignore
else:
warnings.append("warning: modifier %s skipped" %(modifier,))

if isinstance(item, NavigableString):
if len(str(item))>3:

# this is probably the line describing authors
author_str = str(item)
author_str = author_str.replace('\n', '') # remove newlines
author_list = author_str.split(',')
if new_pub.has_key('authors'):
warnings.append("we're in trouble... %s, but already have "\
"%s" % (str(item), new_pub['authors']))

new_pub['authors'] = [x.strip() for x in author_list]

# I hate myself a little for this
# TODO LATER_MAYBE: CODE CHUNK DUPLICATION
if not new_pub.has_key('authors'):
warnings.append("oh oh no authors for publication... ")
if not new_pub.has_key('title'):
warnings.append("oh oh no title for publication... ")
new_pub['venue'] = venue
new_pub['year']= year
pubs.append(new_pub)

print "read in %d publications for year %d." % (len(pubs) - old_count, year)


# Loop over years from 2006 to 2022
for year in range(2006, 2023):
    old_count = len(pubs)  # how many publications were collected before this year

    # Construct the URL for the specific year
    url = f"https://proceedings.neurips.cc/paper_files/paper/{year}"

    print(f"downloading proceedings from NIPS year {year}...")
    print(url)

    with urllib.request.urlopen(url) as f:
        s = f.read()

    print("done. Parsing...")
    soup = BeautifulSoup(s, 'html.parser')

    # Find the section containing publication information
    publication_section = soup.find('div', {'class': 'container-fluid'})

    # Iterate over each publication entry
    for publication_entry in publication_section.find_all('li', {'class': 'none'}):
        new_pub = {}

        # Extract title
        title_tag = publication_entry.find('a', {'title': 'paper title'})
        if title_tag:
            new_pub['title'] = title_tag.text.strip()

        # Extract authors
        authors_tag = publication_entry.find('i')
        if authors_tag:
            authors = authors_tag.text.strip().split(',')
            new_pub['authors'] = [author.strip() for author in authors]

        # Add publication to the list
        if new_pub:
            new_pub['venue'] = f'NeurIPS {year}'
            new_pub['year'] = year  # keep the 'year' field the old format provided
            pubs.append(new_pub)

    print(f"Read in {len(pubs) - old_count} publications for year {year}.")

# show warnings, if any were generated
if len(warnings)>0:
print "%d warnings:" % (len(warnings),)
if len(warnings) > 0:
print(f"{len(warnings)} warnings:")
for x in warnings:
print x
print(x)
else:
print "No warnings generated."
print("No warnings generated.")

# finally, save pickle as output
print "read in a total of %d publications." % (len(pubs),)
fname = "pubs_nips"
print "saving pickle in %s" % (fname,)
print("saving pickle in %s" % fname)
savePubs(fname, pubs)
print "all done."
print("all done.")
Binary file added pubs_nips
Binary file not shown.
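pubs_nips itself is just a pickled list of publication dictionaries (see the nips_download_parse.py docstring). If repool_util is not at hand, reading and writing it by hand should look roughly like this sketch; the real loadPubs/savePubs may differ in details:

import pickle

def loadPubs(fname):
    # read the pickled list of publication dictionaries
    with open(fname, 'rb') as f:
        return pickle.load(f)

def savePubs(fname, pubs):
    # write the list of publication dictionaries back to disk
    with open(fname, 'wb') as f:
        pickle.dump(pubs, f)

If the shipped pubs_nips was produced under Python 2, pickle.load may additionally need encoding='latin1' to decode its strings.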