diff --git a/download_repo_info.py b/download_repo_info.py index 000ad46..f55792e 100644 --- a/download_repo_info.py +++ b/download_repo_info.py @@ -7,6 +7,7 @@ The output is github_repositories.csv ''' +import logging import os import json import time @@ -31,7 +32,7 @@ def save_ckpt(lower_bound: int, upper_bound: int): global repo_list repo_list = list(set(repo_list)) # remove duplicates - print(f"Saving checkpoint {lower_bound, upper_bound}...") + logging.info(f"Saving checkpoint {lower_bound, upper_bound}...") with open('repo_ckpt.pkl', 'wb') as f: pickle.dump((lower_bound, upper_bound, repo_list), f) @@ -45,9 +46,9 @@ def get_request(lower_bound: int, upper_bound: int, page: int = 1): ) if r.status_code == 403: - print('API rate limit exceeded.') + logging.error('API rate limit exceeded.') save_ckpt(lower_bound, upper_bound, repo_list) - print('Exiting program.') + logging.info('Exiting program.') exit() elif r.status_code == 422: # No more pages available @@ -56,16 +57,16 @@ def get_request(lower_bound: int, upper_bound: int, page: int = 1): try: assert r.status_code == 200 except: - print(f'Unexpected status code. Status code returned is {r.status_code}') - print(r.text) + logging.error(f'Unexpected status code. 
Status code returned is {r.status_code}') + logging.info(r.text) save_ckpt(lower_bound, upper_bound) - print("Exiting program.") + logging.info("Exiting program.") exit() REMAINING_REQUESTS -= 1 if REMAINING_REQUESTS == 0: - print("Sleeping 60 seconds to stay under GitHub API rate limit...") + logging.info("Sleeping 60 seconds to stay under GitHub API rate limit...") time.sleep(60) save_ckpt(lower_bound, upper_bound) REMAINING_REQUESTS = 30 @@ -102,14 +103,14 @@ def download_range(lower_bound, upper_bound): # Load checkpoint with open('repo_ckpt.pkl', 'rb') as f: lower_bound, upper_bound, repo_list = pickle.load(f) - print(f"Loading from {lower_bound}..{upper_bound}") + logging.info(f"Loading from {lower_bound}..{upper_bound}") else: lower_bound = 0 upper_bound = 5 repo_list = [] if lower_bound >= 10000000: - print(''' + logging.info(''' Checkpoint is for an already completed download of GitHub repository information. Please delete `repo_ckpt.pkl` to restart and try again. ''') @@ -140,13 +141,13 @@ def download_range(lower_bound, upper_bound): # Update the slope of our linear approximation slope = n_results/(upper_bound - lower_bound) - print(f'size {lower_bound}..{upper_bound} ~> {n_results} results') + logging.info(f'size {lower_bound}..{upper_bound} ~> {n_results} results') # If we get <= 1000 results over the range, exit the search loop # and download all repository names over the range if n_results <= 1000: break - print(f"Downloading repositories in size range {lower_bound}..{upper_bound}") + logging.info(f"Downloading repositories in size range {lower_bound}..{upper_bound}") download_range(lower_bound, upper_bound) lower_bound = upper_bound + 1 diff --git a/download_repo_text.py b/download_repo_text.py index 41aefdb..564abb9 100644 --- a/download_repo_text.py +++ b/download_repo_text.py @@ -1,3 +1,4 @@ +import logging import chardet import magic import lm_dataformat as lmd @@ -77,7 +78,7 @@ def get_content(f): # something went horribly wrong! ... 
except: - print(type, f, enc) + logging.error("%s %s %s", type, f, enc) traceback.print_exc() time.sleep(0.1) return diff --git a/download_repos.py b/download_repos.py index d9658b4..4be01b4 100644 --- a/download_repos.py +++ b/download_repos.py @@ -3,6 +3,7 @@ Downloads all the repositories listed in repo_names.csv ''' +import logging import os import csv from tqdm import tqdm @@ -13,7 +14,7 @@ def download_repo(repo): if file_name not in os.listdir("output/"): os.system(f'git clone --depth 1 --single-branch https://github.com/{repo} output/{file_name}') else: - print(f"Already downloaded {repo}") + logging.info(f"Already downloaded {repo}") with open('github_repositories.csv', 'r') as f: csv_reader = csv.reader(f)