Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions download_repo_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
The output is github_repositories.csv
'''

import logging
import os
import json
import time
Expand All @@ -31,7 +32,7 @@
def save_ckpt(lower_bound: int, upper_bound: int, repos: list = None):
    """Persist download progress to `repo_ckpt.pkl`.

    Deduplicates the global `repo_list` and pickles the tuple
    `(lower_bound, upper_bound, repo_list)` so an interrupted run can
    resume from the last completed size range.

    Args:
        lower_bound: inclusive lower end of the repo-size range being processed.
        upper_bound: inclusive upper end of the repo-size range being processed.
        repos: optional explicit repository list; when given it replaces the
            module-level `repo_list` before saving. One existing call site
            passes the list positionally, so this parameter keeps that call
            from raising TypeError while remaining backward compatible.
    """
    global repo_list
    if repos is not None:
        repo_list = repos
    # Deduplicate; note set() does not preserve the original ordering.
    repo_list = list(set(repo_list))
    # Lazy %-formatting defers string building until the record is emitted;
    # the tuple renders identically to the old f-string output.
    logging.info("Saving checkpoint %s...", (lower_bound, upper_bound))
    with open('repo_ckpt.pkl', 'wb') as f:
        pickle.dump((lower_bound, upper_bound, repo_list), f)

Expand All @@ -45,9 +46,9 @@ def get_request(lower_bound: int, upper_bound: int, page: int = 1):
)

if r.status_code == 403:
print('API rate limit exceeded.')
logging.error('API rate limit exceeded.')
save_ckpt(lower_bound, upper_bound, repo_list)
print('Exiting program.')
logging.info('Exiting program.')
exit()
elif r.status_code == 422:
# No more pages available
Expand All @@ -56,16 +57,16 @@ def get_request(lower_bound: int, upper_bound: int, page: int = 1):
try:
assert r.status_code == 200
except:
print(f'Unexpected status code. Status code returned is {r.status_code}')
print(r.text)
logging.error(f'Unexpected status code. Status code returned is {r.status_code}')
logging.info(r.text)
save_ckpt(lower_bound, upper_bound)
print("Exiting program.")
logging.info("Exiting program.")
exit()

REMAINING_REQUESTS -= 1

if REMAINING_REQUESTS == 0:
print("Sleeping 60 seconds to stay under GitHub API rate limit...")
logging.info("Sleeping 60 seconds to stay under GitHub API rate limit...")
time.sleep(60)
save_ckpt(lower_bound, upper_bound)
REMAINING_REQUESTS = 30
Expand Down Expand Up @@ -102,14 +103,14 @@ def download_range(lower_bound, upper_bound):
# Load checkpoint
with open('repo_ckpt.pkl', 'rb') as f:
lower_bound, upper_bound, repo_list = pickle.load(f)
print(f"Loading from {lower_bound}..{upper_bound}")
logging.info(f"Loading from {lower_bound}..{upper_bound}")
else:
lower_bound = 0
upper_bound = 5
repo_list = []

if lower_bound >= 10000000:
print('''
logging.info('''
Checkpoint is for an already completed download of GitHub repository information.
Please delete `repo_ckpt.pkl` to restart and try again.
''')
Expand Down Expand Up @@ -140,13 +141,13 @@ def download_range(lower_bound, upper_bound):
# Update the slope of our linear approximation
slope = n_results/(upper_bound - lower_bound)

print(f'size {lower_bound}..{upper_bound} ~> {n_results} results')
logging.info(f'size {lower_bound}..{upper_bound} ~> {n_results} results')
# If we get <= 1000 results over the range, exit the search loop
# and download all repository names over the range
if n_results <= 1000:
break

print(f"Downloading repositories in size range {lower_bound}..{upper_bound}")
logging.info(f"Downloading repositories in size range {lower_bound}..{upper_bound}")
download_range(lower_bound, upper_bound)
lower_bound = upper_bound + 1

Expand Down
3 changes: 2 additions & 1 deletion download_repo_text.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import chardet
import magic
import lm_dataformat as lmd
Expand Down Expand Up @@ -77,7 +78,7 @@ def get_content(f):
# something went horribly wrong!
...
except:
print(type, f, enc)
logging.info(type, f, enc)
traceback.print_exc()
time.sleep(0.1)
return
Expand Down
3 changes: 2 additions & 1 deletion download_repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Downloads all the repositories listed in repo_names.csv
'''

import logging
import os
import csv
from tqdm import tqdm
Expand All @@ -13,7 +14,7 @@ def download_repo(repo):
if file_name not in os.listdir("output/"):
os.system(f'git clone --depth 1 --single-branch https://github.com/{repo} output/{file_name}')
else:
print(f"Already downloaded {repo}")
logging.info(f"Already downloaded {repo}")

with open('github_repositories.csv', 'r') as f:
csv_reader = csv.reader(f)
Expand Down