5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
*.tmp
*.pkl
*.csv
github-repos-c-cpp-80/
error_log.txt
17 changes: 17 additions & 0 deletions README.md
@@ -34,3 +34,20 @@ The way we do that here is by restricting the minimum and maximum size of all th
- [ ] Better command-line interface
- [ ] Update requirements.txt

## Stats

With the original repositories list:
- 2,641,167,930 lines
- 10,001,770,063 tokens
- 106,585,972,613 characters

With the original repositories list, C/C++ projects only:
- 2,535,193,093 lines
- 9,587,816,068 tokens
- 102,250,174,187 characters

With the updated repositories list (80-star minimum), C/C++ projects only:
- 3,330,647,659 lines
- 12,636,857,650 tokens
- 137,160,720,047 characters
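
The counts above were produced with tooling outside this PR. As a rough sketch of how such corpus statistics could be gathered (the GPT-2 tokenizer and the `github-repos-c-cpp-80/` clone directory are assumptions, not the actual measurement setup):

```python
# Minimal corpus-statistics sketch. Assumptions: the corpus lives under
# github-repos-c-cpp-80/ and tokens are counted with a GPT-2 tokenizer;
# neither is stated in this PR.
import os
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def corpus_stats(root):
    lines = tokens = chars = 0
    for curdir, _, files in os.walk(root):
        for fname in files:
            path = os.path.join(curdir, fname)
            try:
                with open(path, encoding="utf-8", errors="ignore") as f:
                    text = f.read()
            except OSError:
                continue
            lines += text.count("\n")
            chars += len(text)
            tokens += len(tokenizer.encode(text))
    return lines, tokens, chars

print(corpus_stats("github-repos-c-cpp-80/"))
```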

42 changes: 42 additions & 0 deletions cleanup.py
@@ -0,0 +1,42 @@
import csv
import os
import shutil
import sys

def contains_subdirectories(directory_path):
    entries = os.listdir(directory_path)

    for entry in entries:
        entry_path = os.path.join(directory_path, entry)
        if os.path.isdir(entry_path):
            return True

    return False

def process_csv_file(filename):
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for path, _, lang in reader:
            # Keep C and C++ repositories; everything else is removed
            if lang == 'C' or lang == 'C++':
                continue
            if os.path.exists(path):
                print(path + " " + lang)
                print(f"Remove {path}: {lang}", file=sys.stderr)
                shutil.rmtree(path, ignore_errors=True)

                # Split the path into directory components
                components = path.split('/')

                # Drop the last component (the repository name) to get its parent directory
                components.pop(-1)

                # Reconstruct the parent directory path
                modified_path = '/'.join(components)
                if os.path.exists(modified_path) and not contains_subdirectories(
                        modified_path):
                    print(f"Remove {modified_path}: {lang} and empty", file=sys.stderr)
                    shutil.rmtree(modified_path, ignore_errors=True)

# Replace the path below with your actual CSV file name
process_csv_file('../github_repositories_20230830_100stars.csv')
75 changes: 48 additions & 27 deletions download_repo_info.py
@@ -2,22 +2,27 @@
download_repo_info.py

Downloads information about all GitHub repositories
with at least 100 stars (by default) that are less than a gigabyte in size.
Each data record has the repository's name, number of stars, and top language.
The default output is github_repositories.csv
'''
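# Example of a single output record (format name,stars,language; the star count
# in this placeholder is not real data): noanabeshima/github-scraper,1,Python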

import argparse
import json
import math
import os
import pickle
import requests
import sys
import time

from tqdm import tqdm

#~~~~~~~~~~~~~~~~~~
USER = "noanabeshima"
TOKEN = "14d353dfb27b03c5de0cbe56bab154cf6713dde2"
USER = os.environ.get('GITHUB_USER')
if USER is None:
    raise RuntimeError("Please define the GITHUB_USER environment variable")
TOKEN = os.environ.get('GITHUB_TOKEN')
if TOKEN is None:
    raise RuntimeError("Please define the GITHUB_TOKEN environment variable")
#~~~~~~~~~~~~~~~~~~


@@ -35,18 +40,17 @@ def save_ckpt(lower_bound: int, upper_bound: int):
    with open('repo_ckpt.pkl', 'wb') as f:
        pickle.dump((lower_bound, upper_bound, repo_list), f)

def get_request(lower_bound: int, upper_bound: int, page: int = 1):
    # Returns a request object from querying GitHub
def get_request(lower_bound: int, upper_bound: int, page: int = 1, stars: int = 100):
    # Returns a request object from querying GitHub
    # for repos in-between size lower_bound and size upper_bound with over 100 stars.
    global REMAINING_REQUESTS, USER, TOKEN, repo_list
    r = requests.get(
        f'https://api.github.com/search/repositories?q=size:{lower_bound}..{upper_bound}+stars:>100&per_page=100&page={page}',
        auth = (USER, TOKEN)
    )
    query = f'https://api.github.com/search/repositories?q=+size:{lower_bound}..{upper_bound}+stars:>{stars}&per_page=100&page={page}'
    # print(query)
    r = requests.get(query, auth = (USER, TOKEN))

    if r.status_code == 403:
        print('API rate limit exceeded.')
        save_ckpt(lower_bound, upper_bound, repo_list)
        save_ckpt(lower_bound, upper_bound)
        print('Exiting program.')
        exit()
    elif r.status_code == 422:
@@ -58,44 +62,60 @@ def get_request(lower_bound: int, upper_bound: int, page: int = 1):
    except:
        print(f'Unexpected status code. Status code returned is {r.status_code}')
        print(r.text)
        save_ckpt(lower_bound, upper_bound, repo_list)
        save_ckpt(lower_bound, upper_bound)
        print("Exiting program.")
        exit()

    REMAINING_REQUESTS -= 1

    if REMAINING_REQUESTS == 0:
        print("Sleeping 60 seconds to stay under GitHub API rate limit...")
        time.sleep(60)
        save_ckpt(lower_bound, upper_bound)
        REMAINING_REQUESTS = 30

    # print(json.dumps(r.json()), file=sys.stderr)
    return r


def download_range(lower_bound, upper_bound):
def download_range(lower_bound, upper_bound, stars: int = 100):
    # Saves the names of repositories on GitHub to repo_list
    # in-between size minimum and maximum with over 100 stars.
    global repo_list
    # Github page options start at index 1.
    for page in range(1, 11):
        r = get_request(lower_bound=lower_bound, upper_bound=upper_bound, page=page)
        r = get_request(lower_bound=lower_bound, upper_bound=upper_bound, page=page,
                        stars=stars)

        # print(f"({lower_bound}, {upper_bound}, {page}, {stars})", file=sys.stderr)
        if page == 1:
            n_results = r.json()['total_count']
            n_query_pages = min(math.ceil(n_results/100), 10)  # GitHub API capped at 1000 results

        for repository in r.json()['items']:
            name = repository['full_name']
            stars = repository['stargazers_count']
            stars_count = repository['stargazers_count']
            lang = repository['language']
            repo_list.append((name, stars, lang))  # eg (noanabeshima/github-scraper, 1, Python)
            # print(f"({name}, {stars_count}, {lang})", file=sys.stderr)
            repo_list.append((name, stars_count, lang))  # eg (noanabeshima/github-scraper, 1, Python)

        if page >= n_query_pages:
            # No more pages available
            return n_results
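
# Note on the hard 10-page limit above: the search endpoint returns at most 100
# items per page and at most 1,000 results per query, so pages 1..10 cover
# everything a single size range can ever yield.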

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog="Download",
        description=('Downloads information about all GitHub repositories '
                     'with at least 100 stars (by default) that are less than a '
                     'gigabyte in size. Each data record has the repository\'s name, number of '
                     'stars, and top language. The default output is github_repositories.csv')
    )

    parser.add_argument("-s", "--stars", type=int, default=100,
                        help="The minimum number of stars.")
    parser.add_argument("-o", "--output", type=str, default='github_repositories.csv',
                        help="The output file.")
    args = parser.parse_args()
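    # Example invocation (the 80-star value and output name are illustrative):
    #   python download_repo_info.py --stars 80 --output github_repositories_80stars.csv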
    # If pickled checkpoint exists, load it.
    # Otherwise, initialize repo_list as an empty list
    if 'repo_ckpt.pkl' in os.listdir():
@@ -115,13 +135,13 @@ def download_range(lower_bound, upper_bound):
        ''')
        exit()


    r = get_request(lower_bound, upper_bound)

    # Initial number of results
    n_results = r.json()['total_count']
    # Initial slope for our linear approximation.
    slope = n_results/(upper_bound-lower_bound)
    slope = n_results/(upper_bound - lower_bound if upper_bound > lower_bound else 1)

    # Main loop.
    # Breaks when all repositories considered are greater in size than a gigabyte
@@ -132,27 +152,28 @@
        # Update upper bound to be the linear-approximation guess for what range will return 1000 results
        # As GitHub repositories follow a power distribution, this tends to be an underestimate.
        upper_bound = math.floor((1000/slope) + lower_bound)
        upper_bound = max(upper_bound, lower_bound + 1)
        upper_bound = max(upper_bound, lower_bound)
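        # Worked example with made-up numbers: if probing sizes 1..5000 (KB) returned
        # 2,500 repositories, slope = 2500/5000 = 0.5 results per KB, so the next guess
        # is upper_bound = floor(1000/0.5 + 1) = 2001, i.e. the range expected to return
        # roughly the 1,000 results a single ten-page search walk can cover.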

        # How many results are there at our guess?
        n_results = get_request(lower_bound, upper_bound).json()['total_count']

        # Update the slope of our linear approximation
        slope = n_results/(upper_bound - lower_bound)
        slope = n_results/(upper_bound - lower_bound if upper_bound > lower_bound else 1)

        print(f'size {lower_bound}..{upper_bound} ~> {n_results} results')
        print(f'size {lower_bound}..{upper_bound} ~> {n_results} results. slope: {slope}')
        # If we get <= 1000 results over the range, exit the search loop
        # and download all repository names over the range
        if n_results <= 1000:
        if n_results <= 1000 or upper_bound == lower_bound:
        # if n_results <= 1000:
            break

        print(f"Downloading repositories in size range {lower_bound}..{upper_bound}")
        download_range(lower_bound, upper_bound)
        download_range(lower_bound, upper_bound, args.stars)
        lower_bound = upper_bound + 1

    save_ckpt(lower_bound, upper_bound)

    with open('github_repositories.csv', 'w') as f:
    with open(args.output, 'w') as f:
        for repo in repo_list:
            name, stars, lang = repo
            f.write(f'{name},{stars},{lang}\n')
2 changes: 1 addition & 1 deletion download_repo_text.py
@@ -172,7 +172,7 @@ def _process_repo(repo_data, repodir):
    for curdir, dirs, files in os.walk(repodir):

        files = [curdir + '/' + f for f in files if '.git' not in f and f[
            0] is not '.' and 'LICENSE' not in f and 'node_modules' not in f and '.min.' not in f and f.split('.')[
            0] != '.' and 'LICENSE' not in f and 'node_modules' not in f and '.min.' not in f and f.split('.')[
            -1] not in bad_extensions]

        filenames = [f.split("/")[-1] for f in files]
4 changes: 2 additions & 2 deletions download_repos.py
@@ -24,5 +24,5 @@ def download_repo(repo):


repo_names = [repo[0] for repo in repositories]
Parallel(n_jobs=40, prefer="threads")(
    delayed(download_repo)(name) for name in tqdm(repo_names))
Parallel(n_jobs=16, prefer="threads")(
    delayed(download_repo)(name) for name in tqdm(repo_names))
96 changes: 96 additions & 0 deletions download_repos_parallel.py
@@ -0,0 +1,96 @@
'''
download_repos_parallel.py
Downloads all the repositories listed in repo_names.csv
'''

import argparse
import concurrent.futures
import csv
import os
import subprocess
import sys

from tqdm import tqdm

# Initialize the ArgumentParser
parser = argparse.ArgumentParser(
    description="Clone GitHub repositories based on criteria")
parser.add_argument('-f', '--repo-file', dest='repo_file',
                    help='Path to the CSV file containing repository info (repo,stars,language)',
                    default="github_repositories.csv")
parser.add_argument('-s', '--min_stars', type=int, default=100,
                    help='Minimum number of stars to consider')
parser.add_argument('-l', '--languages', action='append',
                    default=None,
                    help='List of accepted languages')
parser.add_argument('--output_dir', default='cloned_repos',
                    help='Directory to store the cloned repositories')
parser.add_argument('--error_log', default='error_log.txt',
                    help='Path to the error log file')
parser.add_argument('--num_threads', type=int, default=10,
                    help='Number of threads for parallel execution')
args = parser.parse_args()
print(f"min stars: {args.min_stars}", file=sys.stderr)
print(f"languages: {args.languages}", file=sys.stderr)

NUM_PARALLEL = args.num_threads
CLONE_DIR = args.output_dir
ERROR_LOG = args.error_log

my_env = os.environ.copy()
my_env["GIT_TERMINAL_PROMPT"] = "0"

def clone_repository(repo):
    try:
        repo_name, stars, language = repo
        repo_url = f"https://github.com/{repo_name}"
        clone_path = os.path.join(CLONE_DIR, repo_name)

        if os.path.exists(clone_path):
            return f"{repo_name}", f"{repo_name} (already present)"
        if int(stars) < args.min_stars:
            return None, f"Filtered {repo_name}, stars"
        if args.languages and language not in args.languages:
            return None, f"Filtered {repo_name}, language ({language})"

        os.makedirs(clone_path, exist_ok=True)

        with (open(f'{clone_path}_output.log', 'w') as stdout_file,
              open(f'{clone_path}_error.log', 'w') as stderr_file):
            result = subprocess.run(['git', 'clone', "--depth", "1", "--single-branch",
                                     repo_url, clone_path],
                                    stdout=stdout_file, stderr=stderr_file, text=True,
                                    env=my_env)
        if result.returncode == 0:
            return f"{repo_name}", f"Cloned {repo_name}"
        else:
            return None, f"Failed {repo_name}"
    except Exception as e:
        with open(ERROR_LOG, 'a') as error_log:
            error_log.write(f"Error cloning {repo_url}: {e}\n")
        return None, f"Error cloning {repo_url}: {e}"
    return None, "Nothing done"

with open(args.repo_file, 'r') as f:
    csv_reader = csv.reader(f)
    repositories = list(map(tuple, csv_reader))

os.makedirs(CLONE_DIR, exist_ok=True)
open(ERROR_LOG, 'w').close()

# Use a list to store the results of the cloning attempts
results = []

# Use tqdm to create a progress bar
with tqdm(total=len(repositories), desc="Cloning Repositories", leave=False) as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_PARALLEL) as executor:
        # Using 'map' with 'tqdm' to track progress
        for result, message in executor.map(clone_repository, repositories):
            if result:
                results.append(result)
            pbar.write(message)
            pbar.update(1)  # Increment the progress bar


print("Cloning complete. Check the error log for details.")
print("Successfully cloned repositories:", results)