5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
*.tmp
*.pkl
*.csv
github-repos-c-cpp-80/
error_log.txt
17 changes: 17 additions & 0 deletions README.md
@@ -34,3 +34,20 @@ The way we do that here is by restricting the minimum and maximum size of all th
- [ ] Better command-line interface
- [ ] Update requirements.txt

## Stats

With the original repositories list:
- 2,641,167,930 lines
- 10,001,770,063 tokens
- 106,585,972,613 characters

With the original repositories list, C/C++ projects only:
- 2,535,193,093 lines
- 9,587,816,068 tokens
- 102,250,174,187 characters

With the updated repositories list (80-star minimum), C/C++ projects only:
- 3,330,647,659 lines
- 12,636,857,650 tokens
- 137,160,720,047 characters
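
The counts above were produced with tooling outside this PR. As a rough sketch of how such corpus statistics could be gathered (the GPT-2 tokenizer and the `github-repos-c-cpp-80/` clone directory are assumptions, not the actual measurement setup):

```python
# Minimal corpus-statistics sketch. Assumptions: the corpus lives under
# github-repos-c-cpp-80/ and tokens are counted with a GPT-2 tokenizer;
# neither is stated in this PR.
import os
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def corpus_stats(root):
    lines = tokens = chars = 0
    for curdir, _, files in os.walk(root):
        for fname in files:
            path = os.path.join(curdir, fname)
            try:
                with open(path, encoding="utf-8", errors="ignore") as f:
                    text = f.read()
            except OSError:
                continue
            lines += text.count("\n")
            chars += len(text)
            tokens += len(tokenizer.encode(text))
    return lines, tokens, chars

print(corpus_stats("github-repos-c-cpp-80/"))
```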

42 changes: 42 additions & 0 deletions cleanup.py
@@ -0,0 +1,42 @@
import csv
import os
import shutil
import sys

def contains_subdirectories(directory_path):
    entries = os.listdir(directory_path)

    for entry in entries:
        entry_path = os.path.join(directory_path, entry)
        if os.path.isdir(entry_path):
            return True

    return False

def process_csv_file(filename):
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for path, _, lang in reader:
            # Keep C and C++ repositories; everything else is removed
            if lang == 'C' or lang == 'C++':
                continue
            if os.path.exists(path):
                print(path + " " + lang)
                print(f"Remove {path}: {lang}", file=sys.stderr)
                shutil.rmtree(path, ignore_errors=True)

                # Split the path into directory components
                components = path.split('/')

                # Drop the last component (the repository name) to get its parent directory
                components.pop(-1)

                # Reconstruct the parent directory path
                modified_path = '/'.join(components)
                if os.path.exists(modified_path) and not contains_subdirectories(
                        modified_path):
                    print(f"Remove {modified_path}: {lang} and empty", file=sys.stderr)
                    shutil.rmtree(modified_path, ignore_errors=True)

# Replace the path below with your actual CSV file name
process_csv_file('../github_repositories_20230830_100stars.csv')
75 changes: 48 additions & 27 deletions download_repo_info.py
@@ -2,22 +2,27 @@
download_repo_info.py

Downloads information about all GitHub repositories
with at least 100 stars (by default) that are less than a gigabyte in size.
Each data record has the repository's name, number of stars, and top language.
The default output is github_repositories.csv
'''
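# Example of a single output record (format name,stars,language; the star count
# in this placeholder is not real data): noanabeshima/github-scraper,1,Python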

import argparse
import json
import math
import os
import pickle
import requests
import sys
import time

from tqdm import tqdm

#~~~~~~~~~~~~~~~~~~
USER = "noanabeshima"
TOKEN = "14d353dfb27b03c5de0cbe56bab154cf6713dde2"
USER = os.environ.get('GITHUB_USER')
if USER is None:
    raise RuntimeError("Please define the GITHUB_USER environment variable")
TOKEN = os.environ.get('GITHUB_TOKEN')
if TOKEN is None:
    raise RuntimeError("Please define the GITHUB_TOKEN environment variable")
#~~~~~~~~~~~~~~~~~~


@@ -35,18 +40,17 @@ def save_ckpt(lower_bound: int, upper_bound: int):
    with open('repo_ckpt.pkl', 'wb') as f:
        pickle.dump((lower_bound, upper_bound, repo_list), f)

def get_request(lower_bound: int, upper_bound: int, page: int = 1):
    # Returns a request object from querying GitHub
def get_request(lower_bound: int, upper_bound: int, page: int = 1, stars: int = 100):
    # Returns a request object from querying GitHub
    # for repos in-between size lower_bound and size upper_bound with over 100 stars.
    global REMAINING_REQUESTS, USER, TOKEN, repo_list
    r = requests.get(
        f'https://api.github.com/search/repositories?q=size:{lower_bound}..{upper_bound}+stars:>100&per_page=100&page={page}',
        auth = (USER, TOKEN)
    )
    query = f'https://api.github.com/search/repositories?q=+size:{lower_bound}..{upper_bound}+stars:>{stars}&per_page=100&page={page}'
    # print(query)
    r = requests.get(query, auth = (USER, TOKEN))

    if r.status_code == 403:
        print('API rate limit exceeded.')
        save_ckpt(lower_bound, upper_bound, repo_list)
        save_ckpt(lower_bound, upper_bound)
        print('Exiting program.')
        exit()
    elif r.status_code == 422:
@@ -58,44 +62,60 @@ def get_request(lower_bound: int, upper_bound: int, page: int = 1):
    except:
        print(f'Unexpected status code. Status code returned is {r.status_code}')
        print(r.text)
        save_ckpt(lower_bound, upper_bound, repo_list)
        save_ckpt(lower_bound, upper_bound)
        print("Exiting program.")
        exit()

    REMAINING_REQUESTS -= 1

    if REMAINING_REQUESTS == 0:
        print("Sleeping 60 seconds to stay under GitHub API rate limit...")
        time.sleep(60)
        save_ckpt(lower_bound, upper_bound)
        REMAINING_REQUESTS = 30

    # print(json.dumps(r.json()), file=sys.stderr)
    return r


def download_range(lower_bound, upper_bound):
def download_range(lower_bound, upper_bound, stars: int = 100):
    # Saves the names of repositories on GitHub to repo_list
    # in-between size minimum and maximum with over 100 stars.
    global repo_list
    # Github page options start at index 1.
    for page in range(1, 11):
        r = get_request(lower_bound=lower_bound, upper_bound=upper_bound, page=page)
        r = get_request(lower_bound=lower_bound, upper_bound=upper_bound, page=page,
                        stars=stars)

        # print(f"({lower_bound}, {upper_bound}, {page}, {stars})", file=sys.stderr)
        if page == 1:
            n_results = r.json()['total_count']
            n_query_pages = min(math.ceil(n_results/100), 10)  # GitHub API capped at 1000 results

        for repository in r.json()['items']:
            name = repository['full_name']
            stars = repository['stargazers_count']
            stars_count = repository['stargazers_count']
            lang = repository['language']
            repo_list.append((name, stars, lang))  # eg (noanabeshima/github-scraper, 1, Python)
            # print(f"({name}, {stars_count}, {lang})", file=sys.stderr)
            repo_list.append((name, stars_count, lang))  # eg (noanabeshima/github-scraper, 1, Python)

        if page >= n_query_pages:
            # No more pages available
            return n_results
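
# Note on the hard 10-page limit above: the search endpoint returns at most 100
# items per page and at most 1,000 results per query, so pages 1..10 cover
# everything a single size range can ever yield.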

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog="Download",
        description=('Downloads information about all GitHub repositories '
                     'with at least 100 stars (by default) that are less than a '
                     'gigabyte in size. Each data record has the repository\'s name, number of '
                     'stars, and top language. The default output is github_repositories.csv')
    )

    parser.add_argument("-s", "--stars", type=int, default=100,
                        help="The minimum number of stars.")
    parser.add_argument("-o", "--output", type=str, default='github_repositories.csv',
                        help="The output file.")
    args = parser.parse_args()
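    # Example invocation (the 80-star value and output name are illustrative):
    #   python download_repo_info.py --stars 80 --output github_repositories_80stars.csv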
    # If pickled checkpoint exists, load it.
    # Otherwise, initialize repo_list as an empty list
    if 'repo_ckpt.pkl' in os.listdir():
@@ -115,13 +135,13 @@ def download_range(lower_bound, upper_bound):
        ''')
        exit()


    r = get_request(lower_bound, upper_bound)

    # Initial number of results
    n_results = r.json()['total_count']
    # Initial slope for our linear approximation.
    slope = n_results/(upper_bound-lower_bound)
    slope = n_results/(upper_bound - lower_bound if upper_bound > lower_bound else 1)

    # Main loop.
    # Breaks when all repositories considered are greater in size than a gigabyte
@@ -132,27 +152,28 @@
        # Update upper bound to be the linear-approximation guess for what range will return 1000 results
        # As GitHub repositories follow a power distribution, this tends to be an underestimate.
        upper_bound = math.floor((1000/slope) + lower_bound)
        upper_bound = max(upper_bound, lower_bound + 1)
        upper_bound = max(upper_bound, lower_bound)
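        # Worked example with made-up numbers: if probing sizes 1..5000 (KB) returned
        # 2,500 repositories, slope = 2500/5000 = 0.5 results per KB, so the next guess
        # is upper_bound = floor(1000/0.5 + 1) = 2001, i.e. the range expected to return
        # roughly the 1,000 results a single ten-page search walk can cover.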

        # How many results are there at our guess?
        n_results = get_request(lower_bound, upper_bound).json()['total_count']

        # Update the slope of our linear approximation
        slope = n_results/(upper_bound - lower_bound)
        slope = n_results/(upper_bound - lower_bound if upper_bound > lower_bound else 1)

        print(f'size {lower_bound}..{upper_bound} ~> {n_results} results')
        print(f'size {lower_bound}..{upper_bound} ~> {n_results} results. slope: {slope}')
        # If we get <= 1000 results over the range, exit the search loop
        # and download all repository names over the range
        if n_results <= 1000:
        if n_results <= 1000 or upper_bound == lower_bound:
        # if n_results <= 1000:
            break

        print(f"Downloading repositories in size range {lower_bound}..{upper_bound}")
        download_range(lower_bound, upper_bound)
        download_range(lower_bound, upper_bound, args.stars)
        lower_bound = upper_bound + 1

    save_ckpt(lower_bound, upper_bound)

    with open('github_repositories.csv', 'w') as f:
    with open(args.output, 'w') as f:
        for repo in repo_list:
            name, stars, lang = repo
            f.write(f'{name},{stars},{lang}\n')
2 changes: 1 addition & 1 deletion download_repo_text.py
@@ -172,7 +172,7 @@ def _process_repo(repo_data, repodir):
    for curdir, dirs, files in os.walk(repodir):

        files = [curdir + '/' + f for f in files if '.git' not in f and f[
            0] is not '.' and 'LICENSE' not in f and 'node_modules' not in f and '.min.' not in f and f.split('.')[
            0] != '.' and 'LICENSE' not in f and 'node_modules' not in f and '.min.' not in f and f.split('.')[
            -1] not in bad_extensions]

        filenames = [f.split("/")[-1] for f in files]
4 changes: 2 additions & 2 deletions download_repos.py
@@ -24,5 +24,5 @@ def download_repo(repo):


repo_names = [repo[0] for repo in repositories]
Parallel(n_jobs=40, prefer="threads")(
    delayed(download_repo)(name) for name in tqdm(repo_names))
Parallel(n_jobs=16, prefer="threads")(
    delayed(download_repo)(name) for name in tqdm(repo_names))
96 changes: 96 additions & 0 deletions download_repos_parallel.py
@@ -0,0 +1,96 @@
'''
download_repos_parallel.py
Downloads all the repositories listed in repo_names.csv
'''

import argparse
import concurrent.futures
import csv
import os
import subprocess
import sys

from tqdm import tqdm

# Initialize the ArgumentParser
parser = argparse.ArgumentParser(
    description="Clone GitHub repositories based on criteria")
parser.add_argument('-f', '--repo-file', dest='repo_file',
                    help='Path to the CSV file containing repository info (repo,stars,language)',
                    default="github_repositories.csv")
parser.add_argument('-s', '--min_stars', type=int, default=100,
                    help='Minimum number of stars to consider')
parser.add_argument('-l', '--languages', action='append',
                    default=None,
                    help='List of accepted languages')
parser.add_argument('--output_dir', default='cloned_repos',
                    help='Directory to store the cloned repositories')
parser.add_argument('--error_log', default='error_log.txt',
                    help='Path to the error log file')
parser.add_argument('--num_threads', type=int, default=10,
                    help='Number of threads for parallel execution')
args = parser.parse_args()
print(f"min stars: {args.min_stars}", file=sys.stderr)
print(f"languages: {args.languages}", file=sys.stderr)

NUM_PARALLEL = args.num_threads
CLONE_DIR = args.output_dir
ERROR_LOG = args.error_log

my_env = os.environ.copy()
my_env["GIT_TERMINAL_PROMPT"] = "0"

def clone_repository(repo):
    try:
        repo_name, stars, language = repo
        repo_url = f"https://github.com/{repo_name}"
        clone_path = os.path.join(CLONE_DIR, repo_name)

        if os.path.exists(clone_path):
            return f"{repo_name}", f"{repo_name} (already present)"
        if int(stars) < args.min_stars:
            return None, f"Filtered {repo_name}, stars"
        if args.languages and language not in args.languages:
            return None, f"Filtered {repo_name}, language ({language})"

        os.makedirs(clone_path, exist_ok=True)

        with (open(f'{clone_path}_output.log', 'w') as stdout_file,
              open(f'{clone_path}_error.log', 'w') as stderr_file):
            result = subprocess.run(['git', 'clone', "--depth", "1", "--single-branch",
                                     repo_url, clone_path],
                                    stdout=stdout_file, stderr=stderr_file, text=True,
                                    env=my_env)
        if result.returncode == 0:
            return f"{repo_name}", f"Cloned {repo_name}"
        else:
            return None, f"Failed {repo_name}"
    except Exception as e:
        with open(ERROR_LOG, 'a') as error_log:
            error_log.write(f"Error cloning {repo_url}: {e}\n")
        return None, f"Error cloning {repo_url}: {e}"
    return None, "Nothing done"

with open(args.repo_file, 'r') as f:
    csv_reader = csv.reader(f)
    repositories = list(map(tuple, csv_reader))

os.makedirs(CLONE_DIR, exist_ok=True)
open(ERROR_LOG, 'w').close()

# Use a list to store the results of the cloning attempts
results = []

# Use tqdm to create a progress bar
with tqdm(total=len(repositories), desc="Cloning Repositories", leave=False) as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_PARALLEL) as executor:
        # Using 'map' with 'tqdm' to track progress
        for result, message in executor.map(clone_repository, repositories):
            if result:
                results.append(result)
            pbar.write(message)
            pbar.update(1)  # Increment the progress bar


print("Cloning complete. Check the error log for details.")
print("Successfully cloned repositories:", results)