From da719018fd00674b60def5ee1e790e782b908945 Mon Sep 17 00:00:00 2001 From: "Dawn M. Foster" Date: Fri, 1 Dec 2023 13:36:14 +0000 Subject: [PATCH 1/2] separated the code to create a new function with the org level directory so that I can use that separately to create the path for a csv file that summarizes the org level details --- utils/file_operations.py | 41 ++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/utils/file_operations.py b/utils/file_operations.py index 5ff5c97..f47ab61 100644 --- a/utils/file_operations.py +++ b/utils/file_operations.py @@ -4,9 +4,38 @@ """ This file contains several functions that perform various file operations """ -def output_path(repo_name, org_name): +def create_path_str(org_name): """ Creates the path string where files will be located + Parameters + ---------- + org_name : str + + Returns + ------- + path_str : str + + """ + import datetime + from os.path import dirname, join + from pathlib import Path + from utils.date_calcs import get_last_month + + today = datetime.date.today() + last_month = get_last_month() + current_year_month = str(last_month.year) + '-' + '{:02d}'.format(last_month.month) + + current_dir = dirname(dirname(__file__)) # the double dirname is equivalent to ../ + rel_path = './output/' + current_year_month + '/' + org_name + path_str = join(current_dir, rel_path) + Path(path_str).mkdir(parents=True, exist_ok=True) + + return path_str + + +def output_path(repo_name, org_name): + """ Creates the path string including repo where .png files will be located + Parameters ---------- repo_name : str @@ -17,17 +46,13 @@ def output_path(repo_name, org_name): path : str """ - import datetime from os.path import dirname, join from pathlib import Path - from utils.date_calcs import get_last_month - today = datetime.date.today() - last_month = get_last_month() - current_year_month = str(last_month.year) + '-' + '{:02d}'.format(last_month.month) + path_str = 
create_path_str(org_name) current_dir = dirname(dirname(__file__)) # the double dirname is equivalent to ../ - rel_path = './output/' + current_year_month + '/' + org_name + '/' + repo_name + rel_path = path_str + '/' + repo_name path = join(current_dir, rel_path) Path(path).mkdir(parents=True, exist_ok=True) @@ -51,4 +76,4 @@ def output_filename(repo_name, org_name, metric_string): filename = path + '/' + repo_name + '_' + metric_string + '.png' - return filename \ No newline at end of file + return filename From 806ac42f33c3f340d5b4603b0236e910a9a8f246 Mon Sep 17 00:00:00 2001 From: "Dawn M. Foster" Date: Fri, 1 Dec 2023 14:33:58 +0000 Subject: [PATCH 2/2] Refactored the metrics functions to return some minimal info that is written to a csv file when gathering data on an org to make it easier to find the repos that you want to look at in more detail --- health_by_repo.py | 28 ++++++++++++++++++++++++---- metrics/bus_factor.py | 5 ++++- metrics/closure_ratio.py | 4 +++- metrics/first_response.py | 4 ++-- metrics/release_frequency.py | 6 ++++-- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/health_by_repo.py b/health_by_repo.py index b3e4067..49ac67b 100644 --- a/health_by_repo.py +++ b/health_by_repo.py @@ -62,10 +62,12 @@ """ import argparse +import sys import pandas as pd from utils.augur_connect import augur_db_connect from utils.date_calcs import get_dates from utils.repo_info import get_repo_info, fork_archive, get_org_repos +from utils.file_operations import create_path_str from metrics.release_frequency import activity_release_graph from metrics.closure_ratio import sustain_prs_by_repo_graph from metrics.first_response import response_time_graph @@ -101,6 +103,18 @@ # This is the case where data is gathered on all repos from an org repoDF = get_org_repos(org_name, engine) print("multiple repos") + + # When gathering data on an org, it can be helpful to have a summary CSV + path = create_path_str(org_name) + output_filename = path + '/_' + 
org_name + '_output_yr_' + str(years) + '_bdays_' + str(bus_days) + '.csv' + + try: + csv_output = open(output_filename, 'w') + csv_output.write('org_name,repo_name,releases,first_resp_mos,closure_ratio_mos,bus_factor,bus_factor_percents,fork,archive\n') + except: + print('Could not write to csv file. Exiting') + sys.exit(1) + else: # This is the case where data is gathered on a single org / repo combo repo_id = get_repo_info(engine, org_name, repo_name) @@ -121,14 +135,20 @@ # This section collects all of the data using the functions for each graph # found in common_functions.py and creates the graphs for each metric + # Skips archived repos - activity_release_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years) + if is_archived == False: + releases = activity_release_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years) - sustain_prs_by_repo_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years) + closure_ratio_mos = sustain_prs_by_repo_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years) - contributor_risk_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years) + bus_factor, bus_factor_percents = contributor_risk_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years) - response_time_graph(repo_id, repo_name, org_name, start_date, end_date, engine, bus_days, years) + first_resp_mos = response_time_graph(repo_id, repo_name, org_name, start_date, end_date, engine, bus_days, years) + if len(repoDF) > 1: + csv_line = org_name + ',' + repo_name + ',' + releases + ',' + first_resp_mos + ',' + closure_ratio_mos + ',' + bus_factor + ',' + bus_factor_percents + ',' + str(is_forked) + ',' + str(is_archived) + '\n' + csv_output.write(csv_line) + # Print a separator between repos print('-------------') diff --git a/metrics/bus_factor.py b/metrics/bus_factor.py index 6c5a33e..d53fcd5 100644 --- a/metrics/bus_factor.py +++ b/metrics/bus_factor.py @@ -171,7 +171,7 @@ 
def contributor_risk_graph(repo_id, repo_name, org_name, start_date, end_date, e error_num, error_text, names, percents, commits, title, interpretation, num_people = contributor_risk_data(repo_id, repo_name, org_name, start_date, end_date, engine) if error_num == -1: - return + return "Error","Error" matplotlib.use('Agg') #prevents from tying to send plot to screen sns.set_style('ticks') @@ -204,3 +204,6 @@ def contributor_risk_graph(repo_id, repo_name, org_name, start_date, end_date, e print('Bus Factor / Contributor Risk for', org_name, '/', repo_name, 'from', start_date, 'to', end_date, '\nsaved as', filename) print(num_people, 'people make up > 70% of the commits in the past year.') + + percent_str = '--'.join(str(x) for x in percents) + return str(num_people), percent_str \ No newline at end of file diff --git a/metrics/closure_ratio.py b/metrics/closure_ratio.py index 1ae1fc3..f0a2f57 100644 --- a/metrics/closure_ratio.py +++ b/metrics/closure_ratio.py @@ -298,7 +298,7 @@ def sustain_prs_by_repo_graph(repo_id, repo_name, org_name, start_date, end_date if error_num == -1: print("Closure Ratio: Too few PRs to calculate") - return + return "Too Few PRs" matplotlib.use('Agg') #prevents from tying to send plot to screen sns.set_style('ticks') @@ -326,3 +326,5 @@ def sustain_prs_by_repo_graph(repo_id, repo_name, org_name, start_date, end_date print('Change Request Closure Ratio (keeping up with contributions) for', org_name, '/', repo_name, 'from', start_date, 'to', end_date, '\nsaved as', filename) print('Number of months in the past 6 months with > 15% of PRs not closed:', month_num) + + return str(month_num) \ No newline at end of file diff --git a/metrics/first_response.py b/metrics/first_response.py index 74b1163..b50d19d 100644 --- a/metrics/first_response.py +++ b/metrics/first_response.py @@ -223,7 +223,7 @@ def response_time_graph(repo_id, repo_name, org_name, start_date, end_date, engi # Don't gather data if less than 24 PRs if error_num == -1: 
print("First Response: Too few PRs to calculate") - return + return "Too Few PRs" sns.set_style('ticks') sns.set(style="whitegrid", font_scale=2) @@ -253,5 +253,5 @@ def response_time_graph(repo_id, repo_name, org_name, start_date, end_date, engi print('Time to first response for', org_name, '/', repo_name, 'from', start_date, 'to', end_date, '\nsaved as', filename) print(month_num, 'months with more than 10% of pull requests not responded to within specified business days in the past 6 months') - + return str(month_num) diff --git a/metrics/release_frequency.py b/metrics/release_frequency.py index 906ab7b..f0c849f 100644 --- a/metrics/release_frequency.py +++ b/metrics/release_frequency.py @@ -114,7 +114,7 @@ def activity_release_graph(repo_id, repo_name, org_name, start_date, end_date, e error_num, error_text, releases_df, start_dt, end_dt, title, interpretation, release_num = activity_release_data(repo_id, repo_name, org_name, start_date, end_date, engine) if error_num == -1: - return + return "0" matplotlib.use('Agg') #prevents from tying to send plot to screen sns.set(style="whitegrid", font_scale=2) @@ -140,4 +140,6 @@ def activity_release_graph(repo_id, repo_name, org_name, start_date, end_date, e plt.close(fig) print('Release Frequency for', org_name, '/', repo_name, 'from', start_date, 'to', end_date, '\nsaved as', filename) - print(release_num, 'releases in the past 6 months') \ No newline at end of file + print(release_num, 'releases in the past 6 months') + + return str(release_num) \ No newline at end of file