diff --git a/.gitignore b/.gitignore index 1426dea..77da4eb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +# Generated Project Files +repo + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index bcaf3a2..b1fad83 100644 --- a/README.md +++ b/README.md @@ -56,13 +56,11 @@ files for labelling the licenses under which contributors have shared their code poetry install ``` -4. Clone the repository you wish to analyze into the included `repo` folder. -5. [Optional] Make copies of the files in the `config` folder without the `dist` extension. -6. Run the analyzer with `make run` - - The first run will take a while as it computes an accurate `git blame` for - every file in your repository. At the end of the run, a cached blame file - will be generated in the `build` directory to speed up future runs. +4. Run `python ./git_authorship REPO_URL` + - Generates a treemap at `authorship.html` + - AND Generates a JSON output at `authorship.json` + ## License Copyright (c) 2022 Joseph Hale, All Rights Reserved diff --git a/git_authorship/__main__.py b/git_authorship/__main__.py new file mode 100644 index 0000000..a1179fc --- /dev/null +++ b/git_authorship/__main__.py @@ -0,0 +1,166 @@ +# Copyright (c) 2024 Joseph Hale +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
import argparse
import json
from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Dict, Iterator, List, Optional, Sequence

if TYPE_CHECKING:  # annotation-only; the runtime imports are deferred to first use
    from git import Repo

# Directory names (relative to the repository root) that are never analyzed.
EXCLUDE_DIRS = [".git"]

FilePath = Path
Author = str
LineCount = int
Authorship = Dict[Author, LineCount]
RepoAuthorship = Dict[FilePath, Authorship]


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """
    Parses the command line.

    `argv` defaults to `sys.argv[1:]`; pass an explicit list to drive the parser
    programmatically.

    Returns a namespace with `location` (what to clone; defaults to ".") and
    `clone_to` (where the working copy lives).
    """
    parser = argparse.ArgumentParser(
        description="Report how many lines each author contributed to a git repo."
    )
    parser.add_argument(
        "location",
        nargs="?",
        default=".",
        help="URL or path of the repository to analyze (default: current folder)",
    )
    # No `nargs="?"` here: a bare `--clone-to` used to yield None and crash later
    # in `Path(None)`; argparse now rejects it up front.
    parser.add_argument(
        "--clone-to",
        default="./repo/git_authorship",
        help="folder in which the working copy is created or updated",
    )
    # TODO --branch (to analyze a specific branch)
    return parser.parse_args(argv)


def ensure_cloned_and_pulled(location: str, clone_to: str) -> "Repo":
    """
    Returns an up-to-date working copy of `location` stored at `clone_to`.

    Clones when `clone_to` does not exist yet; otherwise runs `git pull` in the
    existing folder. NOTE: an existing folder is pulled as-is, it is not checked
    against `location`.
    """
    from git import Repo  # deferred so the pure helpers work without GitPython

    if not Path(clone_to).exists():
        return Repo.clone_from(location, clone_to)

    repo = Repo(clone_to)
    repo.git.pull()
    return repo


def iterfiles(dir: Path, exclude: Optional[List[Path]] = None) -> Iterator[Path]:
    """
    Yields every file below `dir`, recursively.

    Directories listed in `exclude` are skipped at any depth (the list is
    forwarded to the recursive calls).
    """
    exclude = exclude or []
    for path in dir.iterdir():
        if path.is_file():
            yield path
        elif path.is_dir() and path not in exclude:
            yield from iterfiles(path, exclude)


def iterdirs(dir: Path, exclude: Optional[List[Path]] = None) -> Iterator[Path]:
    """
    Yields every directory below `dir`, recursively, parents before children.

    Directories listed in `exclude` are skipped at any depth, together with
    everything beneath them.
    """
    exclude = exclude or []
    for path in dir.iterdir():
        if path.is_dir() and path not in exclude:
            yield path
            yield from iterdirs(path, exclude)


def file_authorship(repo: "Repo", path: Path) -> Authorship:
    """
    Counts the lines of `path` (relative to the repo root) attributed to each
    author by `git blame` at HEAD.

    Authors are keyed as "Name <email>". The `-M -C -C -C` options ask git to
    follow lines that were moved or copied between files.
    """
    # Pass forward slashes regardless of platform.
    raw_blame = repo.blame(
        "HEAD", Path(path).as_posix(), rev_opts=["-M", "-C", "-C", "-C"]
    )

    authorship: Authorship = defaultdict(int)
    for commit, lines in raw_blame or []:
        authorship[f"{commit.author.name} <{commit.author.email}>"] += len(lines)

    return authorship


def repo_authorship(repo: "Repo") -> RepoAuthorship:
    """
    Calculates how many lines each author has contributed to the repo, with breakdowns
    by folder and file.

    e.g. For a repo with the following structure:

    ```
    .
    ├── folder1
    │   ├── file1.txt (author1: 25 lines, author2: 150 lines)
    │   └── file2.txt (author1: 25 lines)
    ├── folder2
    │   ├── file1.txt (author1: 25 lines, author2: 25 lines)
    │   └── file2.txt (author1: 25 lines, author2: 25 lines)
    ```

    The result will be

    ```
    {
      ".": { "author1": 100, "author2": 200 },
      "./folder1": { "author1": 50, "author2": 150 },
      "./folder1/file1.txt": { "author1": 25, "author2": 150 },
      "./folder1/file2.txt": { "author1": 25 },
      "./folder2": { "author1": 50, "author2": 50 },
      "./folder2/file1.txt": { "author1": 25, "author2": 25 },
      "./folder2/file2.txt": { "author1": 25, "author2": 25 },
    }
    ```

    Keys are `Path` objects relative to the repo root, so "./folder1" is stored
    as `Path("folder1")` and the root as `Path(".")`.
    """
    root = Path(repo.working_dir)
    excluded = [root / name for name in EXCLUDE_DIRS]

    totals: RepoAuthorship = defaultdict(lambda: defaultdict(int))
    for absolute in iterfiles(root, exclude=excluded):
        relative = absolute.relative_to(root)
        authorship = file_authorship(repo, relative)
        # Credit the root first, then each enclosing folder, then the file itself.
        for node in reversed((relative, *relative.parents)):
            for author, lines in authorship.items():
                totals[node][author] += lines

    return totals


def _treemap_columns(authorship: RepoAuthorship) -> Dict[str, List]:
    """Builds the parallel `ids`/`labels`/`parents`/`values`/`text` lists for the treemap."""
    ids, labels, parents, values, text = [], [], [], [], []
    for path, authors in authorship.items():
        is_root = str(path) == "."
        ids.append(str(path))
        # Path(".").name is "", which would leave the root tile unlabeled.
        labels.append(path.name or str(path))
        # Path(".").parent is Path(".") again; the root must have no parent.
        parents.append("" if is_root else str(path.parent))
        values.append(sum(authors.values()))
        text.append(
            "<br>Authors:<br> - "
            + "<br> - ".join(f"{author}: {lines}" for author, lines in authors.items())
        )
    return {
        "ids": ids,
        "labels": labels,
        "parents": parents,
        "values": values,
        "text": text,
    }


def export_treemap(authorship: RepoAuthorship, output: Path = Path("authorship.html")):
    """
    Writes an interactive treemap of `authorship` to `output` as HTML.

    Every file/folder is one tile sized by its total line count; hovering a tile
    lists the per-author line counts.
    """
    import plotly.graph_objects as go  # deferred: only needed for HTML output

    fig = go.Figure(
        go.Treemap(
            **_treemap_columns(authorship),
            maxdepth=3,
            branchvalues="total",
            hovertemplate="%{label}<br><br>%{value} lines<br>%{text}",
            root_color="lightgrey",
        )
    )

    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    fig.write_html(output)


def export_json(authorship: RepoAuthorship, output: Path = Path("authorship.json")):
    """
    Writes `authorship` to `output` as a JSON object mapping each path (as a
    string) to its `{author: line count}` breakdown.
    """
    with open(output, "w", encoding="utf-8") as f:
        json.dump({str(path): authors for path, authors in authorship.items()}, f)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Clones/updates the requested repo and writes both reports to the current folder."""
    args = parse_args(argv)
    repo = ensure_cloned_and_pulled(args.location, args.clone_to)
    authorship = repo_authorship(repo)
    export_treemap(authorship)
    export_json(authorship)


if __name__ == "__main__":
    main()