diff --git a/.gitignore b/.gitignore
index 1426dea..77da4eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+# Generated Project Files
+repo
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/README.md b/README.md
index bcaf3a2..b1fad83 100644
--- a/README.md
+++ b/README.md
@@ -56,13 +56,11 @@ files for labelling the licenses under which contributors have shared their code
poetry install
```
-4. Clone the repository you wish to analyze into the included `repo` folder.
-5. [Optional] Make copies of the files in the `config` folder without the `dist` extension.
-6. Run the analyzer with `make run`
- - The first run will take a while as it computes an accurate `git blame` for
- every file in your repository. At the end of the run, a cached blame file
- will be generated in the `build` directory to speed up future runs.
+4. Run `python ./git_authorship REPO_URL`
+   - Generates a treemap at `authorship.html`
+   - Also generates a JSON export at `authorship.json`
+
## License
Copyright (c) 2022 Joseph Hale, All Rights Reserved
diff --git a/git_authorship/__main__.py b/git_authorship/__main__.py
new file mode 100644
index 0000000..a1179fc
--- /dev/null
+++ b/git_authorship/__main__.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2024 Joseph Hale
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+import argparse
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional
+import plotly.graph_objects as go
+import json
+
+from git import Repo
+
+# Directory names, resolved against the repository root, that are skipped
+# when walking the working tree for files to blame.
+EXCLUDE_DIRS = [".git"]
+
+
+def parse_args(argv: Optional[List[str]] = None):
+    """
+    Parse the command-line arguments of the authorship analyzer.
+
+    Args:
+        argv: Argument strings to parse. When ``None`` (the default),
+            argparse falls back to ``sys.argv[1:]``.
+
+    Returns:
+        An ``argparse.Namespace`` with two attributes:
+
+        - ``location``: the repository to analyze (default ``"."``).
+        - ``clone_to``: the directory the repository is cloned into
+          (default ``"./repo/git_authorship"``).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "location",
+        nargs="?",
+        default=".",
+        help="URL or path of the repository to analyze (default: %(default)s)",
+    )
+    # Deliberately no nargs="?": a bare `--clone-to` would otherwise parse as
+    # None and only blow up later when the value is used as a path.
+    parser.add_argument(
+        "--clone-to",
+        default="./repo/git_authorship",
+        help="directory to clone the repository into (default: %(default)s)",
+    )
+    # TODO --branch (to analyze a specific branch)
+    return parser.parse_args(argv)
+
+
+def ensure_cloned_and_pulled(location: str, clone_to: str):
+    """
+    Make sure an up-to-date clone exists at `clone_to` and return it.
+
+    If `clone_to` already exists it is opened as a repository and pulled;
+    `location` is not consulted in that case. Otherwise `location` is cloned
+    into `clone_to`.
+
+    Args:
+        location: Source handed to ``Repo.clone_from`` for a first-time clone.
+        clone_to: Filesystem path of the local clone.
+
+    Returns:
+        A ``Repo`` opened on `clone_to`.
+    """
+    if Path(clone_to).exists():
+        Repo(clone_to).git.pull()
+    else:
+        Repo.clone_from(location, clone_to)
+
+    return Repo(clone_to)
+
+
+def iterfiles(dir: Path, exclude: Optional[List[Path]] = None):
+    """
+    Recursively yield every file found below `dir`.
+
+    Args:
+        dir: Directory to walk.
+        exclude: Directory paths that must not be descended into. Entries are
+            compared by equality against the paths produced while walking, so
+            they must be spelled the same way as `dir` (e.g. ``dir / ".git"``).
+            Files are never filtered by this list.
+
+    Yields:
+        The path of each file, prefixed with `dir` as given.
+    """
+    exclude = exclude or []
+    for path in dir.iterdir():
+        if path.is_file():
+            yield path
+        elif path.is_dir() and path not in exclude:
+            # Forward `exclude` so nested exclusions are honoured too; without
+            # it only direct children of the top-level directory were skipped.
+            yield from iterfiles(path, exclude)
+
+
+def iterdirs(dir: Path, exclude: Optional[List[Path]] = None):
+    """
+    Recursively yield every directory found below `dir`.
+
+    Args:
+        dir: Directory to walk. It is not yielded itself.
+        exclude: Directory paths to skip entirely (neither yielded nor
+            descended into). Entries are compared by equality against the
+            paths produced while walking.
+
+    Yields:
+        Each directory path, parents before their children.
+    """
+    exclude = exclude or []
+    for path in dir.iterdir():
+        if path.is_dir() and path not in exclude:
+            yield path
+            # Forward `exclude` so it also applies below the first level.
+            yield from iterdirs(path, exclude)
+
+
+# Type aliases describing the authorship data passed between the functions
+# in this module.
+FilePath = Path  # key of a RepoAuthorship mapping
+Author = str  # key of an Authorship mapping
+LineCount = int  # number of lines attributed to an author
+Authorship = Dict[Author, LineCount]  # per-author line totals
+RepoAuthorship = Dict[FilePath, Authorship]  # per-path authorship totals
+
+
+def file_authorship(repo: Repo, path: Path) -> Authorship:
+    """
+    Count how many lines of one file are attributed to each author.
+
+    The file is blamed at ``HEAD`` with ``-M -C -C -C`` passed through to
+    ``git blame``.
+
+    Args:
+        repo: Repository to run the blame in.
+        path: File to blame; handed to ``repo.blame`` as a string.
+
+    Returns:
+        A mapping of ``"Name <email>"`` to the number of blamed lines. It is
+        empty when the blame yields nothing.
+    """
+    totals = defaultdict(int)
+    hunks = repo.blame("HEAD", str(path), rev_opts=["-M", "-C", "-C", "-C"])
+    for commit, lines in hunks or []:
+        totals[f"{commit.author.name} <{commit.author.email}>"] += len(lines)
+
+    return totals
+
+
+def repo_authorship(repo: Repo) -> RepoAuthorship:
+    """
+    Calculates how many lines each author has contributed to the repo, with
+    breakdowns by folder and file.
+
+    Every file in the working tree (outside `EXCLUDE_DIRS`) is blamed, and its
+    per-author line counts are added to the file itself and to each of its
+    ancestor folders up to the repository root.
+
+    e.g. For a repo with the following structure:
+
+    ```
+    .
+    ├── folder1
+    │   ├── file1.txt (author1: 25 lines, author2: 150 lines)
+    │   └── file2.txt (author1: 25 lines)
+    ├── folder2
+    │   ├── file1.txt (author1: 25 lines, author2: 25 lines)
+    │   └── file2.txt (author1: 25 lines, author2: 25 lines)
+    ```
+
+    The result will be (keys are `Path` objects relative to the repo root,
+    shown here as strings; the root itself is `Path(".")`)
+
+    ```
+    {
+        ".": { "author1": 100, "author2": 200 },
+        "folder1": { "author1": 50, "author2": 150 },
+        "folder1/file1.txt": { "author1": 25, "author2": 150 },
+        "folder1/file2.txt": { "author1": 25 },
+        "folder2": { "author1": 50, "author2": 50 },
+        "folder2/file1.txt": { "author1": 25, "author2": 25 },
+        "folder2/file2.txt": { "author1": 25, "author2": 25 },
+    }
+    ```
+
+    Args:
+        repo: Repository whose working tree is analyzed.
+
+    Returns:
+        A mapping from path to per-author line totals. Paths whose blame is
+        empty do not appear in the result.
+    """
+    root = Path(repo.working_dir)
+    excluded = [root / d for d in EXCLUDE_DIRS]
+
+    totals: RepoAuthorship = defaultdict(lambda: defaultdict(int))
+    for absolute in iterfiles(root, exclude=excluded):
+        # Blame paths must be relative to the working tree.
+        file = absolute.relative_to(root)
+        authorship = file_authorship(repo, file)
+        # `file.parents` runs from the nearest folder up to Path("."); reverse
+        # it so the root is credited first, then each folder, then the file.
+        for node in (*reversed(file.parents), file):
+            for author, lines in authorship.items():
+                totals[node][author] += lines
+
+    return totals
+
+
+def export_treemap(authorship: RepoAuthorship, output: Path = Path("authorship.html")):
+    """
+    Render the authorship breakdown as a Plotly treemap and write it as HTML.
+
+    Each key of `authorship` becomes one tile, sized by the total number of
+    lines attributed to that path and nested under its parent path. Hovering
+    a tile lists the per-author line counts.
+
+    Args:
+        authorship: Per-path author line totals; keys must be `Path` objects
+            because their `.parent` and `.name` are used.
+        output: Destination of the generated HTML file.
+    """
+    ids = [str(file) for file in authorship.keys()]
+    # The root entry "." has no parent tile; everything else hangs off the
+    # string form of its parent path, which matches that parent's id.
+    parents = [
+        str(file.parent) if str(file) != "." else "" for file in authorship.keys()
+    ]
+    values = [sum(authors.values()) for authors in authorship.values()]
+    labels = [file.name for file in authorship.keys()]
+    # Hover text is built with <br>, the line break understood by Plotly.
+    descriptions = [
+        "<br>Authors:<br> - "
+        + "<br> - ".join(f"{author}: {lines}" for author, lines in authors.items())
+        for authors in authorship.values()
+    ]
+
+    fig = go.Figure(
+        go.Treemap(
+            ids=ids,
+            labels=labels,
+            parents=parents,
+            values=values,
+            maxdepth=3,
+            branchvalues="total",
+            text=descriptions,
+            hovertemplate="%{label}<br>%{value} lines<br>%{text}",
+            root_color="lightgrey",
+        )
+    )
+
+    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
+    fig.write_html(output)
+
+
+def export_json(authorship: RepoAuthorship, output: Path = Path("authorship.json")):
+    """
+    Write the authorship breakdown to `output` as a JSON object.
+
+    Path keys are converted to strings; each value is that path's per-author
+    line-count mapping.
+
+    Args:
+        authorship: Per-path author line totals.
+        output: Destination of the generated JSON file.
+    """
+    serializable = {str(path): authors for path, authors in authorship.items()}
+    with open(output, "w") as f:
+        json.dump(serializable, f)
+
+
+def main():
+    """Clone or update the requested repository and export its authorship."""
+    args = parse_args()
+    repo = ensure_cloned_and_pulled(args.location, args.clone_to)
+    authorship = repo_authorship(repo)
+    export_treemap(authorship)
+    export_json(authorship)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/git_authorship/authorship_analyzer.py b/git_authorship/authorship_analyzer.py
deleted file mode 100644
index f972157..0000000
--- a/git_authorship/authorship_analyzer.py
+++ /dev/null
@@ -1,261 +0,0 @@
-# Copyright (c) 2022 Joseph Hale
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-import json
-import os
-from typing import Dict
-from typing import List
-from typing import Optional
-
-from git import Repo
-
-# Global Configuration
-GIT_REPO_BASE_PATH = "./repo"
-AUTHOR_LICENSE_FILE = "./config/author-licenses.txt"
-PSEUDONYMS_FILE = "./config/pseudonyms.txt"
-EXCLUDE_DIRS = [".git"]
-
-
-def _load_pseudonyms(path: str):
- cache = {}
- if os.path.exists(path):
- with open(path) as f:
- for line in f:
- module, author, email, license = line.strip().split("|")
- cache[module] = {
- "author": author,
- "email": email,
- "license": license,
- }
- return cache
-
-
-def _load_author_licenses(path: str):
- cache = {}
- if os.path.exists(path):
- with open(path) as f:
- for line in f:
- author, license = line.strip().split("|")
- cache[author] = license
- return cache
-
-
-def _get_git_repo_path(base_path: str):
- _root, dirs, _files = next(os.walk(base_path))
- return os.path.join(base_path, dirs[0])
-
-
-# Internal Configuration
-_GIT_REPO_PATH = _get_git_repo_path(GIT_REPO_BASE_PATH)
-_BUILT_PATH = "./build"
-_BUILT_BLAME_FILE = f"{_BUILT_PATH}/blame.json"
-_BUILT_HTML_FILE = f"{_BUILT_PATH}/authorship.html"
-_PSEUDONYMS_CACHE = _load_pseudonyms(PSEUDONYMS_FILE)
-_AUTHOR_LICENSE_CACHE = _load_author_licenses(AUTHOR_LICENSE_FILE)
-
-
-repo = Repo(_GIT_REPO_PATH)
-
-
-def file_blame(path: str):
- blame = repo.blame("HEAD", path, rev_opts=["-M", "-C", "-C", "-C"])
- return (
- [
- [(commit.author.name, commit.author.email), len(lines)]
- for commit, lines in blame
- ]
- if blame
- else []
- )
-
-
-def folder_blame(path: str) -> Dict:
- print(f"Blaming {path}")
- root, dirs, files = next(os.walk(path))
- return {
- "files": {
- file_name: file_blame(
- os.path.join(root, file_name)[len(_GIT_REPO_PATH) + 1 :]
- )
- for file_name in files
- },
- "dirs": {
- dir_name: folder_blame(os.path.join(root, dir_name))
- for dir_name in dirs
- if dir_name not in EXCLUDE_DIRS
- },
- }
-
-
-def repo_blame():
- from pathlib import Path
-
- if os.path.exists(_BUILT_BLAME_FILE):
- with open(_BUILT_BLAME_FILE) as f:
- return json.load(f)
- else:
- blame = folder_blame(_GIT_REPO_PATH)
- Path(_BUILT_PATH).mkdir(parents=True, exist_ok=True)
- with open(_BUILT_BLAME_FILE, "w") as f:
- json.dump(blame, f)
- return blame
-
-
-#####################################
-
-
-class ModuleAnalyzer:
- name: str
- parent: str
- blame: List
- submodules: List["ModuleAnalyzer"]
-
- def __init__(
- self,
- name: str,
- parent: str,
- submodules: Optional[List["ModuleAnalyzer"]] = None,
- ):
- self.name = name
- self.parent = parent
- self.submodules = submodules or []
-
- def with_blame(self, blame) -> "ModuleAnalyzer":
- self.blame = blame
- return self
-
- def authorship(self):
- authors = {}
- for submodule in self.submodules:
- for author, lines in submodule.authorship().items():
- authors[author] = authors.get(author, 0) + lines
- return authors
-
- def licensing(self):
- licenses = {}
- for submodule in self.submodules:
- for license, lines in submodule.licensing().items():
- licenses[license] = licenses.get(license, 0) + lines
- return licenses
-
- def flatten(self):
- flat = [self]
- for submodule in self.submodules:
- flat += submodule.flatten()
- return flat
-
-
-class FileModuleAnalyzer(ModuleAnalyzer):
- def authorship(self):
- NAME = 0 # list idx
- authors = {}
- for author, lines in self.blame:
- name = self.__author_override() or author[NAME]
- authors[name] = authors.get(name, 0) + lines
- return authors
-
- def __author_override(self) -> Optional[str]:
- for override_path, override in _PSEUDONYMS_CACHE.items():
- if override_path in self.name:
- return override["author"]
- return None
-
- def licensing(self):
- NAME = 0 # list idx
- licenses = {}
- for author, lines in self.blame:
- license = self.__license_override() or _AUTHOR_LICENSE_CACHE.get(
- author[NAME], "???"
- )
- licenses[license] = licenses.get(license, 0) + lines
- return licenses
-
- def __license_override(self) -> Optional[str]:
- # Note, uses the first matching entry.
- # Consider using the longest matching entry instead.
- for override_path, override in _PSEUDONYMS_CACHE.items():
- if override_path in self.name:
- return override["license"]
- return None
-
-
-def raw_blame_to_module_analyzer(
- module_name: str, parent_name: str, raw_blame: Dict[str, Dict]
-) -> ModuleAnalyzer:
- return ModuleAnalyzer(
- module_name,
- parent_name,
- [
- *[
- FileModuleAnalyzer(
- f"{module_name}/{file_name}", module_name
- ).with_blame(blame)
- for file_name, blame in raw_blame["files"].items()
- ],
- *[
- raw_blame_to_module_analyzer(
- f"{module_name}/{dirname}", module_name, dirblame
- )
- for dirname, dirblame in raw_blame["dirs"].items()
- ],
- ],
- )
-
-
-#####################################
-
-
-blame = repo_blame()
-analyzer = raw_blame_to_module_analyzer(_GIT_REPO_PATH, "", blame)
-stats = analyzer.authorship()
-print(stats)
-
-
-#####################################
-
-
-import plotly.graph_objects as go
-
-modules = analyzer.flatten()
-ids = [module.name for module in modules]
-labels = [i.split("/")[-1] for i in ids]
-parents = [module.parent for module in modules]
-values = [sum(module.authorship().values()) for module in modules]
-
-
-def authorship_str(module: ModuleAnalyzer) -> str:
- authors = module.authorship()
- return "
Authors:
- " + "
- ".join(
- f"{author}: {lines}" for author, lines in authors.items()
- )
-
-
-def licensing_str(module: ModuleAnalyzer) -> str:
- licenses = module.licensing()
- return "
Licenses:
- " + "
- ".join(
- f"{license}: {lines}" for license, lines in licenses.items()
- )
-
-
-def info_str(module: ModuleAnalyzer) -> str:
- return f"{authorship_str(module)}
{licensing_str(module)}"
-
-
-fig = go.Figure(
- go.Treemap(
- ids=ids,
- labels=labels,
- parents=parents,
- values=values,
- maxdepth=3,
- branchvalues="total",
- text=[info_str(module) for module in modules],
- hovertemplate="%{label}
%{value} lines
%{text}",
- root_color="lightgrey",
- )
-)
-
-fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
-fig.write_html(_BUILT_HTML_FILE)
diff --git a/repo/.gitignore b/repo/.gitignore
deleted file mode 100644
index 9ce1769..0000000
--- a/repo/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) 2022 Joseph Hale
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-*
-!.gitignore
\ No newline at end of file