feat: Add author licensing config

Restores the feature allowing for reporting licensing statistics based on author. closes #88
thehale · Jan 11, 2025 · c8c3164 · c8c3164
1 parent 38347de
commit c8c3164
Show file tree

Hide file tree

Showing 10 changed files with 253 additions and 84 deletions.
diff --git a/README.md b/README.md
@@ -47,26 +47,31 @@ functionality), it does help you clearly identify who your contributors are and
 the exact lines of code they wrote.
 
 
-<!-- 
 To support libraries undergoing re-licensing, `git-authorship` includes config
 files for labelling the licenses under which contributors have shared their code.
 
 ## Other Features
 
 ### Author Licenses
-If you want to include information about the OSS license offered by each
-contributor, simply add a line for each author to `config/author-licenses.txt`
-in the following format:
 
+You can include OSS licensing information for each author via a `.csv` file.
+Each `author-name` must exactly match the author name shown in the generated
+authorship report.
+
+_licensing.csv_ 
 ```
-author-name|license-SPDX-id
+author-name,license-SPDX-id
 ```
 
-The `author-name` will be matched to the values shown in the generated
-authorship report.
+<sub>A list of SPDX license identifiers can be found at [spdx.org/licenses](https://spdx.org/licenses)</sub>
 
-_A list of SPDX license identifiers can be found here:
-https://spdx.org/licenses/_
+Then pass the licensing file to the CLI:
+
+```bash
+git-authorship REPO_URL --author-licenses licensing.csv
+```
+
+<!-- 
 
 
 ### Pseudonyms

diff --git a/git_authorship/_types.py b/git_authorship/_types.py
@@ -5,12 +5,23 @@
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 from pathlib import Path
 from typing import Dict
+from typing import TypedDict
+
+from typing_extensions import NotRequired
 
 
 FilePath = Path
 Author = str
 LineCount = int
-Authorship = Dict[Author, LineCount]
+License = str
+
+
+class AuthorshipInfo(TypedDict):
+    """
+    What is recorded for one author at one path: a line count and, optionally,
+    a license label.
+    """
+
+    # Number of lines attributed to the author.
+    lines: LineCount
+    # License label for the author's contributions; this key may be absent.
+    license: NotRequired[License]
+
+
+Authorship = Dict[Author, AuthorshipInfo]
 RepoAuthorship = Dict[FilePath, Authorship]
 
 __all__ = ["FilePath", "Author", "LineCount", "Authorship", "RepoAuthorship"]
diff --git a/git_authorship/authorship.py b/git_authorship/authorship.py
@@ -3,15 +3,18 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
+import csv
 import json
 import logging
 from collections import defaultdict
 from pathlib import Path
+from typing import Optional
 
 from git import Repo
 
 from ._pathutils import iterfiles
 from ._types import Authorship
+from ._types import AuthorshipInfo
 from ._types import RepoAuthorship
 from git_authorship import export
 
@@ -20,7 +23,13 @@
 log = logging.getLogger(__name__)
 
 
-def for_repo(repo: Repo, cache_dir: Path = Path("build/cache")) -> RepoAuthorship:
+def for_repo(
+    repo: Repo,
+    *,
+    license_file: Optional[Path] = None,
+    cache_dir: Path = Path("build/cache"),
+    use_cache: bool = True,
+) -> RepoAuthorship:
     """
     Calculates how many lines each author has contributed to the repo, with breakdowns
     by folder and file.
@@ -41,24 +50,25 @@ def for_repo(repo: Repo, cache_dir: Path = Path("build/cache")) -> RepoAuthorshi
 
     ```
     {
-      ".": { "author1": 100, "author2": 200 },
-      "./folder1": { "author1": 50, "author2": 150 },
-      "./folder1/file1.txt": { "author1": 25, "author2": 150 },
-      "./folder1/file2.txt": { "author1": 25 },
-      "./folder2": { "author1": 50, "author2": 50 },
-      "./folder2/file1.txt": { "author1": 25, "author2": 25 },
-      "./folder2/file2.txt": { "author1": 25, "author2": 25 },
+      ".": { "author1": {"lines": 100}, "author2": {"lines": 200} },
+      "./folder1": { "author1": {"lines": 50}, "author2": {"lines": 150} },
+      "./folder1/file1.txt": { "author1": {"lines": 25}, "author2": {"lines": 150} },
+      "./folder1/file2.txt": { "author1": {"lines": 25} },
+      "./folder2": { "author1": {"lines": 50}, "author2": {"lines": 50} },
+      "./folder2/file1.txt": { "author1": {"lines": 25}, "author2": {"lines": 25} },
+      "./folder2/file2.txt": { "author1": {"lines": 25}, "author2": {"lines": 25} },
     }
     ```
 
     """
     cache_key = cache_dir / f"{repo.head.commit.hexsha}.json"
 
-    if cache_key.exists():
+    if use_cache and cache_key.exists():
         with open(cache_key, "r") as f:
             return {Path(k): v for k, v in (json.load(f) or {}).items()}
     else:
         data = _compute_repo_authorship(repo)
+        data = _augment_author_licenses(data, license_file)
         cache_key.parent.mkdir(exist_ok=True, parents=True)
         export.as_json(data, cache_key)
         return data
@@ -81,8 +91,8 @@ def for_file(repo: Repo, path: Path) -> Authorship:
     The returned authorship would be:
     ```
     {
-      "author1": 3,
-      "author2": 2,
+      "author1": {"lines": 3},
+      "author2": {"lines": 2},
     }
     ```
     """
@@ -94,9 +104,9 @@ def for_file(repo: Repo, path: Path) -> Authorship:
             for commit, lines in (raw_blame or [])
         ]
 
-        authorship: Authorship = defaultdict(int)
+        authorship: Authorship = defaultdict(_AuthorshipInfo)
         for author, lines in blame:
-            authorship[author] += lines
+            authorship[author]["lines"] += lines
     except FileNotFoundError as e:
         log.warning(f"Failed to blame {path}: {e}")
         authorship = {}
@@ -112,15 +122,35 @@ def _compute_repo_authorship(repo: Repo) -> RepoAuthorship:
     ]
     file_authorships = {path: for_file(repo, path) for path in filepaths}
 
-    repo_authorship: RepoAuthorship = defaultdict(lambda: defaultdict(int))
+    repo_authorship: RepoAuthorship = defaultdict(lambda: defaultdict(_AuthorshipInfo))
     for file, authorship in file_authorships.items():
         parts = f"./{file}".split("/")
         for i in range(len(parts)):
             cur = "/".join(parts[: i + 1])
-            for author, lines in authorship.items():
-                repo_authorship[Path(cur)][author] += lines
+            for author, info in authorship.items():
+                repo_authorship[Path(cur)][author]["lines"] += info["lines"]
 
     return repo_authorship
 
 
+def _augment_author_licenses(
+    repo_authorship: RepoAuthorship, licenses_path: Optional[Path] = None
+) -> RepoAuthorship:
+    if licenses_path:
+        with open(licenses_path, "r") as f:
+            reader = csv.reader(f)
+            licenses = {row[0]: row[1] for row in reader}
+
+        for path, authorship in repo_authorship.items():
+            for author in authorship.keys():
+                if author in licenses:
+                    repo_authorship[path][author]["license"] = licenses[author]
+
+    return repo_authorship
+
+
+def _AuthorshipInfo() -> AuthorshipInfo:
+    return {"lines": 0}
+
+
 __all__ = ["file", "repo"]
diff --git a/git_authorship/cli.py b/git_authorship/cli.py
@@ -7,6 +7,7 @@
 import logging
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional
 
 from git import Repo
 
@@ -21,6 +22,8 @@ class Args:
     location: str
     clone_to: str
     branch: str
+    author_licenses_path: Optional[Path]
+    use_authorship_cache: bool = True
 
 
 def parse_args(argv=None) -> Args:
@@ -34,9 +37,35 @@ def parse_args(argv=None) -> Args:
     parser.add_argument(
         "--branch", nargs="?", default=None, help="The branch/revision to checkout"
     )
+    parser.add_argument(
+        "--author-licenses",
+        nargs="?",
+        default=None,
+        help="The path to a CSV file containing author licenses",
+    )
+    parser.add_argument(
+        "--no-authorship-cache", action="store_true", help="Ignore authorship cache"
+    )
 
     args = parser.parse_args(argv)
-    return Args(args.location, args.clone_to, args.branch)
+
+    return Args(
+        args.location,
+        args.clone_to,
+        args.branch,
+        _parse_author_licenses_path(args.author_licenses),
+        not args.no_authorship_cache,
+    )
+
+
+def _parse_author_licenses_path(arg: Optional[str] = None) -> Optional[Path]:
+    if not arg:
+        return None
+    else:
+        if not Path(arg).exists():
+            raise FileNotFoundError(arg)
+        else:
+            return Path(arg)
 
 
 def clone_and_checkout(args: Args):
@@ -57,7 +86,11 @@ def clone_and_checkout(args: Args):
 
 def run(args: Args):
     repo = clone_and_checkout(args)
-    repo_authorship = authorship.for_repo(repo)
+    repo_authorship = authorship.for_repo(
+        repo,
+        license_file=args.author_licenses_path,
+        use_cache=args.use_authorship_cache,
+    )
     export.as_treemap(repo_authorship)
     export.as_json(repo_authorship)
     export.as_csv(repo_authorship)

diff --git a/git_authorship/export.py b/git_authorship/export.py
@@ -5,12 +5,15 @@
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 import csv
 import json
+from collections import defaultdict
 from pathlib import Path
+from typing import Dict
 
 import plotly.graph_objects as go
 
 from ._pathutils import io_handle
 from ._pathutils import Writeable
+from ._types import Authorship
 from ._types import RepoAuthorship
 
 
@@ -29,11 +32,27 @@ def as_treemap(
     parents = [
         str(file.parent) if str(file) != "." else "" for file in authorship.keys()
     ]
-    values = [sum(authors.values()) for authors in authorship.values()]
+    values = [
+        sum(info["lines"] for info in authors.values())
+        for authors in authorship.values()
+    ]
     labels = [file.name for file in authorship.keys()]
+
+    def author_list(authorship: Authorship):
+        return "<br>Authors:<br> - " + "<br> - ".join(
+            f"{author}: {info['lines']}" for author, info in authorship.items()
+        )
+
+    def license_list(authorship: Authorship):
+        licensing: Dict[str, int] = defaultdict(int)
+        for _, info in authorship.items():
+            licensing[info.get("license", "Unknown")] += info["lines"]
+        return "<br>Licenses:<br> - " + "<br> - ".join(
+            f"{license}: {lines}" for license, lines in licensing.items()
+        )
+
     descriptions = [
-        "<br>Authors:<br> - "
-        + "<br> - ".join(f"{author}: {lines}" for author, lines in authorship.items())
+        f"{author_list(authorship)}<br>{license_list(authorship)}"
         for authorship in authorship.values()
     ]
 
@@ -83,12 +102,12 @@ def as_csv(
     """
     with io_handle(output) as f:
         writer = csv.writer(f)
-        writer.writerow(["path", "author", "lines"])
+        writer.writerow(["path", "author", "lines", "license"])
         for path, authors in sorted(authorship.items(), key=lambda x: x[0]):
-            for author, lines in sorted(
-                authors.items(), key=lambda x: x[1], reverse=True
+            for author, info in sorted(
+                authors.items(), key=lambda x: x[1]["lines"], reverse=True
             ):
-                writer.writerow([path, author, lines])
+                writer.writerow([path, author, info["lines"], info.get("license")])
 
 
 __all__ = ["as_treemap", "as_json", "as_csv"]
diff --git a/test/fixtures/licensing.csv b/test/fixtures/licensing.csv
@@ -0,0 +1 @@
+Joseph Hale <[email protected]>,MPL-2.0