Prototype markdown table import into Github issues

senier · senier · commit 2a325c2db349 · 2025-09-12T13:24:58.000+02:00
diff --git a/Makefile b/Makefile
@@ -0,0 +1,22 @@
+# Python sources covered by the linters and the type checker.
+PYTHON_FILES = \
+	scripts/markdown_to_github_issue.py \
+	scripts/tests/*.py \
+
+# Default target: run all static checks, then the test suite.
+all: check test
+
+check: check_python
+
+check_python: check_ruff check_mypy
+
+check_ruff:
+	ruff check $(PYTHON_FILES)
+
+check_mypy:
+	mypy $(PYTHON_FILES)
+
+test: test_python
+
+test_python:
+	pytest scripts/tests/*.py
+
+# Every target here is a command, not a file; list them all (including
+# check_ruff) so a same-named file can never make a target look up to date.
+.PHONY: all check check_python check_ruff check_mypy test test_python
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,10 @@ dependencies = [
     "sphinx-autobuild>=2024.10.3",
     "sphinx-needs>=5.1.0",
     "sphinx-rtd-theme>=3.0.2",
+    "markdown>=3.9",
+    "beautifulsoup4[xml]>=4.13.5",
+    "lxml>=6.0.1",
+    "pygithub>=2.8.1",
 ]
 
 [tool.uv.workspace]
@@ -30,6 +34,7 @@ lint.select = [
   "TID",      # Some good import practices
   "C4",       # Catch incorrect use of comprehensions, dict, list, etc
 ]
+
 # Remove or reduce ignores to catch more issues
 lint.ignore = [
     "E501",  # Ignore long lines, bc we use them frequently while composing .rst templates
@@ -44,6 +49,8 @@ allow-dict-calls-with-keyword-arguments = true
 
 [dependency-groups]
 dev = [
+    "mypy>=1.17.1",
+    "pytest>=8.4.2",
     "ruff>=0.12.3",
+    "types-markdown>=3.9.0.20250906",
 ]
-
diff --git a/scripts/README.md b/scripts/README.md
@@ -1,4 +1,6 @@
-### `auto-pr-helper.py`
+# Scripts for automating processes around the coding guidelines
+
+## `auto-pr-helper.py`
 
 This script is a utility for automating the generation of guidelines. It takes a GitHub issue's JSON data from standard input, parses its body (which is expected to follow a specific issue template), and converts it into a formatted reStructuredText (`.rst`) guideline.
 
@@ -17,9 +19,18 @@ cat path/to/your_issue.json | uv run scripts/auto-pr-helper.py
 ```
 
 #### 2. Fetching from the GitHub API directly
+
 You can fetch the data for a live issue directly from the GitHub API using curl and pipe it to the script. This is useful for getting the most up-to-date content.
 
 ```bash
 curl https://api.github.com/repos/rustfoundation/safety-critical-rust-coding-guidelines/issues/156 | uv run ./scripts/auto-pr-helper.py
 ```
-```
+
+## `markdown_to_github_issue.py`
+
+### How to use
+
+You need to create a personal access token for the target repository as [described here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token).
+Make sure the "Issues" permission is granted as "read and write" for the token.
+
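+Pass the token to the tool via the `--auth-token` command line parameter, together with the Markdown file to read (`--markdown`) and the target repository (`--repository`, in `account/repository` format).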
diff --git a/scripts/__init__.py b/scripts/__init__.py
diff --git a/scripts/markdown_to_github_issue.py b/scripts/markdown_to_github_issue.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Final
+
+import bs4
+import markdown
+from github import Auth, Github
+
+# Column headings the input table must have, in exactly this order.
+# convert_md() rejects a table whose header row differs, and
+# MISRA_Rule.from_cols() uses the positions to map cells to fields.
+EXPECTED_HEADINGS: Final[list[str]] = [
+    "Guideline",
+    "Guideline Name",
+    "MISRA C:2025 Status",
+    "Decidability",
+    "Scope",
+    "Rationale",
+    "Applicability",
+    "Adjusted Category",
+]
+
+
+@dataclass(eq=True)
+class MISRA_Rule:
+    title: str
+    section: str | None = None
+    status: str | None = None
+    decidability: str | None = None
+    scope: str | None = None
+    rationale: str | None = None
+    applicability: str | None = None
+    category: str | None = None
+
+    @classmethod
+    def from_cols(cls, cols: list[str | None]) -> MISRA_Rule | None:
+        assert len(cols) == len(EXPECTED_HEADINGS), (
+            f"Expected {len(EXPECTED_HEADINGS)}, got {len(cols)}"
+        )
+        # Cannot create rule without a title
+        title = cols[EXPECTED_HEADINGS.index("Guideline Name")]
+        if title is None:
+            return None
+        return MISRA_Rule(
+            title=title,
+            section=cols[EXPECTED_HEADINGS.index("Guideline")],
+            status=cols[EXPECTED_HEADINGS.index("MISRA C:2025 Status")],
+            decidability=cols[EXPECTED_HEADINGS.index("Decidability")],
+            scope=cols[EXPECTED_HEADINGS.index("Scope")],
+            rationale=cols[EXPECTED_HEADINGS.index("Rationale")],
+            applicability=cols[EXPECTED_HEADINGS.index("Applicability")],
+            category=cols[EXPECTED_HEADINGS.index("Adjusted Category")],
+        )
+
+    @property
+    def issue_body(self) -> str:
+        # FIXME(senier): Properly layout (we could even use .github/ISSUE_TEMPLATE/CODING-GUILDELINE.yml to validate the format)
+        # FIXME(senier): Transform into dedicated coding guidline object and do layouting there
+        return str(self)
+
+
+def convert_md(file: Path) -> list[MISRA_Rule] | None:
+    result = None
+
+    with file.open() as f:
+        html = markdown.markdown(f.read(), extensions=["tables"], output_format="xhtml")
+        soup = bs4.BeautifulSoup(html, features="lxml")
+
+        table = soup.find("table")
+        if table is None or not isinstance(table, bs4.Tag):
+            return None
+
+        headings = table.find("thead")
+        if headings is None or not isinstance(headings, bs4.Tag):
+            return None
+
+        values = [h.text for h in headings.find_all("th")]
+        if values != EXPECTED_HEADINGS:
+            return None
+
+        body = table.find("tbody")
+        if body is None or not isinstance(body, bs4.Tag):
+            return None
+
+        for row in body.find_all("tr"):
+            if row is None or not isinstance(row, bs4.Tag):
+                continue
+
+            cols = [r.text or None for r in row.find_all("td")]
+            assert len(cols) == 0 or len(cols) == len(EXPECTED_HEADINGS), f"{cols}"
+
+            # skip empty rows
+            if all(c is None for c in cols):
+                continue
+
+            if result is None:
+                result = []
+            rule = MISRA_Rule.from_cols(cols)
+            if rule is not None:
+                result.append(rule)
+    return result
+
+
+def create_issues(repo_name: str, token: str, rules: list[MISRA_Rule]):
+    auth = Auth.Token(token=token)
+    github = Github(auth=auth)
+    repo = github.get_repo(repo_name)
+
+    for rule in rules:
+        if rule.title is None:
+            continue
+        repo.create_issue(title=rule.title, body=rule.issue_body)
+
+
+def import_rules(file: Path, repo: str, token: str) -> int | str:
+    md = convert_md(file)
+    if md is None:
+        return "No rules found"
+    create_issues(repo_name=repo, token=token, rules=md)
+    return 1
+
+
+def main() -> int | str:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--markdown",
+        type=Path,
+        required=True,
+        help="Markdown file to extract rules from",
+    )
+    parser.add_argument(
+        "-r",
+        "--repository",
+        type=str,
+        required=True,
+        help="Github repository to import rules to (format: account/repository)",
+    )
+    parser.add_argument(
+        "-a",
+        "--auth-token",
+        type=str,
+        required=True,
+        help="Github authentication token",
+    )
+    args = parser.parse_args()
+    return import_rules(file=args.markdown, repo=args.repository, token=args.auth_token)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/tests/__init__.py b/scripts/tests/__init__.py
diff --git a/scripts/tests/data/empty_table.md b/scripts/tests/data/empty_table.md
@@ -0,0 +1,2 @@
+| Guideline | Guideline Name | MISRA C:2025 Status | Decidability | Scope | Rationale | Applicability | Adjusted Category |
+| --- | --- | --- | --- | --- | --- | --- | --- |
diff --git a/scripts/tests/data/table_with_multiple_lines.md b/scripts/tests/data/table_with_multiple_lines.md
@@ -0,0 +1,4 @@
+| Guideline | Guideline Name | MISRA C:2025 Status | Decidability | Scope | Rationale | Applicability | Adjusted Category |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| D.1.2 | The use of language extensions should be minimized | Advisory |  |  | IDB | Yes Yes | Required |
+| R.1.3 | There shall be no occurrence of undefined or critical unspecified behaviour | Required | Undecidable | System | UB, IDB | Yes Yes | Required |
diff --git a/scripts/tests/data/table_with_single_line.md b/scripts/tests/data/table_with_single_line.md
@@ -0,0 +1,3 @@
+| Guideline | Guideline Name | MISRA C:2025 Status | Decidability | Scope | Rationale | Applicability | Adjusted Category |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| D.1.2 | The use of language extensions should be minimized | Advisory |  |  | IDB | Yes Yes | Required |
diff --git a/scripts/tests/test_markdown_to_github_issue.py b/scripts/tests/test_markdown_to_github_issue.py
@@ -0,0 +1,53 @@
+from pathlib import Path
+
+import pytest
+
+from .. import markdown_to_github_issue as mtgi  # noqa: TID252
+
+DATA_DIR = Path("scripts/tests/data")
+
+
+@pytest.mark.parametrize(
+    ["file", "expected"],
+    [
+        ("empty_table", None),
+        (
+            "table_with_single_line",
+            [
+                mtgi.MISRA_Rule(
+                    section="D.1.2",
+                    title="The use of language extensions should be minimized",
+                    status="Advisory",
+                    rationale="IDB",
+                    applicability="Yes Yes",
+                    category="Required",
+                ),
+            ],
+        ),
+        (
+            "table_with_multiple_lines",
+            [
+                mtgi.MISRA_Rule(
+                    section="D.1.2",
+                    title="The use of language extensions should be minimized",
+                    status="Advisory",
+                    rationale="IDB",
+                    applicability="Yes Yes",
+                    category="Required",
+                ),
+                mtgi.MISRA_Rule(
+                    section="R.1.3",
+                    title="There shall be no occurrence of undefined or critical unspecified behaviour",
+                    status="Required",
+                    decidability="Undecidable",
+                    scope="System",
+                    rationale="UB, IDB",
+                    applicability="Yes Yes",
+                    category="Required",
+                ),
+            ],
+        ),
+    ],
+)
+def test_foo(file: str, expected: list[mtgi.MISRA_Rule] | None) -> None:
+    assert mtgi.convert_md(DATA_DIR / f"{file}.md") == expected
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+\| Guideline \| Guideline Name \| MISRA C:2025 Status \| Decidability \| Scope \| Rationale \| Applicability \| Adjusted Category \|`
	`2`	`+\| --- \| --- \| --- \| --- \| --- \| --- \| --- \| --- \|`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+\| Guideline \| Guideline Name \| MISRA C:2025 Status \| Decidability \| Scope \| Rationale \| Applicability \| Adjusted Category \|`
	`2`	`+\| --- \| --- \| --- \| --- \| --- \| --- \| --- \| --- \|`
	`3`	`+\| D.1.2 \| The use of language extensions should be minimized \| Advisory \| \| \| IDB \| Yes Yes \| Required \|`