Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 57 additions & 35 deletions ontokit/api/routes/pull_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,45 +660,67 @@ async def github_webhook(
push_branch = ref[len("refs/heads/") :]
commits = payload.get("commits", [])

from ontokit.models.remote_sync import RemoteSyncConfig

sync_result = await service.db.execute(
select(RemoteSyncConfig).where(
RemoteSyncConfig.project_id == project_id,
RemoteSyncConfig.enabled.is_(True),
RemoteSyncConfig.frequency == "webhook",
# Loop prevention: skip remote sync check if all commits in this push
# were authored by OntoKit itself (prevents feedback loops when
# OntoKit pushes to the same repo it tracks as upstream).
from ontokit.core.constants import ONTOKIT_COMMITTER_EMAILS

external_commits = [
c
for c in commits
if c.get("committer", {}).get("email") not in ONTOKIT_COMMITTER_EMAILS
]

if not external_commits:
logger.info(
"Skipping remote sync check for project %s: "
"all %d commit(s) in push are OntoKit-authored",
project_id,
len(commits),
)
else:
from ontokit.models.remote_sync import RemoteSyncConfig

sync_result = await service.db.execute(
select(RemoteSyncConfig).where(
RemoteSyncConfig.project_id == project_id,
RemoteSyncConfig.enabled.is_(True),
RemoteSyncConfig.frequency == "webhook",
)
)
)
sync_config = sync_result.scalar_one_or_none()

if sync_config and sync_config.branch == push_branch:
# Check if any commit touches the tracked file
touched_files: set[str] = set()
for commit in commits:
touched_files.update(commit.get("added", []))
touched_files.update(commit.get("modified", []))

if sync_config.file_path in touched_files:
previous_status = sync_config.status
sync_config.status = "checking"
await service.db.commit()

try:
pool = await get_arq_pool()
if pool is not None:
await pool.enqueue_job("run_remote_check_task", str(project_id))
else:
sync_config = sync_result.scalar_one_or_none()

if sync_config and sync_config.branch == push_branch:
# Check if any external commit touches the tracked file
touched_files: set[str] = set()
for commit in external_commits:
touched_files.update(commit.get("added", []))
touched_files.update(commit.get("modified", []))

if sync_config.file_path in touched_files:
Comment on lines +695 to +700
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, locate and check the file
git ls-files | grep -E "pull_requests\.py"

Repository: CatholicOS/ontokit-api

Length of output: 175


🏁 Script executed:

# Check the file size first
wc -l ontokit/api/routes/pull_requests.py

Repository: CatholicOS/ontokit-api

Length of output: 105


🏁 Script executed:

# Read the specific lines and surrounding context
sed -n '685,710p' ontokit/api/routes/pull_requests.py

Repository: CatholicOS/ontokit-api

Length of output: 1351


🏁 Script executed:

# Look at broader context - find where external_commits is defined and what triggers this code
sed -n '650,720p' ontokit/api/routes/pull_requests.py

Repository: CatholicOS/ontokit-api

Length of output: 3264


🏁 Script executed:

# Search for other references to "removed" in the file to understand the context better
grep -n "removed" ontokit/api/routes/pull_requests.py

Repository: CatholicOS/ontokit-api

Length of output: 48


🏁 Script executed:

# Search the entire codebase for usage of "removed" key in commits
rg "removed" --type py -B 2 -A 2

Repository: CatholicOS/ontokit-api

Length of output: 16306


🏁 Script executed:

# Check GitHub webhook structure - search for any documentation or references
rg "commit.*get\(.*removed" --type py

Repository: CatholicOS/ontokit-api

Length of output: 48


🌐 Web query:

GitHub webhook push event commit object structure "removed" field

💡 Result:

In a GitHub “push” webhook delivery (the push event sent to your webhook URL), each item in the top-level commits array is a “commit summary” object that can include these file lists:

{
  "id": "",
  "message": "",
  "timestamp": "",
  "url": "",
  "author": { "name": "", "email": "" },
  "committer": { "name": "", "email": "" },

  "added":    ["path/created.ext", "..."],
  "modified": ["path/changed.ext", "..."],
  "removed":  ["path/deleted.ext", "..."]
}

What removed means: it’s an array of repository-relative file paths that were deleted by that commit (as represented in the push payload’s commit summaries). [1]

Important caveat (common source of confusion): if you’re looking at the push event payload inside GitHub Actions (github.event / event.json), GitHub explicitly does not include the added, removed, or modified arrays in the commit objects there; you must call the API to fetch file-level details. [2][3]

Sources:
[1] Push event payload example showing removed in commit objects: GitHub Events API docs mirror (PushEvent example payload) (docs2.lfe.io)
[2] GitHub Actions docs noting added/removed/modified are not present in Actions push payload: (docs.github.com)
[3] GitHub changelog describing removal of those fields from the Actions push payload: (github.blog)

Citations:


🏁 Script executed:

# Search for tests related to webhook push handling
rg "webhook.*push\|push.*webhook" --type py -i -A 5

Repository: CatholicOS/ontokit-api

Length of output: 48


🏁 Script executed:

# Look for tests of the touched_files logic
rg "touched_files\|external_commits" --type py -B 3 -A 3

Repository: CatholicOS/ontokit-api

Length of output: 48


🏁 Script executed:

# Check if there are any tests in test_pull_requests or test_webhooks
fd -e py "test.*webhook\|test.*pull_request" tests/

Repository: CatholicOS/ontokit-api

Length of output: 48


🏁 Script executed:

# List test files to understand test coverage
find tests -name "*.py" -type f | head -20

Repository: CatholicOS/ontokit-api

Length of output: 693


🏁 Script executed:

# Search for RemoteSyncConfig model to understand the data structure
rg "class RemoteSyncConfig" --type py -A 15

Repository: CatholicOS/ontokit-api

Length of output: 4149


🏁 Script executed:

# Check if there are any existing comments or logic about file deletion handling
rg "delete\|removed" ontokit/api/routes/pull_requests.py -B 2 -A 2

Repository: CatholicOS/ontokit-api

Length of output: 48


🏁 Script executed:

# Search for run_remote_check_task to understand what happens after sync is triggered
rg "run_remote_check_task" --type py -B 2 -A 5

Repository: CatholicOS/ontokit-api

Length of output: 8190


🏁 Script executed:

# Look at the broader context of the webhook handler to confirm understanding
sed -n '640,730p' ontokit/api/routes/pull_requests.py

Repository: CatholicOS/ontokit-api

Length of output: 3765


🏁 Script executed:

# Check test_pull_requests_routes.py for webhook push tests
grep -n "test_push\|external_commits" tests/unit/test_pull_requests_routes.py

Repository: CatholicOS/ontokit-api

Length of output: 442


🏁 Script executed:

# Look at the actual test to see what scenarios are covered
sed -n '1,100p' tests/unit/test_pull_requests_routes.py

Repository: CatholicOS/ontokit-api

Length of output: 2348


🏁 Script executed:

# Look at the specific test cases for push webhook handling
sed -n '525,650p' tests/unit/test_pull_requests_routes.py

Repository: CatholicOS/ontokit-api

Length of output: 4855


🏁 Script executed:

# Look for any other tests related to file deletion in webhooks
grep -n "removed\|delete" tests/unit/test_pull_requests_routes.py

Repository: CatholicOS/ontokit-api

Length of output: 587


Include deleted files when checking whether the tracked file changed.

touched_files only aggregates added and modified commits. If an external push deletes sync_config.file_path, the sync check is never triggered, causing webhook-based sync to miss legitimate upstream changes.

🔧 Suggested fix
                     touched_files: set[str] = set()
                     for commit in external_commits:
-                        touched_files.update(commit.get("added", []))
-                        touched_files.update(commit.get("modified", []))
+                        touched_files.update(commit.get("added") or [])
+                        touched_files.update(commit.get("modified") or [])
+                        touched_files.update(commit.get("removed") or [])
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@ontokit/api/routes/pull_requests.py` around lines 695 - 700, The current
check builds touched_files from commit.get("added", []) and
commit.get("modified", []) and therefore misses deletions; update the loop that
aggregates touched_files (where external_commits is iterated and touched_files
is updated) to also include commit.get("removed", []) so that deletions of
sync_config.file_path will trigger the sync check (ensure the same set[str]
touched_files variable and the if sync_config.file_path in touched_files
condition are used).

previous_status = sync_config.status
sync_config.status = "checking"
await service.db.commit()

try:
pool = await get_arq_pool()
if pool is not None:
await pool.enqueue_job(
"run_remote_check_task", str(project_id)
)
else:
logger.warning(
"ARQ pool is None; skipping remote sync check"
" after webhook push"
)
sync_config.status = previous_status
await service.db.commit()
except Exception:
logger.warning(
"ARQ pool is None; skipping remote sync check after webhook push"
"Failed to queue remote sync check after webhook push",
exc_info=True,
)
sync_config.status = previous_status
await service.db.commit()
except Exception:
logger.warning(
"Failed to queue remote sync check after webhook push",
exc_info=True,
)
sync_config.status = previous_status
await service.db.commit()

return {"status": "ok"}
16 changes: 16 additions & 0 deletions ontokit/core/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""OntoKit application-wide constants."""

# Committer identities used by OntoKit for automated commits.
# Any commit with one of these committer emails is considered OntoKit-authored
# and should be excluded from upstream sync processing to prevent feedback loops.
ONTOKIT_COMMITTER_NAME = "OntoKit"
ONTOKIT_COMMITTER_EMAIL = "noreply@ontokit.dev"

ONTOKIT_SYNC_COMMITTER_NAME = "OntoKit Sync"
ONTOKIT_SYNC_COMMITTER_EMAIL = "sync@ontokit.dev"

# Set of all emails used by OntoKit as committer identity.
# Used by the webhook handler to detect and skip self-authored commits.
ONTOKIT_COMMITTER_EMAILS: frozenset[str] = frozenset(
{ONTOKIT_COMMITTER_EMAIL, ONTOKIT_SYNC_COMMITTER_EMAIL}
)
6 changes: 4 additions & 2 deletions ontokit/git/bare_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,11 @@ def is_initialized(self) -> bool:

def _get_signature(self, name: str | None = None, email: str | None = None) -> pygit2.Signature:
"""Create a pygit2 signature for commits."""
from ontokit.core.constants import ONTOKIT_COMMITTER_EMAIL, ONTOKIT_COMMITTER_NAME

return pygit2.Signature(
name=name or "OntoKit",
email=email or "noreply@ontokit.dev",
name=name or ONTOKIT_COMMITTER_NAME,
email=email or ONTOKIT_COMMITTER_EMAIL,
)

def _resolve_ref(self, ref: str) -> pygit2.Commit:
Expand Down
7 changes: 6 additions & 1 deletion ontokit/services/github_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,12 @@ def _try_merge(
# Create merge commit
local_commit = cast(pygit2.Commit, repo.get(local_oid))
remote_commit = cast(pygit2.Commit, repo.get(remote_oid))
sig = pygit2.Signature("OntoKit Sync", "sync@ontokit.dev")
from ontokit.core.constants import (
ONTOKIT_SYNC_COMMITTER_EMAIL,
ONTOKIT_SYNC_COMMITTER_NAME,
)

sig = pygit2.Signature(ONTOKIT_SYNC_COMMITTER_NAME, ONTOKIT_SYNC_COMMITTER_EMAIL)
repo.create_commit(
f"refs/heads/{branch}",
sig,
Expand Down
100 changes: 100 additions & 0 deletions tests/unit/test_upstream_loop_prevention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Tests for upstream sync loop prevention logic."""

from typing import Any

from ontokit.core.constants import (
ONTOKIT_COMMITTER_EMAIL,
ONTOKIT_COMMITTER_EMAILS,
ONTOKIT_SYNC_COMMITTER_EMAIL,
)


def _make_commit(
email: str,
added: list[str] | None = None,
modified: list[str] | None = None,
) -> dict[str, Any]:
"""Create a minimal GitHub webhook commit payload."""
return {
"id": "abc123",
"committer": {"name": "Test", "email": email},
"added": added or [],
"modified": modified or [],
}


def _filter_external(commits: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Reproduce the filtering logic from the webhook handler."""
return [
c for c in commits if c.get("committer", {}).get("email") not in ONTOKIT_COMMITTER_EMAILS
]


class TestCommitterEmailConstants:
"""Verify identity constants are correctly defined."""

def test_ontokit_emails_in_set(self) -> None:
assert ONTOKIT_COMMITTER_EMAIL in ONTOKIT_COMMITTER_EMAILS
assert ONTOKIT_SYNC_COMMITTER_EMAIL in ONTOKIT_COMMITTER_EMAILS

def test_set_is_frozen(self) -> None:
assert isinstance(ONTOKIT_COMMITTER_EMAILS, frozenset)


class TestWebhookCommitFiltering:
"""Test the commit filtering logic used in the push webhook handler."""

def test_all_ontokit_commits_filtered(self) -> None:
"""Push with only OntoKit commits → no external commits → skip sync."""
commits = [
_make_commit(ONTOKIT_COMMITTER_EMAIL, modified=["ontology.ttl"]),
_make_commit(ONTOKIT_SYNC_COMMITTER_EMAIL, modified=["ontology.ttl"]),
]
assert _filter_external(commits) == []

def test_external_commits_pass_through(self) -> None:
"""Push with external commits → they pass through the filter."""
commits = [
_make_commit("dev@example.com", modified=["ontology.ttl"]),
]
result = _filter_external(commits)
assert len(result) == 1
assert result[0]["committer"]["email"] == "dev@example.com"

def test_mixed_commits_only_keep_external(self) -> None:
"""Push with both OntoKit and external commits → only external kept."""
commits = [
_make_commit(ONTOKIT_SYNC_COMMITTER_EMAIL, modified=["ontology.ttl"]),
_make_commit("ci-bot@corp.com", modified=["ontology.ttl"]),
_make_commit(ONTOKIT_COMMITTER_EMAIL, modified=["README.md"]),
]
result = _filter_external(commits)
assert len(result) == 1
assert result[0]["committer"]["email"] == "ci-bot@corp.com"

def test_empty_commits_list(self) -> None:
"""Empty push payload → no external commits."""
assert _filter_external([]) == []

def test_commit_without_committer_field(self) -> None:
"""Malformed commit without committer → treated as external (safe default)."""
commits = [{"id": "abc", "added": [], "modified": ["ontology.ttl"]}]
result = _filter_external(commits)
assert len(result) == 1

def test_file_touch_detection_uses_only_external(self) -> None:
"""Only external commits' file lists should matter for triggering sync."""
ontokit_commit = _make_commit(ONTOKIT_SYNC_COMMITTER_EMAIL, modified=["ontology.ttl"])
external_commit = _make_commit("dev@example.com", modified=["README.md"])

external = _filter_external([ontokit_commit, external_commit])

# Collect touched files from external commits only
touched: set[str] = set()
for c in external:
touched.update(c.get("added", []))
touched.update(c.get("modified", []))

# ontology.ttl was only touched by the OntoKit commit, not external
assert "ontology.ttl" not in touched
assert "README.md" in touched
Loading