From a5406278959f02154b3a6fa63b3bf2b155649918 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Gillot-Lamure?= Date: Wed, 27 Nov 2024 20:44:18 +0100 Subject: [PATCH 1/3] Add helper for using GraphQL API --- main.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/main.py b/main.py index 2d85198..fd33026 100755 --- a/main.py +++ b/main.py @@ -659,6 +659,35 @@ def _http_get_json_paginated(self, url: str) -> Iterable[Any]: f"Got {response.status} from {next_url!r}: {body!r}", response ) + def _http_graphql(self, query: str, variables: dict[str, Any] = {}): + request_body = json.dumps({ + "query": query, + "variables": variables + }) + + self.connection.request( + method="POST", + url="/graphql", + headers={ + "Authorization": f"token {self.github_token}", + "Accept": "application/vnd.github.v3+json", + "User-Agent": "Github Access Manager", + }, + body=request_body + ) + # TODO: see _http_get() regarding unimplemented headers + + with self.connection.getresponse() as response: + if 200 <= response.status < 300: + json_response = json.load(response) + if "errors" in json_response: + errors = json_response['errors'] + raise Exception("Got GraphQL errors", errors) + return json_response['data'] + + body = response.read() + raise Exception(f"Got {response.status} from {url!r}: {body!r}", response) + def get_organization(self, org: str) -> Organization: org_data: Dict[str, Any] = self._http_get_json(f"/orgs/{org}") default_repo_permission: str = org_data["default_repository_permission"] From 89b095701d88782e5819d3a0656c1bd4d77bb728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Gillot-Lamure?= Date: Wed, 27 Nov 2024 20:45:07 +0100 Subject: [PATCH 2/3] Replace get_organization_members with GraphQL implementation --- main.py | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/main.py b/main.py index fd33026..7bf8451 100755 --- a/main.py +++ b/main.py @@ -699,29 +699,39 
def get_organization_members(self, org: str) -> Iterable[OrganizationMember]:
    """
    Yield one `OrganizationMember` for every entry in the `membersWithRole`
    connection of organization `org`, fetched through `_http_graphql`.

    Results are requested 100 at a time. Pages are followed with the
    connection's `endCursor` until `hasNextPage` is false, so organizations
    with more than 100 members are listed completely.
    """
    query = """
    query($org: String!, $cursor: String) {
      organization(login: $org) {
        membersWithRole(first: 100, after: $cursor) {
          edges {
            node {
              login
              databaseId
            }
            role
          }
          pageInfo {
            hasNextPage
            endCursor
          }
        }
      }
    }
    """

    # A null cursor requests the first page.
    cursor = None
    while True:
        response = self._http_graphql(query, {"org": org, "cursor": cursor})
        members_with_role = response["organization"]["membersWithRole"]

        for edge in members_with_role["edges"]:
            node = edge["node"]
            yield OrganizationMember(
                user_name=node["login"],
                user_id=node["databaseId"],
                # The role string is lowercased before being handed to
                # OrganizationRole.
                # NOTE(review): presumably the enum values are the lowercase
                # role names; confirm against its definition.
                role=OrganizationRole(edge["role"].lower()),
            )

        page_info = members_with_role["pageInfo"]
        if not page_info["hasNextPage"]:
            break
        cursor = page_info["endCursor"]
def get_organization_repo_to_teams_map(
    self, org: str
) -> dict[int, list[TeamRepositoryAccess]]:
    """
    Return a dict mapping each repository's `databaseId` to the list of
    `TeamRepositoryAccess` entries for the teams of `org` that list it.

    All teams are fetched in a single query; their `repositories`
    connections are paged in lockstep using one shared cursor.

    Raises `RuntimeError` when the organization has more than one page of
    teams, or when the teams report different continuation cursors. Neither
    case is supported by this implementation.
    """
    query = """
    query($org: String!, $cursor: String) {
      organization(login: $org) {
        teams(first: 100) {
          nodes {
            name
            repositories(first: 100, after: $cursor) {
              edges {
                permission
                node {
                  databaseId
                }
              }
              pageInfo {
                hasNextPage
                endCursor
              }
              totalCount
            }
          }
          pageInfo {
            hasNextPage
          }
        }
      }
    }
    """

    repo_to_teams: defaultdict[int, list[TeamRepositoryAccess]] = defaultdict(list)

    cursor = None
    while True:
        response = self._http_graphql(query, {"org": org, "cursor": cursor})
        teams = response["organization"]["teams"]

        # Team pagination is not implemented. Fail loudly (also under
        # `python -O`, where an assert would vanish) instead of silently
        # returning an incomplete map.
        if teams["pageInfo"]["hasNextPage"]:
            raise RuntimeError(
                f"Organization {org!r} has more than 100 teams, "
                "team pagination is not supported"
            )

        next_cursors = set()
        for team in teams["nodes"]:
            repositories = team["repositories"]
            for edge in repositories["edges"]:
                repo_to_teams[edge["node"]["databaseId"]].append(
                    TeamRepositoryAccess(
                        team_name=team["name"],
                        # NOTE(review): presumably RepositoryAccessRole values
                        # are the lowercase permission names; confirm.
                        role=RepositoryAccessRole(edge["permission"].lower()),
                    )
                )

            page_info = repositories["pageInfo"]
            if page_info["hasNextPage"]:
                next_cursors.add(page_info["endCursor"])

        if not next_cursors:
            break

        # The query takes a single $cursor for every team, so lockstep
        # paging only works when all unfinished teams agree on the cursor.
        # NOTE(review): this also assumes the cursor is valid for teams
        # that are already exhausted; confirm against the API.
        if len(next_cursors) > 1:
            raise RuntimeError(
                "Teams returned different repository cursors, cannot page "
                f"them in lockstep: {sorted(next_cursors)!r}"
            )
        (cursor,) = next_cursors

    return dict(repo_to_teams)


def get_organization_repositories(self, org: str) -> Iterable[Repository]:
    """
    Yield a `Repository` for every repository of organization `org`,
    including its direct collaborators and the teams that list it.

    Repositories are fetched 100 per page through `_http_graphql` and all
    pages are followed. Team access comes from
    `get_organization_repo_to_teams_map`, which is called once up front.

    Raises `RuntimeError` when a repository has more than one page of
    direct collaborators, because collaborator pagination is not
    implemented.
    """
    query = """
    query($org: String!, $cursor: String) {
      organization(login: $org) {
        repositories(first: 100, after: $cursor) {
          nodes {
            databaseId
            name
            visibility
            # We query with affiliation=direct to get all users that have explicit
            # access to the repository (i.e. not those who have implicit access
            # through being a member of a group). The default is affiliation=all,
            # which also returns users with implicit access.
            collaborators(affiliation: DIRECT, first: 100) {
              edges {
                node {
                  databaseId
                  login
                }
                permission
              }
              pageInfo {
                hasNextPage
              }
            }
          }
          pageInfo {
            hasNextPage
            endCursor
          }
          totalCount
        }
      }
    }
    """

    repo_to_teams = self.get_organization_repo_to_teams_map(org)

    cursor = None
    while True:
        response = self._http_graphql(query, {"org": org, "cursor": cursor})
        repos = response["organization"]["repositories"]

        for repo in repos["nodes"]:
            repo_id = repo["databaseId"]
            collaborators = repo["collaborators"]

            # Only the first 100 direct collaborators are requested. Refuse
            # to report a truncated access list.
            if collaborators["pageInfo"]["hasNextPage"]:
                raise RuntimeError(
                    f"Repository {repo['name']!r} has more than 100 direct "
                    "collaborators, collaborator pagination is not supported"
                )

            user_access = tuple(sorted(
                UserRepositoryAccess(
                    user_id=collaborator["node"]["databaseId"],
                    user_name=collaborator["node"]["login"],
                    role=RepositoryAccessRole(collaborator["permission"].lower()),
                )
                for collaborator in collaborators["edges"]
            ))
            # Repositories that no team lists get an empty tuple.
            team_access = tuple(sorted(repo_to_teams.get(repo_id, [])))

            yield Repository(
                repo_id=repo_id,
                name=repo["name"],
                visibility=RepositoryVisibility(repo["visibility"].lower()),
                user_access=user_access,
                team_access=team_access,
            )

        page_info = repos["pageInfo"]
        if not page_info["hasNextPage"]:
            break
        cursor = page_info["endCursor"]