diff --git a/README.md b/README.md
index 4c89191..be2f030 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,7 @@ The basic structure is:
 ```jsonc
 "TFS": [
     {
-        "url": "https://tfs.internal",  // Base URL for a Team Foundation Server (TFS) or Visual Studio Team Services (VSTS) or Azure DevOps instance
+        "url": "https://tfs.internal",  // Base URL for a Team Foundation Server (TFS) or Visual Studio Team Services (VSTS)
         "token": null,  // Private token for accessing this TFS instance
 
         "exclude": [ ... ]  // List of projects / repositories to exclude from inventory
@@ -126,6 +126,17 @@ The basic structure is:
 ]
 ```
 
+```jsonc
+"AzureDevOps": [
+    {
+        "url": "https://dev.azure.com",  // Base URL for an Azure DevOps Server or Azure DevOps Services (cloud) instance
+        "token": null,  // Personal Access Token for accessing this ADO instance (falls back to the ADO_API_TOKEN environment variable)
+        "apiVersion": "6.1-preview",  // REST API version (optional, defaults to "6.1-preview")
+        "exclude": [ ... ]  // List of projects to exclude from inventory
+    }
+]
+```
+
 ## License
 
 Scraper is released under an MIT license. For more details see the
diff --git a/scraper/azuredevops/__init__.py b/scraper/azuredevops/__init__.py
new file mode 100644
index 0000000..c1b1a00
--- /dev/null
+++ b/scraper/azuredevops/__init__.py
@@ -0,0 +1,272 @@
+import base64
+import logging
+import os
+import re
+from typing import List, Optional
+
+import requests
+
+from scraper.azuredevops.models import AzureDevOpsCollection, AzureDevOpsProject
+
+logger = logging.getLogger(__name__)
+
+
+class AzureDevOpsClient:
+    """
+    REST client that lists the projects of an Azure DevOps cloud or on-premises instance
+    """
+
+    def __init__(self, baseurl, api_version, token=None):
+        # Drop trailing slashes so URLs built from the base never contain "//"
+        self.baseurl = baseurl.rstrip("/")
+        self.api_version = api_version
+        self.is_cloud_ado = "dev.azure.com" in baseurl
+        self.session = self._create_client_session(token)
+
+    def get_projects_metadata(self) -> List[AzureDevOpsProject]:
+        """
+        Get metadata for all projects
+        """
+        collections = self._get_all_collections()
+        return self._get_all_projects(collections)
+
+    def _create_client_session(self, token):
+        """
+        Creates the Azure DevOps Client Context with the provided token.
+        If no token is provided, it will look for the ADO_API_TOKEN environment variable.
+        """
+        if token is None:
+            token = os.environ.get("ADO_API_TOKEN", None)
+
+        if token is None:
+            raise RuntimeError("Azure Dev Ops Token was not provided.")
+
+        session = requests.Session()
+        # The token is sent as HTTP Basic auth with an empty user name
+        auth_string = f":{token}"
+        encoded_auth = base64.b64encode(auth_string.encode("ascii")).decode("ascii")
+        session.headers.update(
+            {"Authorization": f"Basic {encoded_auth}", "Accept": "application/json"}
+        )
+        return session
+
+    def _get_org_from_baseurl(self) -> Optional[AzureDevOpsCollection]:
+        """
+        Build a collection from the organization segment of the base URL
+
+        Returns:
+            AzureDevOpsCollection: The organization, or None when the base URL
+                does not end in an organization name
+        """
+        org_name = self.baseurl.rstrip("/").split("/")[-1]
+        if not org_name or org_name == "dev.azure.com":
+            return None
+
+        logger.debug("Using organization from base address: %s", org_name)
+        return AzureDevOpsCollection(
+            id=org_name,
+            name=org_name,
+            url=f"https://dev.azure.com/{org_name}",
+        )
+
+    def _get_cloud_organizations(self) -> List[AzureDevOpsCollection]:
+        """
+        Get the organizations of the token's user, falling back to the
+        organization named in the base URL when none can be discovered
+
+        Raises:
+            RuntimeError: If the accounts request fails, or if the profile
+                request fails and the base URL names no organization
+        """
+        vssps_api = "https://app.vssps.visualstudio.com/_apis"
+
+        profile_url = f"{vssps_api}/profile/profiles/me?api-version={self.api_version}"
+        profile_response = self.session.get(profile_url)
+
+        if profile_response.status_code != 200:
+            logger.warning(
+                "Failed to retrieve user profile: %s Response: %s",
+                profile_response.status_code,
+                profile_response.text,
+            )
+            logger.warning(
+                "Falling back to base address for organization extraction."
+            )
+            fallback = self._get_org_from_baseurl()
+            if fallback is None:
+                raise RuntimeError(
+                    "Could not determine organization. Please specify organization in the baseurl."
+                )
+            return [fallback]
+
+        profile = profile_response.json()
+
+        # Get user's organizations/accounts
+        accounts_url = f"{vssps_api}/accounts?memberId={profile['id']}&api-version={self.api_version}"
+        accounts_response = self.session.get(accounts_url)
+
+        if accounts_response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to retrieve organizations. Status Code: {accounts_response.status_code} Response: {accounts_response.text}"
+            )
+
+        organizations = []
+        for org in accounts_response.json().get("value") or []:
+            organizations.append(
+                AzureDevOpsCollection(
+                    id=org["accountId"],
+                    name=org["accountName"],
+                    url=f"https://dev.azure.com/{org['accountName']}",
+                )
+            )
+            logger.debug("Found cloud organization: %s", org["accountName"])
+
+        if organizations:
+            return organizations
+
+        logger.warning("No organizations found with your access token.")
+        fallback = self._get_org_from_baseurl()
+        return [] if fallback is None else [fallback]
+
+    def _get_onprem_collections(self) -> List[AzureDevOpsCollection]:
+        """
+        Get the project collections of an on-premises server
+
+        Raises:
+            RuntimeError: If the collections request fails
+        """
+        collections_url = f"{self.baseurl}/_apis/projectcollections?api-version={self.api_version}"
+        collections_response = self.session.get(collections_url)
+
+        if collections_response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to retrieve collections. Status Code: {collections_response.status_code} Response: {collections_response.text}"
+            )
+
+        return [
+            AzureDevOpsCollection(
+                id=collection["id"],
+                name=collection["name"],
+                url=collection["url"],
+            )
+            for collection in collections_response.json().get("value", [])
+        ]
+
+    def _get_all_collections(self) -> List[AzureDevOpsCollection]:
+        """
+        Get all collections from the Azure DevOps API.
+        """
+        if self.is_cloud_ado:
+            collections = self._get_cloud_organizations()
+        else:
+            collections = self._get_onprem_collections()
+
+        logger.debug("Found %d collections/organizations", len(collections))
+        return collections
+
+    def _get_web_url_from_api_url(self, api_url, project_name):
+        """
+        Convert an API URL to a web-accessible URL
+
+        Parameters:
+            api_url (str): API URL for the project
+            project_name (str): Name of the project
+
+        Returns:
+            str: Web URL for the project
+        """
+        if self.is_cloud_ado:
+            # For cloud ADO, convert URL like:
+            # https://dev.azure.com/org-name/_apis/projects/project-id
+            # to: https://dev.azure.com/org-name/project-name
+            match = re.search(r"https://dev\.azure\.com/([^/]+)", api_url)
+            if match:
+                return f"https://dev.azure.com/{match.group(1)}/{project_name}"
+            # No match: use the generic conversion below rather than falling
+            # off the end of the function and returning None
+
+        # For on-premises ADO, convert URL like:
+        # https://server/collection/_apis/projects/project-id
+        # to: https://server/collection/project-name
+        base_url = api_url.split("/_apis/projects")[0]
+        return f"{base_url}/{project_name}"
+
+    def _get_repo_web_url(self, api_url, project_name):
+        """
+        Generate web-accessible URL for repositories page
+
+        Parameters:
+            api_url (str): API URL for the project
+            project_name (str): Name of the project
+
+        Returns:
+            str: Web URL for the project's repositories page
+        """
+        project_web_url = self._get_web_url_from_api_url(api_url, project_name)
+        return f"{project_web_url}/_git"
+
+    def _get_all_projects(
+        self, collections: Optional[List[AzureDevOpsCollection]] = None
+    ) -> List[AzureDevOpsProject]:
+        """
+        Get all projects from the provided collections or from all collections if none are provided
+
+        Parameters:
+            collections (List[AzureDevOpsCollection]): List of collections to get projects from
+
+        Raises:
+            RuntimeError: If a projects request fails
+        """
+        if collections is None:
+            collections = self._get_all_collections()
+
+        projects = []
+        for collection in collections:
+            collection_url = (
+                f"https://dev.azure.com/{collection.name}"
+                if self.is_cloud_ado
+                else f"{self.baseurl}/{collection.name}"
+            )
+            logger.debug("Getting projects from collection: %s", collection_url)
+
+            top = 100
+            project_skip = 0
+            has_more_projects = True
+
+            while has_more_projects:
+                url = f"{collection_url}/_apis/projects?$top={top}&$skip={project_skip}&api-version={self.api_version}&includeCapabilities=true"
+
+                response = self.session.get(url)
+                if response.status_code != 200:
+                    raise RuntimeError(
+                        f"Failed to get projects: {response.status_code}"
+                    )
+
+                page = response.json().get("value", [])
+                for project in page:
+                    project_api_url = project.get("url")
+                    project_name = project.get("name")
+
+                    project_web_url = self._get_web_url_from_api_url(
+                        project_api_url, project_name
+                    )
+                    repo_web_url = self._get_repo_web_url(project_api_url, project_name)
+
+                    projects.append(
+                        AzureDevOpsProject(
+                            project_id=project.get("id"),
+                            project_name=project_name,
+                            project_description=project.get("description") or "",
+                            project_url=project_web_url,
+                            repo_url=repo_web_url,
+                            project_create_time="",  # Not provided in API response
+                            project_last_update_time=project.get("lastUpdateTime"),
+                            collection_or_org_name=collection.name,
+                        )
+                    )
+
+                project_skip += top
+                # A page shorter than `top` means the listing is exhausted
+                has_more_projects = len(page) == top
+
+        return projects
diff --git a/scraper/azuredevops/models.py b/scraper/azuredevops/models.py
new file mode 100644
index 0000000..515c565
--- /dev/null
+++ b/scraper/azuredevops/models.py
@@ -0,0 +1,31 @@
+class AzureDevOpsCollection:
+    """An on-premises project collection or a cloud organization"""
+
+    def __init__(self, id="", name="", url=""):
+        self.id = id
+        self.name = name
+        self.url = url
+
+
+class AzureDevOpsProject:
+    """Project metadata gathered by AzureDevOpsClient"""
+
+    def __init__(
+        self,
+        project_id="",
+        project_name="",
+        project_description="",
+        project_url="",
+        repo_url="",
+        project_create_time="",
+        project_last_update_time="",
+        collection_or_org_name="",
+    ):
+        self.project_id = project_id
+        self.project_name = project_name
+        self.project_description = project_description
+        self.project_url = project_url
+        self.repo_url = repo_url
+        self.project_create_time = project_create_time
+        self.project_last_update_time = project_last_update_time
+        self.collection_or_org_name = collection_or_org_name
diff --git a/scraper/code_gov/__init__.py b/scraper/code_gov/__init__.py
index f5e0a77..e430a8e 100644
--- a/scraper/code_gov/__init__.py
+++ b/scraper/code_gov/__init__.py
@@ -4,6 +4,7 @@
 import logging
 
 from scraper import bitbucket, doecode, github, gitlab, tfs
+from scraper.azuredevops import AzureDevOpsClient
 from scraper.code_gov.models import Metadata, Project
 from scraper.github import gov_orgs
 
@@ -128,6 +129,27 @@ def process_config(config):
             )
             code_gov_metadata["releases"].append(code_gov_project)
 
+    # parse config for AzureDevOps repositories
+    ado_instances = config.get("AzureDevOps", [])
+    for instance in ado_instances:
+        url = instance.get("url")
+        token = instance.get("token", None)
+        # An empty "apiVersion" would produce an invalid "api-version=" query
+        api_version = instance.get("apiVersion") or "6.1-preview"
+        excluded = instance.get("exclude", [])
+
+        ado_client = AzureDevOpsClient(url, api_version, token)
+        projects = ado_client.get_projects_metadata()
+        for project in projects:
+            if project.project_name in excluded:
+                logger.info("Excluding: %s", project.project_name)
+                continue
+
+            code_gov_project = Project.from_ado(
+                project, labor_hours=compute_labor_hours
+            )
+            code_gov_metadata["releases"].append(code_gov_project)
+
     # Handle parsing of DOE CODE records
     doecode_config = config.get("DOE CODE", {})
 
diff --git a/scraper/code_gov/models.py b/scraper/code_gov/models.py
index 02cfe21..5af1e9d 100644
--- a/scraper/code_gov/models.py
+++ b/scraper/code_gov/models.py
@@ -9,6 +9,7 @@
 import gitlab
 from requests.utils import requote_uri
 
+from scraper.azuredevops.models import AzureDevOpsProject
 from scraper.github.util import _license_obj
 from scraper.util import _prune_dict_null_str, labor_hours_from_url
 
@@ -598,7 +599,7 @@ def from_tfs(klass, tfs_project, labor_hours=True):
 
         project["description"] = tfs_project.projectInfo.description
 
-        project["vcs"] = "TFS/AzureDevOps"
+        project["vcs"] = "TFS"
 
         project["permissions"]["license"] = None
 
@@ -629,3 +630,74 @@ def from_tfs(klass, tfs_project, labor_hours=True):
         _prune_dict_null_str(project)
 
         return project
+
+    @classmethod
+    def from_ado(klass, ado_project: AzureDevOpsProject, labor_hours=True):
+        """
+        Creates CodeGovProject object from an AzureDevOps project
+
+        Parameters:
+            ado_project (AzureDevOpsProject): Project metadata to convert
+            labor_hours (bool): Labor hours are not computed for ADO yet; when
+                False, laborHours is explicitly set to 0
+        """
+        project = klass()
+
+        # URL-encode the project URL once and reuse it for both the homepage
+        # and the contact URL (previously the contact URL was always empty)
+        project_web_url = requote_uri(ado_project.project_url)
+
+        # -- REQUIRED FIELDS --
+        project["name"] = ado_project.project_name
+
+        project["repositoryURL"] = requote_uri(ado_project.repo_url)
+
+        project["homepageURL"] = project_web_url
+
+        project["description"] = ado_project.project_description
+
+        project["vcs"] = "AzureDevOps"
+
+        project["permissions"]["license"] = None
+
+        project["tags"] = []
+
+        if labor_hours:
+            logger.debug("Sorry labor hour calculation not currently supported.")
+            # project['laborHours'] = labor_hours_from_url(project['repositoryURL'])
+        else:
+            project["laborHours"] = 0
+
+        # The update time may be missing; date_parse() rejects None
+        # NOTE(review): confirm POLICY_START_DATE and the parsed value are both
+        # timezone-aware (or both naive), otherwise the comparison raises
+        last_update_time = ado_project.project_last_update_time
+        last_update_time_as_date = (
+            date_parse(last_update_time) if last_update_time else None
+        )
+        if (
+            last_update_time_as_date is not None
+            and last_update_time_as_date < POLICY_START_DATE
+        ):
+            project["permissions"]["usageType"] = "exemptByPolicyDate"
+        else:
+            project["permissions"]["usageType"] = "exemptByAgencyMission"
+            project["permissions"][
+                "exemptionText"
+            ] = "This source code resides on a private server and has not been properly evaluated for releaseability."
+
+        project["contact"] = {"email": "", "URL": project_web_url}
+
+        project["date"] = {
+            "lastModified": (
+                last_update_time_as_date.isoformat()
+                if last_update_time_as_date is not None
+                else ""
+            ),
+            "created": "",
+            "metadataLastUpdated": "",
+        }
+
+        _prune_dict_null_str(project)
+
+        return project