feat(aci): Enforce quota limits when creating metric detectors (#97445)

leedongwei · andrewshie-sentry · commit 1cd76a7af12c · 2025-08-26T11:30:50.000-07:00
diff --git a/src/sentry/features/temporary.py b/src/sentry/features/temporary.py
@@ -535,6 +535,8 @@ def register_temporary_features(manager: FeatureManager):
     manager.add("organizations:workflow-engine-rule-serializers", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
     # Enable single processing of metric issues
     manager.add("organizations:workflow-engine-single-process-metric-issues", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
+    # Enable metric detector limits by plan type
+    manager.add("organizations:workflow-engine-metric-detector-limit", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
     # Enable EventUniqueUserFrequencyConditionWithConditions special alert condition
     manager.add("organizations:event-unique-user-frequency-condition-with-conditions", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
     # Use spans instead of transactions for dynamic sampling calculations. This will become the new default.
diff --git a/src/sentry/incidents/endpoints/organization_alert_rule_index.py b/src/sentry/incidents/endpoints/organization_alert_rule_index.py
@@ -12,7 +12,7 @@
 from rest_framework.request import Request
 from rest_framework.response import Response
 
-from sentry import features, options
+from sentry import features, options, quotas
 from sentry.api.api_owners import ApiOwner
 from sentry.api.api_publish_status import ApiPublishStatus
 from sentry.api.base import Endpoint, region_silo_endpoint
@@ -562,6 +562,19 @@ def check_can_create_alert(self, request: Request, organization: Organization) -
         permission, then we must verify that the user is a team admin with "alerts:write" access to the project(s)
         in their request.
         """
+        if features.has(
+            "organizations:workflow-engine-metric-detector-limit", organization, actor=request.user
+        ):
+            alert_limit = quotas.backend.get_metric_detector_limit(organization.id)
+            alert_count = AlertRule.objects.fetch_for_organization(
+                organization=organization
+            ).count()
+
+            if alert_limit >= 0 and alert_count >= alert_limit:
+                raise ValidationError(
+                    f"You may not exceed {alert_limit} metric alerts on your current plan."
+                )
+
         # if the requesting user has any of these org-level permissions, then they can create an alert
         if (
             request.access.has_scope("alerts:write")
diff --git a/src/sentry/incidents/metric_issue_detector.py b/src/sentry/incidents/metric_issue_detector.py
@@ -3,13 +3,16 @@
 
 from rest_framework import serializers
 
+from sentry import features, quotas
+from sentry.constants import ObjectStatus
 from sentry.incidents.logic import enable_disable_subscriptions
 from sentry.snuba.models import QuerySubscription, SnubaQuery, SnubaQueryEventType
 from sentry.snuba.snuba_query_validator import SnubaQueryValidator
 from sentry.snuba.subscriptions import update_snuba_query
 from sentry.workflow_engine.endpoints.validators.base import (
     BaseDataConditionGroupValidator,
     BaseDetectorTypeValidator,
+    DetectorQuota,
 )
 from sentry.workflow_engine.endpoints.validators.base.data_condition import (
     BaseDataConditionValidator,
@@ -88,6 +91,32 @@ def validate(self, attrs):
 
         return attrs
 
+    def get_quota(self) -> DetectorQuota:
+        """Return metric detector quota usage for the organization in the serializer context."""
+        organization = self.context.get("organization")
+        request = self.context.get("request")
+        if organization is None or request is None:
+            raise serializers.ValidationError("Missing organization/request context")
+
+        unlimited = DetectorQuota(has_exceeded=False, limit=-1, count=-1)
+        flag = "organizations:workflow-engine-metric-detector-limit"
+        # Check the flag first so the quota backend is only queried when limits are enforced.
+        if not features.has(flag, organization, actor=request.user):
+            return unlimited
+
+        # Treat every negative limit as "unlimited", like the metric alert rule endpoint does.
+        detector_limit = quotas.backend.get_metric_detector_limit(organization.id)
+        if detector_limit < 0:
+            return unlimited
+
+        detector_count = Detector.objects.filter(
+            project__organization=organization,
+            type="metric_issue",  # Avoided circular import. TODO: move magic strings to constant file
+            status=ObjectStatus.ACTIVE,
+        ).count()
+        has_exceeded = detector_count >= detector_limit
+        return DetectorQuota(has_exceeded=has_exceeded, limit=detector_limit, count=detector_count)
+
     def update_data_source(self, instance: Detector, data_source: SnubaQueryDataSourceType):
         try:
             source_instance = DataSource.objects.get(detector=instance)
diff --git a/src/sentry/quotas/base.py b/src/sentry/quotas/base.py
@@ -755,3 +755,9 @@ def get_dashboard_limit(self, org_id: int) -> int:
         Returns the maximum number of dashboards allowed for the organization's plan type.
         """
         return -1
+
+    def get_metric_detector_limit(self, org_id: int) -> int:
+        """
+        Returns the metric detector limit for the organization's plan type; -1 means unlimited.
+        """
+        return -1
diff --git a/src/sentry/workflow_engine/endpoints/organization_detector_index.py b/src/sentry/workflow_engine/endpoints/organization_detector_index.py
@@ -314,7 +314,7 @@ def post(self, request: Request, organization: Organization) -> Response:
         if not detector_type:
             raise ValidationError({"type": ["This field is required."]})
 
-        # restrict creating metric issue detectors by plan type
+        # Restrict creating metric issue detectors by plan type
         if detector_type == MetricIssue.slug and not features.has(
             "organizations:incidents", organization, actor=request.user
         ):
diff --git a/src/sentry/workflow_engine/endpoints/validators/base/__init__.py b/src/sentry/workflow_engine/endpoints/validators/base/__init__.py
@@ -6,10 +6,11 @@
     "BaseDataSourceValidator",
     "BaseDetectorTypeValidator",
     "DataSourceCreator",
+    "DetectorQuota",
 ]
 
 from .action import BaseActionValidator
 from .data_condition import AbstractDataConditionValidator, BaseDataConditionValidator
 from .data_condition_group import BaseDataConditionGroupValidator
 from .data_source import BaseDataSourceValidator, DataSourceCreator
-from .detector import BaseDetectorTypeValidator
+from .detector import BaseDetectorTypeValidator, DetectorQuota
diff --git a/src/sentry/workflow_engine/endpoints/validators/base/detector.py b/src/sentry/workflow_engine/endpoints/validators/base/detector.py
@@ -1,4 +1,5 @@
 import builtins
+from dataclasses import dataclass
 from typing import Any
 
 from django.db import router, transaction
@@ -29,6 +30,13 @@
 from sentry.workflow_engine.types import DataConditionType
 
 
+@dataclass(frozen=True)
+class DetectorQuota:
+    has_exceeded: bool  # True when creating another detector must be rejected
+    limit: int  # maximum number of allowed detectors; -1 when no limit applies
+    count: int  # number of detectors counted against the limit; -1 when not computed
+
+
 class BaseDetectorTypeValidator(CamelSnakeSerializer):
     name = serializers.CharField(
         required=True,
@@ -64,6 +72,22 @@ def data_source(self) -> BaseDataSourceValidator:
     def data_conditions(self) -> BaseDataConditionValidator:
         raise NotImplementedError
 
+    def get_quota(self) -> DetectorQuota:
+        return DetectorQuota(has_exceeded=False, limit=-1, count=-1)  # default: no quota applies
+
+    def enforce_quota(self, validated_data) -> None:
+        """
+        Reject detector creation when get_quota() reports that the quota is exceeded.
+
+        Raises a ValidationError stating the current usage; does nothing otherwise.
+        Only Metric Issues Detector has quota limits.
+        """
+        quota = self.get_quota()
+        if quota.has_exceeded:
+            slug = validated_data["type"].slug
+            message = f"Used {quota.count}/{quota.limit} of allowed {slug} monitors."
+            raise serializers.ValidationError(message)
+
     def update(self, instance: Detector, validated_data: dict[str, Any]):
         with transaction.atomic(router.db_for_write(Detector)):
             if "name" in validated_data:
@@ -110,6 +134,10 @@ def update(self, instance: Detector, validated_data: dict[str, Any]):
         return instance
 
     def create(self, validated_data):
+        # If quotas are exceeded, we will prevent creation of new detectors.
+        # Do not disable or prevent the users from updating existing detectors.
+        self.enforce_quota(validated_data)
+
         with transaction.atomic(router.db_for_write(Detector)):
             condition_group = DataConditionGroup.objects.create(
                 logic_type=DataConditionGroup.Type.ANY,
diff --git a/tests/sentry/incidents/endpoints/test_organization_alert_rule_index.py b/tests/sentry/incidents/endpoints/test_organization_alert_rule_index.py
@@ -30,6 +30,7 @@
     AlertRuleDetectionType,
     AlertRuleSeasonality,
     AlertRuleSensitivity,
+    AlertRuleStatus,
     AlertRuleThresholdType,
     AlertRuleTrigger,
     AlertRuleTriggerAction,
@@ -1640,6 +1641,58 @@ def test_performance_alert(self, record_analytics: MagicMock) -> None:
             resp = self.get_response(self.organization.slug, **valid_alert_rule)
             assert resp.status_code == 201
 
+    @with_feature("organizations:incidents")
+    @with_feature("organizations:workflow-engine-metric-detector-limit")
+    @patch("sentry.quotas.backend.get_metric_detector_limit")
+    def test_metric_alert_limit(self, mock_get_limit: MagicMock) -> None:
+        # Set limit to 2 alert rules
+        mock_get_limit.return_value = 2
+
+        # Create 2 existing metric alert rules (1 pending, 1 snapshot that must not be counted)
+        alert_rule = self.create_alert_rule(organization=self.organization)
+        alert_rule.status = AlertRuleStatus.PENDING.value
+        alert_rule.save()
+
+        alert_rule = self.create_alert_rule(organization=self.organization)
+        alert_rule.status = AlertRuleStatus.SNAPSHOT.value
+        alert_rule.save()
+
+        # Only 1 rule counts toward the limit so far, so creating another should succeed
+        data = deepcopy(self.alert_rule_dict)
+        with outbox_runner():
+            resp = self.get_success_response(
+                self.organization.slug,
+                status_code=201,
+                **data,
+            )
+        alert_rule = AlertRule.objects.get(id=resp.data["id"])
+        assert alert_rule.name == "JustAValidTestRule"
+
+        # 2 rules now count toward the limit of 2, so the next creation must be rejected
+        data = deepcopy(self.alert_rule_dict)
+        self.get_error_response(
+            self.organization.slug,
+            status_code=400,
+            **data,
+        )
+
+    @with_feature("organizations:incidents")
+    @with_feature("organizations:workflow-engine-metric-detector-limit")
+    def test_metric_alert_limit_unlimited_plan(self) -> None:
+        # Seed the organization with several rules; the quota backend is not mocked here
+        for _ in range(5):
+            self.create_alert_rule(organization=self.organization)
+
+        # The plan is expected to be unlimited, so one more rule can still be created
+        with outbox_runner():
+            response = self.get_success_response(
+                self.organization.slug,
+                status_code=201,
+                **self.alert_rule_dict,
+            )
+        created_rule = AlertRule.objects.get(id=response.data["id"])
+        assert created_rule.name == "JustAValidTestRule"
+
 
 @freeze_time()
 class AlertRuleCreateEndpointTestCrashRateAlert(AlertRuleIndexBase):
diff --git a/tests/sentry/incidents/endpoints/validators/test_validators.py b/tests/sentry/incidents/endpoints/validators/test_validators.py
@@ -1,8 +1,9 @@
 from unittest import mock
 
-from rest_framework.exceptions import ErrorDetail
+from rest_framework.exceptions import ErrorDetail, ValidationError
 
 from sentry import audit_log
+from sentry.constants import ObjectStatus
 from sentry.incidents.grouptype import MetricIssue
 from sentry.incidents.metric_issue_detector import (
     MetricIssueComparisonConditionValidator,
@@ -23,6 +24,7 @@
     SnubaQuery,
     SnubaQueryEventType,
 )
+from sentry.testutils.helpers.features import with_feature
 from sentry.workflow_engine.endpoints.validators.utils import get_unknown_detector_type_error
 from sentry.workflow_engine.models import DataCondition, DataConditionGroup, DataSource, Detector
 from sentry.workflow_engine.models.data_condition import Condition
@@ -360,3 +362,48 @@ def test_too_many_conditions(self) -> None:
         assert validator.errors.get("nonFieldErrors") == [
             ErrorDetail(string="Too many conditions", code="invalid")
         ]
+
+    @mock.patch("sentry.quotas.backend.get_metric_detector_limit")
+    def test_enforce_quota_feature_disabled(self, mock_get_limit: mock.MagicMock) -> None:
+        # A limit of 0 would reject every creation if the quota were enforced
+        mock_get_limit.return_value = 0
+        serializer = MetricIssueDetectorValidator(data=self.valid_data, context=self.context)
+        assert serializer.is_valid()
+        assert serializer.save()
+
+    @mock.patch("sentry.quotas.backend.get_metric_detector_limit")
+    @with_feature("organizations:workflow-engine-metric-detector-limit")
+    def test_enforce_quota_within_limit(self, mock_get_limit: mock.MagicMock) -> None:
+        mock_get_limit.return_value = 1
+
+        # Create a non-metric detector; it must not count toward the metric detector quota
+        self.create_detector(
+            project_id=self.project.id,
+            name="Error Detector",
+            status=ObjectStatus.ACTIVE,
+        )
+        # Create 3 inactive metric detectors; only ACTIVE ones count toward the quota
+        for status in [
+            ObjectStatus.DISABLED,
+            ObjectStatus.PENDING_DELETION,
+            ObjectStatus.DELETION_IN_PROGRESS,
+        ]:
+            self.create_detector(
+                project_id=self.project.id,
+                name=f"Inactive Detector {status}",
+                type=MetricIssue.slug,
+                status=status,
+            )
+
+        validator = MetricIssueDetectorValidator(data=self.valid_data, context=self.context)
+        assert validator.is_valid()
+        assert validator.save()
+        mock_get_limit.assert_called_once_with(self.project.organization.id)
+
+        validator = MetricIssueDetectorValidator(data=self.valid_data, context=self.context)
+        assert validator.is_valid()  # validation passes; the quota is enforced on save()
+        with self.assertRaisesMessage(
+            ValidationError,
+            expected_message="Used 1/1 of allowed metric_issue monitors.",
+        ):
+            validator.save()
diff --git a/tests/sentry/workflow_engine/endpoints/test_organization_detector_index.py b/tests/sentry/workflow_engine/endpoints/test_organization_detector_index.py