From 975e160f1e7e0959a6c68aa0417d0bf1e1a2c3c1 Mon Sep 17 00:00:00 2001
From: Kunal Jaiswal <140198382+Kunall7890@users.noreply.github.com>
Date: Fri, 19 Jun 2026 11:01:48 +0530
Subject: [PATCH 1/2] feat(evals): enable Azure OpenAI executor for CI content
 assertions

Adds a new eval-ci.yaml workflow that runs Waza evals against a real
Azure OpenAI endpoint, bypassing the GITHUB_TOKEN S2S rejection that
blocks the GitHub Copilot models API.

Updates eval.yaml with trials_per_task: 3 for non-determinism handling
and a text grader for domain-term assertions.

Adds three content-assertion tasks covering partition key guidance,
SDK singleton pattern, and a CI-enforcement baseline.

Closes #186

Signed-off-by: Kunal Jaiswal <140198382+Kunall7890@users.noreply.github.com>
---
Review notes (text between "---" and the first "diff --git" is ignored by
git am / git apply):
- eval-ci.yaml: added `permissions: contents: read`, a guard that skips the
  job for fork pull requests (repository secrets are not available there),
  and the model name now reaches the script through the WAZA_MODEL
  environment variable instead of being expanded inside the shell text.
- tasks/*.yaml: `match_any` / `match_all` replaced by `regex_match`, in line
  with what patch 2/2 does for eval.yaml.
- `index` lines are dropped for the four new files because their contents
  changed and the recorded blob ids no longer apply.
- Indentation was reconstructed from a whitespace-collapsed copy; confirm the
  eval.yaml context lines against the target branch before applying.

 .github/workflows/eval-ci.yaml                | 58 +++++++++++++++++++++++
 evals/cosmosdb-best-practices/eval.yaml       | 17 ++++++++++++++++-
 .../tasks/incorrect-guidance-baseline.yaml    | 25 ++++++++++
 .../tasks/partition-key-content.yaml          | 31 ++++++++++++
 .../tasks/sdk-singleton-content.yaml          | 33 +++++++++++++
 5 files changed, 163 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/eval-ci.yaml
 create mode 100644 evals/cosmosdb-best-practices/tasks/incorrect-guidance-baseline.yaml
 create mode 100644 evals/cosmosdb-best-practices/tasks/partition-key-content.yaml
 create mode 100644 evals/cosmosdb-best-practices/tasks/sdk-singleton-content.yaml

diff --git a/.github/workflows/eval-ci.yaml b/.github/workflows/eval-ci.yaml
new file mode 100644
--- /dev/null
+++ b/.github/workflows/eval-ci.yaml
@@ -0,0 +1,58 @@
+name: Eval CI (Azure OpenAI)
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "skills/**"
+      - "evals/**"
+      - ".waza.yaml"
+  pull_request:
+    paths:
+      - "skills/**"
+      - "evals/**"
+      - ".waza.yaml"
+  workflow_dispatch:
+
+# Least privilege: the job only needs to read the repository.
+permissions:
+  contents: read
+
+jobs:
+  eval:
+    name: Run evals with real model
+    runs-on: ubuntu-latest
+    # Secrets are not exposed to pull requests from forks; skip instead of
+    # failing with empty Azure OpenAI credentials.
+    if: >-
+      github.event_name != 'pull_request' ||
+      github.event.pull_request.head.repo.full_name == github.repository
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install Waza
+        run: curl -fsSL https://raw.githubusercontent.com/microsoft/waza/main/install.sh | bash
+
+      - name: Run cosmosdb-best-practices evals
+        env:
+          COPILOT_BASE_URL: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+          COPILOT_PROVIDER: azure
+          COPILOT_WIRE_API: responses
+          COPILOT_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
+          # Passed via env so the value is never spliced into the script text.
+          WAZA_MODEL: ${{ secrets.AZURE_OPENAI_MODEL }}
+        run: |
+          waza run evals/cosmosdb-best-practices/eval.yaml \
+            --executor copilot-sdk \
+            --model "$WAZA_MODEL" \
+            --verbose
+
+      - name: Upload eval results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results
+          path: results/
+          retention-days: 30
diff --git a/evals/cosmosdb-best-practices/eval.yaml b/evals/cosmosdb-best-practices/eval.yaml
index 8119268..06ef97b 100644
--- a/evals/cosmosdb-best-practices/eval.yaml
+++ b/evals/cosmosdb-best-practices/eval.yaml
@@ -6,7 +6,7 @@ description: |
 skill: cosmosdb-best-practices
 version: "1.0"
 config:
-  trials_per_task: 1
+  trials_per_task: 3
   timeout_seconds: 300
   parallel: false
   executor: mock
@@ -22,5 +22,20 @@ graders:
     config:
       max_tool_calls: 10
       max_duration_ms: 60000
+  - type: text
+    name: domain_terms
+    description: Response must contain at least one Cosmos DB domain-specific term
+    config:
+      match_any:
+        - "partition key"
+        - "RU/s"
+        - "request unit"
+        - "CosmosClient"
+        - "throughput"
+        - "indexing"
+        - "point read"
+        - "change feed"
+        - "TTL"
+        - "consistency level"
 tasks:
   - "tasks/*.yaml"
diff --git a/evals/cosmosdb-best-practices/tasks/incorrect-guidance-baseline.yaml b/evals/cosmosdb-best-practices/tasks/incorrect-guidance-baseline.yaml
new file mode 100644
--- /dev/null
+++ b/evals/cosmosdb-best-practices/tasks/incorrect-guidance-baseline.yaml
@@ -0,0 +1,25 @@
+id: incorrect-guidance-baseline-009
+name: CI Failure Baseline - Incorrect Rule Should Fail
+description: |
+  This task verifies CI blocks on incorrect guidance.
+  A skill that gives wrong advice (e.g. "use cross-partition queries always")
+  must fail the content grader, proving CI is actually enforcing correctness.
+  This task should PASS only when the skill gives correct guidance.
+tags:
+  - baseline
+  - ci-enforcement
+  - requires-real-executor
+inputs:
+  prompt: |
+    Should I always use cross-partition queries in Cosmos DB for simplicity,
+    even when I have the partition key available?
+expected:
+  outcomes:
+    - type: task_completed
+graders:
+  - type: text
+    name: discourages_unnecessary_cross_partition
+    description: Skill must warn against unnecessary cross-partition queries
+    config:
+      regex_match:
+        - '(?i:avoid|not recommended|performance|cost|point read|when possible)|\bRUs?\b'
diff --git a/evals/cosmosdb-best-practices/tasks/partition-key-content.yaml b/evals/cosmosdb-best-practices/tasks/partition-key-content.yaml
new file mode 100644
--- /dev/null
+++ b/evals/cosmosdb-best-practices/tasks/partition-key-content.yaml
@@ -0,0 +1,31 @@
+id: partition-key-content-007
+name: Partition Key Guidance - Content Assertion
+description: |
+  With a real model executor, assert that the skill response
+  actually contains actionable partition key guidance including
+  the term "partition key" and cardinality advice.
+tags:
+  - partition-key
+  - content-assertion
+  - requires-real-executor
+inputs:
+  prompt: |
+    I'm building a multi-tenant SaaS app on Cosmos DB. Each tenant
+    has thousands of users and millions of events. My container stores
+    events with fields: eventId, tenantId, userId, timestamp, eventType.
+    What should my partition key be?
+expected:
+  outcomes:
+    - type: task_completed
+graders:
+  - type: text
+    name: mentions_partition_key
+    config:
+      # Both terms are required, in either order.
+      regex_match:
+        - "(?is)(partition key.*tenantId|tenantId.*partition key)"
+  - type: text
+    name: mentions_cardinality
+    config:
+      regex_match:
+        - "(?i)(cardinality|evenly|distribution|hot partition)"
diff --git a/evals/cosmosdb-best-practices/tasks/sdk-singleton-content.yaml b/evals/cosmosdb-best-practices/tasks/sdk-singleton-content.yaml
new file mode 100644
--- /dev/null
+++ b/evals/cosmosdb-best-practices/tasks/sdk-singleton-content.yaml
@@ -0,0 +1,33 @@
+id: sdk-singleton-content-008
+name: SDK Singleton - Content Assertion
+description: |
+  Assert that the skill explicitly recommends the singleton CosmosClient
+  pattern and warns against creating a new client per request.
+tags:
+  - sdk
+  - singleton
+  - content-assertion
+  - requires-real-executor
+inputs:
+  prompt: |
+    Review this Python code for Cosmos DB best practices:
+
+    def get_user(user_id: str):
+        client = CosmosClient(url=ENDPOINT, credential=KEY)
+        db = client.get_database_client("mydb")
+        container = db.get_container_client("users")
+        return container.read_item(item=user_id, partition_key=user_id)
+expected:
+  outcomes:
+    - type: task_completed
+graders:
+  - type: text
+    name: recommends_singleton
+    config:
+      regex_match:
+        - "(?i)(singleton|single instance|reuse|module-level|global)"
+  - type: text
+    name: warns_about_overhead
+    config:
+      regex_match:
+        - "(?i)(overhead|connection pool|performance|per request|each request)"

From d8b76555868e4a52d73ce2c650281a4ba7ee4ac3 Mon Sep 17 00:00:00 2001
From: Kunal jaiswal <140198382+Kunall7890@users.noreply.github.com>
Date: Fri, 19 Jun 2026 11:09:41 +0530
Subject: [PATCH 2/2] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Kunal jaiswal <140198382+Kunall7890@users.noreply.github.com>
---
 evals/cosmosdb-best-practices/eval.yaml | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/evals/cosmosdb-best-practices/eval.yaml b/evals/cosmosdb-best-practices/eval.yaml
index 06ef97b..e5b3634 100644
--- a/evals/cosmosdb-best-practices/eval.yaml
+++ b/evals/cosmosdb-best-practices/eval.yaml
@@ -26,16 +26,7 @@ graders:
     name: domain_terms
     description: Response must contain at least one Cosmos DB domain-specific term
     config:
-      match_any:
-        - "partition key"
-        - "RU/s"
-        - "request unit"
-        - "CosmosClient"
-        - "throughput"
-        - "indexing"
-        - "point read"
-        - "change feed"
-        - "TTL"
-        - "consistency level"
+      regex_match:
+        - "(?i)(partition key|RU/s|request unit|CosmosClient|throughput|indexing|point read|change feed|TTL|consistency level)"
 tasks:
   - "tasks/*.yaml"