diff --git a/.github/workflows/ci-main.yaml b/.github/workflows/ci-main.yaml
index 110788a..7721946 100644
--- a/.github/workflows/ci-main.yaml
+++ b/.github/workflows/ci-main.yaml
@@ -22,43 +22,51 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python-version: ['3.10', '3.11', '3.12', '3.13']
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
 
     steps:
-    - uses: actions/checkout@v5
+      - uses: actions/checkout@v5
 
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v6
-      with:
-        python-version: ${{ matrix.python-version }}
+      - name: Enable long paths on Windows
+        if: runner.os == 'Windows'
+        run: |
+          git config --system core.longpaths true
 
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pytest==7.4.4 pytest-cov==4.1.0
-        pip install -r dev-genai-requirements.txt
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python-version }}
 
-    - name: Run tests - opentelemetry-util-genai-emitters-splunk
-      run: |
-        pip install -e util/opentelemetry-util-genai-emitters-splunk --no-deps
-        python -m pytest util/opentelemetry-util-genai-emitters-splunk/tests/ -v
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pytest==7.4.4 pytest-cov==4.1.0
+          pip install -r dev-genai-requirements.txt
 
-    - name: Run tests - opentelemetry-util-genai-evals
-      run: |
-        pip install -e util/opentelemetry-util-genai-evals --no-deps
-        python -m pytest util/opentelemetry-util-genai-evals/tests/ -v
+      - name: Install all genai packages
+        run: |
+          pip install -e util/opentelemetry-util-genai --no-deps
+          pip install -e util/opentelemetry-util-genai-evals --no-deps
+          pip install -e util/opentelemetry-util-genai-evals-deepeval --no-deps
+          pip install -e util/opentelemetry-util-genai-emitters-splunk --no-deps
+          pip install -e "instrumentation-genai/opentelemetry-instrumentation-langchain[instruments,test]"
 
-    - name: Run tests - opentelemetry-util-genai-evals-deepeval
-      run: |
-        pip install -e util/opentelemetry-util-genai-evals-deepeval --no-deps
-        python -m pytest util/opentelemetry-util-genai-evals-deepeval/tests/ -v
+      - name: Run tests - opentelemetry-util-genai
+        run: |
+          python -m pytest util/opentelemetry-util-genai/tests/ -v --cov=opentelemetry.util.genai --cov-report=term-missing
 
-    - name: Run tests - opentelemetry-instrumentation-langchain
-      run: |
-        pip install -e instrumentation-genai/opentelemetry-instrumentation-langchain --no-deps
-        python -m pytest instrumentation-genai/opentelemetry-instrumentation-langchain/tests/ -v
+      - name: Run tests - opentelemetry-util-genai-emitters-splunk
+        run: |
+          python -m pytest util/opentelemetry-util-genai-emitters-splunk/tests/ -v
 
-    - name: Run tests - opentelemetry-util-genai
-      run: |
-        pip install -e util/opentelemetry-util-genai --no-deps
-        python -m pytest util/opentelemetry-util-genai/tests/ -v --cov=opentelemetry.util.genai --cov-report=term-missing
\ No newline at end of file
+      - name: Run tests - opentelemetry-util-genai-evals
+        run: |
+          python -m pytest util/opentelemetry-util-genai-evals/tests/ -v
+
+      - name: Run tests - opentelemetry-util-genai-evals-deepeval
+        run: |
+          python -m pytest util/opentelemetry-util-genai-evals-deepeval/tests/ -v
+
+      - name: Run tests - opentelemetry-instrumentation-langchain
+        run: |
+          python -m pytest instrumentation-genai/opentelemetry-instrumentation-langchain/tests/ -v
diff --git a/dev-genai-requirements.txt b/dev-genai-requirements.txt
index 61e2ef6..a6daca5 100644
--- a/dev-genai-requirements.txt
+++ b/dev-genai-requirements.txt
@@ -12,7 +12,7 @@ markupsafe>=2.0.1
 codespell==2.1.0
 requests==2.32.3
 ruamel.yaml==0.17.21
-flaky==3.7.0
+flaky>=3.8.1
 pre-commit==3.7.0; python_version >= '3.9'
 pre-commit==3.5.0; python_version < '3.9'
 ruff==0.6.9
diff --git a/dev-requirements.txt b/dev-requirements.txt
index df2e057..c7e718a 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -12,7 +12,7 @@ markupsafe>=2.0.1
 codespell==2.1.0
 requests==2.32.4
 ruamel.yaml==0.17.21
-flaky==3.7.0
+flaky>=3.8.1
 pre-commit==3.7.0; python_version >= '3.9'
 pre-commit==3.5.0; python_version < '3.9'
 ruff==0.6.9
diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/examples/multi_agent_travel_planner/client_server_version/client.py b/instrumentation-genai/opentelemetry-instrumentation-langchain/examples/multi_agent_travel_planner/client_server_version/client.py
index 9aac647..b5688e2 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-langchain/examples/multi_agent_travel_planner/client_server_version/client.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/examples/multi_agent_travel_planner/client_server_version/client.py
@@ -98,7 +98,7 @@ def run_client(
     poison_config = None
     if use_poison:
         poison_config = generate_random_poison_config()
-        print(f"\nšŸ’‰ Poison Configuration:")
+        print("\nšŸ’‰ Poison Configuration:")
         print(f" Probability: {poison_config['prob']}")
         print(f" Types: {', '.join(poison_config['types'])}")
         print(f" Max snippets: {poison_config['max']}")
@@ -106,13 +106,13 @@
     # Generate user request
     user_request = generate_travel_request(origin, destination)
 
-    print(f"\nāœ‰ļø User Request:")
+    print("\nāœ‰ļø User Request:")
     print(f" {user_request}")
 
     # Get server URL from environment or default to localhost
     server_url = os.getenv("SERVER_URL", "http://localhost:8080")
 
-    print(f"\nšŸ”Œ Connecting to Flask server...")
+    print("\nšŸ”Œ Connecting to Flask server...")
     print(f" URL: {server_url}")
 
     # Prepare request data
@@ -149,31 +149,31 @@
         print(f"šŸ‘„ Travellers: {result['travellers']}")
 
         if result.get('poison_events'):
-            print(f"\nšŸ’‰ Poison Events Triggered:")
+            print("\nšŸ’‰ Poison Events Triggered:")
             for event in result['poison_events']:
                 print(f" - {event}")
 
-        print(f"\nāœˆļø Flight Summary:")
+        print("\nāœˆļø Flight Summary:")
         print(f" {result['flight_summary']}")
 
-        print(f"\nšŸØ Hotel Summary:")
+        print("\nšŸØ Hotel Summary:")
         print(f" {result['hotel_summary']}")
 
-        print(f"\nšŸŽ­ Activities Summary:")
+        print("\nšŸŽ­ Activities Summary:")
         print(f" {result['activities_summary']}")
 
-        print(f"\nšŸŽ‰ Final Itinerary:")
+        print("\nšŸŽ‰ Final Itinerary:")
         print("─" * 60)
         print(result['final_itinerary'])
         print("─" * 60)
 
         if result.get('agent_steps'):
-            print(f"\nšŸ¤– Agent Steps:")
+            print("\nšŸ¤– Agent Steps:")
             for step in result['agent_steps']:
                 print(f" - {step['agent']}: {step['status']}")
 
     except requests.exceptions.Timeout:
-        print(f"\nāŒ Error: Request timed out after 5 minutes")
+        print("\nāŒ Error: Request timed out after 5 minutes")
         sys.exit(1)
     except requests.exceptions.RequestException as e:
         print(f"\nāŒ Error: Failed to connect to server: {e}")
@@ -184,7 +184,7 @@ def run_client(
         sys.exit(1)
     except KeyError as e:
         print(f"\nāŒ Error: Missing key in response: {e}")
-        print(f"Response:")
+        print("Response:")
         pprint(result)
         sys.exit(1)
 
diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/examples/multi_agent_travel_planner/client_server_version/main.py b/instrumentation-genai/opentelemetry-instrumentation-langchain/examples/multi_agent_travel_planner/client_server_version/main.py
index 5e3437e..7fb6ec7 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-langchain/examples/multi_agent_travel_planner/client_server_version/main.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/examples/multi_agent_travel_planner/client_server_version/main.py
@@ -190,17 +190,15 @@
 from __future__ import annotations
 
-import argparse
 import json
 import os
 import random
 import sys
 
 from datetime import datetime, timedelta
-from typing import Annotated, Any, Dict, List, Optional, TypedDict
+from typing import Annotated, Dict, List, Optional, TypedDict
 from uuid import uuid4
 from pprint import pprint
 
-from dotenv import load_dotenv
 from flask import Flask, request, jsonify
 from langchain_core.messages import (
     AIMessage,
@@ -220,7 +218,7 @@
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
 
-from opentelemetry.trace import SpanKind, Status, StatusCode, Tracer
+from opentelemetry.trace import SpanKind
 from opentelemetry import _events, _logs, metrics, trace
 from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
 from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
@@ -529,7 +527,7 @@ def pretty_print_message(message, indent=False):
             indented = "\n".join("\t" + c for c in pretty_message.split("\n"))
             print(indented, file=sys.stderr, flush=True)
-    except Exception as e:
+    except Exception:
         # Fallback if pretty_repr fails
         print(f"Message: {message}", file=sys.stderr, flush=True)
 
 
@@ -985,7 +983,7 @@ def plan():
             poison_config=poison_config,
         )
 
-        print(f"[SERVER] Travel plan completed successfully", file=sys.stderr, flush=True)
+        print("[SERVER] Travel plan completed successfully", file=sys.stderr, flush=True)
         print("\n" + "="*80, file=sys.stderr)
         print("TRAVEL PLAN RESULT:", file=sys.stderr)
         pprint(result, stream=sys.stderr)
diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/pyproject.toml b/instrumentation-genai/opentelemetry-instrumentation-langchain/pyproject.toml
index 8e169e5..1c60910 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-langchain/pyproject.toml
+++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/pyproject.toml
@@ -34,6 +34,14 @@ dependencies = [
 instruments = [
     "langchain >= 0.3.21",
 ]
+test = [
+    "langchain-core >= 1.0.0",
+    "langchain-openai >= 1.0.0",
+    "pytest-recording >= 0.13.0",
+    "vcrpy >= 7.0.0",
+    "pyyaml >= 6.0.0",
+    "flaky >= 3.8.1",
+]
 
 [project.entry-points.opentelemetry_instrumentor]
 langchain = "opentelemetry.instrumentation.langchain:LangChainInstrumentor"
diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/conftest.py b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/conftest.py
index fe212d5..a238781 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/conftest.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/conftest.py
@@ -138,7 +138,7 @@ def chatOpenAI_client():
     return ChatOpenAI()
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def vcr_config():
     return {
         "filter_headers": [
@@ -149,9 +149,19 @@
         ],
         "decode_compressed_response": True,
         "before_record_response": scrub_response_headers,
+        "serializer": "yaml",
     }
 
 
+@pytest.fixture(scope="session")
+def vcr_cassette_dir():
+    """Override the default cassette directory to avoid nested subdirectories."""
+    import os
+
+    # Return the cassettes directory path
+    return os.path.join(os.path.dirname(__file__), "cassettes")
+
+
 @pytest.fixture(scope="function")
 def instrument_no_content(tracer_provider, event_logger_provider, meter_provider):
     if LangChainInstrumentor is None:  # pragma: no cover - skip when dependency missing
@@ -175,7 +185,20 @@ def instrument_with_content(tracer_provider, event_logger_provider, meter_provid
     if LangChainInstrumentor is None:  # pragma: no cover
         pytest.skip("opentelemetry-instrumentation-langchain not available")
     set_prompt_capture_enabled(True)
+
+    # Reset util-genai singleton handler to ensure clean state
+    import opentelemetry.util.genai.handler as _util_handler_mod  # noqa: PLC0415
+
+    if hasattr(_util_handler_mod.get_telemetry_handler, "_default_handler"):
+        setattr(_util_handler_mod.get_telemetry_handler, "_default_handler", None)
+
+    # Create new instrumentor for each test
     instrumentor = LangChainInstrumentor()
+
+    # If already instrumented (from previous test), uninstrument first
+    if instrumentor._is_instrumented_by_opentelemetry:
+        instrumentor.uninstrument()
+
     instrumentor.instrument(
         tracer_provider=tracer_provider,
         event_logger_provider=event_logger_provider,
@@ -183,8 +206,14 @@ def instrument_with_content(tracer_provider, event_logger_provider, meter_provid
     )
     yield instrumentor
 
+
     set_prompt_capture_enabled(True)
-    instrumentor.uninstrument()
+    # Clean up: uninstrument and reset singleton
+    if instrumentor._is_instrumented_by_opentelemetry:
+        instrumentor.uninstrument()
+
+    if hasattr(_util_handler_mod.get_telemetry_handler, "_default_handler"):
+        setattr(_util_handler_mod.get_telemetry_handler, "_default_handler", None)
 
 
 @pytest.fixture(scope="function")
@@ -222,21 +251,37 @@ def instrument_with_content_util(
             OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "SPAN_ONLY",  # util-genai content gate
         }
     )
 
+    # Reset singleton so new env vars are applied
     import opentelemetry.util.genai.handler as _util_handler_mod  # noqa: PLC0415
 
     if hasattr(_util_handler_mod.get_telemetry_handler, "_default_handler"):
         setattr(_util_handler_mod.get_telemetry_handler, "_default_handler", None)
+
+    # Create new instrumentor for each test
     instrumentor = LangChainInstrumentor()
+
+    # If already instrumented (from previous test), uninstrument first
+    if instrumentor._is_instrumented_by_opentelemetry:
+        instrumentor.uninstrument()
+
     instrumentor.instrument(
         tracer_provider=tracer_provider,
         event_logger_provider=event_logger_provider,
         meter_provider=meter_provider,
     )
+
     yield instrumentor
+
     os.environ.pop(OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT, None)
     set_prompt_capture_enabled(True)
-    instrumentor.uninstrument()
+
+    # Clean up: uninstrument and reset singleton
+    if instrumentor._is_instrumented_by_opentelemetry:
+        instrumentor.uninstrument()
+
+    if hasattr(_util_handler_mod.get_telemetry_handler, "_default_handler"):
+        setattr(_util_handler_mod.get_telemetry_handler, "_default_handler", None)
 
 
 class LiteralBlockScalar(str):
@@ -305,6 +350,11 @@ def deserialize(cassette_string):
 
 try:  # pragma: no cover - optional pytest-vcr dependency
     import pytest_recording  # type: ignore  # noqa: F401
+    import vcr as vcr_module  # type: ignore  # noqa: F401
+
+    # Register custom YAML serializer globally
+    vcr_module.VCR().register_serializer("yaml", PrettyPrintJSONBody)
+
 except ModuleNotFoundError:  # pragma: no cover - provide stub when plugin missing
 
     @pytest.fixture(name="vcr", scope="module")
@@ -316,9 +366,10 @@ def register_serializer(self, *_args, **_kwargs):
         return _VCRStub()
 
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(scope="function", autouse=True)
 def fixture_vcr(vcr):
-    vcr.register_serializer("yaml", PrettyPrintJSONBody)
+    # When pytest-recording is installed, vcr is a Cassette and we don't need to do anything
+    # The serializer is already registered on the VCR module above
     return vcr
 
 
diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/test_callback_handler_agent.py b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/test_callback_handler_agent.py
index 68faaba..319741c 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/test_callback_handler_agent.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/test_callback_handler_agent.py
@@ -4,7 +4,6 @@
 import sys
 from pathlib import Path
 from typing import Any, Optional, Tuple
-from unittest.mock import MagicMock
 from uuid import uuid4
 
 import pytest
@@ -16,7 +15,6 @@
 from opentelemetry.instrumentation.langchain.callback_handler import (  # noqa: E402
     LangchainCallbackHandler,
 )
-from opentelemetry.sdk.trace import TracerProvider  # noqa: E402
 from opentelemetry.util.genai.types import Step, ToolCall  # noqa: E402
 
 try:  # pragma: no cover - optional dependency in CI
@@ -42,6 +40,8 @@ def __init__(self) -> None:
         self.started_steps = []
         self.stopped_steps = []
         self.failed_steps = []
+        self.started_workflows = []
+        self.stopped_workflows = []
         self.entities: dict[str, Any] = {}
 
     def start_agent(self, agent):
@@ -102,6 +102,28 @@ def fail_step(self, step, error):
         self.entities.pop(str(step.run_id), None)
         return step
 
+    def start_workflow(self, workflow):
+        self.started_workflows.append(workflow)
+        self.entities[str(workflow.run_id)] = workflow
+        return workflow
+
+    def stop_workflow(self, workflow):
+        self.stopped_workflows.append(workflow)
+        self.entities.pop(str(workflow.run_id), None)
+        return workflow
+
+    def fail_workflow(self, workflow, error):
+        self.entities.pop(str(workflow.run_id), None)
+        return workflow
+
+    def fail_by_run_id(self, run_id, error):
+        # Simplified implementation for stub - just call fail_agent
+        entity = self.entities.get(str(run_id))
+        if entity is None:
+            return
+        # For simplicity, assume it's an agent
+        self.fail_agent(entity, error)
+
     def get_entity(self, run_id):
         return self.entities.get(str(run_id))
 
@@ -110,12 +132,8 @@ def get_entity(self, run_id):
 def _handler_with_stub_fixture() -> (
     Tuple[LangchainCallbackHandler, _StubTelemetryHandler]
 ):
-    tracer = TracerProvider().get_tracer(__name__)
-    histogram = MagicMock()
-    histogram.record = MagicMock()
-    handler = LangchainCallbackHandler(tracer, histogram, histogram)
     stub = _StubTelemetryHandler()
-    handler._handler = stub  # type: ignore[attr-defined]
+    handler = LangchainCallbackHandler(telemetry_handler=stub)
     return handler, stub
 
 
@@ -337,12 +355,8 @@ def test_step_outputs_recorded_on_chain_end(handler_with_stub):
 @pytest.mark.skipif(not LANGCHAIN_CORE_AVAILABLE, reason="langchain_core not available")
 def test_llm_attributes_independent_of_emitters(monkeypatch):
     def _build_handler() -> Tuple[LangchainCallbackHandler, _StubTelemetryHandler]:
-        tracer = TracerProvider().get_tracer(__name__)
-        histogram = MagicMock()
-        histogram.record = MagicMock()
-        handler = LangchainCallbackHandler(tracer, histogram, histogram)
         stub_handler = _StubTelemetryHandler()
-        handler._telemetry_handler = stub_handler  # type: ignore[attr-defined]
+        handler = LangchainCallbackHandler(telemetry_handler=stub_handler)
         return handler, stub_handler
 
     def _invoke_with_env(env_value: Optional[str]):
diff --git a/util/opentelemetry-util-genai-emitters-splunk/src/opentelemetry/util/genai/emitters/splunk.py b/util/opentelemetry-util-genai-emitters-splunk/src/opentelemetry/util/genai/emitters/splunk.py
index fcd1f7a..ecad904 100644
--- a/util/opentelemetry-util-genai-emitters-splunk/src/opentelemetry/util/genai/emitters/splunk.py
+++ b/util/opentelemetry-util-genai-emitters-splunk/src/opentelemetry/util/genai/emitters/splunk.py
@@ -645,14 +645,14 @@ def splunk_emitters() -> list[EmitterSpec]:
     def _conversation_factory(ctx: Any) -> SplunkConversationEventsEmitter:
         capture_mode = getattr(ctx, "capture_event_content", False)
         return SplunkConversationEventsEmitter(
-            event_logger=getattr(ctx, "content_logger", None),
+            event_logger=getattr(ctx, "event_logger", None),
             capture_content=cast(bool, capture_mode),
         )
 
     def _evaluation_factory(ctx: Any) -> SplunkEvaluationResultsEmitter:
         capture_mode = getattr(ctx, "capture_event_content", False)
         return SplunkEvaluationResultsEmitter(
-            event_logger=getattr(ctx, "content_logger", None),
+            event_logger=getattr(ctx, "event_logger", None),
             capture_content=cast(bool, capture_mode),
         )
 
diff --git a/util/opentelemetry-util-genai-emitters-splunk/tests/test_splunk_emitters.py b/util/opentelemetry-util-genai-emitters-splunk/tests/test_splunk_emitters.py
index 157db27..120eb7d 100644
--- a/util/opentelemetry-util-genai-emitters-splunk/tests/test_splunk_emitters.py
+++ b/util/opentelemetry-util-genai-emitters-splunk/tests/test_splunk_emitters.py
@@ -119,90 +119,114 @@ def test_conversation_event_emission() -> None:
     assert logger.records
     record = logger.records[0]
-    assert record.attributes["event.name"] == "gen_ai.splunk.conversation"
-    assert record.body["conversation"]["inputs"][0]["role"] == "user"
-    assert record.body["conversation"]["outputs"][0]["role"] == "assistant"
-
-
-def test_evaluation_results_aggregation_and_metrics() -> None:
-    logger = _CapturingLogger()
-    meter = _FakeMeter()
-    specs = splunk_emitters()
-    evaluation_spec = next(
-        spec for spec in specs if spec.category == "evaluation"
-    )
-    context = EmitterFactoryContext(
-        tracer=None,
-        meter=meter,
-        event_logger=logger,
-        content_logger=None,
-        evaluation_histogram=None,
-        capture_span_content=False,
-        capture_event_content=True,
-    )
-    emitter = evaluation_spec.factory(context)
-    invocation = _build_invocation()
-
-    results = [
-        EvaluationResult(
-            metric_name="accuracy",
-            score=3.0,
-            label="medium",
-            explanation="Normalized via range",
-            attributes={"range": [0, 4], "judge_model": "llama3"},
-        ),
-        EvaluationResult(
-            metric_name="toxicity/v1",
-            score=0.2,
-            label="low",
-        ),
-        EvaluationResult(
-            metric_name="readability",
-            score=5.0,
-            label="high",
-        ),
-    ]
-
-    emitter.on_evaluation_results(results, invocation)
-
-    assert "gen_ai.evaluation.result.accuracy" in meter.histograms
-    assert (
-        meter.histograms["gen_ai.evaluation.result.accuracy"].records[0][0]
-        == 0.75
-    )
-    assert "gen_ai.evaluation.result.toxicity_v1" in meter.histograms
+    # Updated to match current implementation - uses semantic convention event name
     assert (
-        meter.histograms["gen_ai.evaluation.result.toxicity_v1"].records[0][0]
-        == 0.2
+        record.attributes["event.name"]
+        == "gen_ai.client.inference.operation.details"
     )
-    assert "gen_ai.evaluation.result.readability" not in meter.histograms
-
-    emitter.on_end(invocation)
-
-    assert len(logger.records) == 1
-    record = logger.records[0]
-    assert record.event_name == "gen_ai.splunk.evaluations"
-    evaluations = record.body["evaluations"]
-    assert len(evaluations) == 3
+    assert record.body["gen_ai.input.messages"][0]["role"] == "user"
+    assert record.body["gen_ai.output.messages"][0]["role"] == "assistant"
 
-    accuracy_entry = next(e for e in evaluations if e["name"] == "accuracy")
-    assert accuracy_entry["normalized_score"] == 0.75
-    assert accuracy_entry["range"] == "[0.0,4.0]"
-    assert accuracy_entry["attributes"]["judge_model"] == "llama3"
-    toxicity_entry = next(e for e in evaluations if e["name"] == "toxicity/v1")
-    assert toxicity_entry["normalized_score"] == 0.2
-    assert toxicity_entry["range"] == "[0,1]"
-
-    readability_entry = next(
-        e for e in evaluations if e["name"] == "readability"
-    )
-    assert "normalized_score" not in readability_entry
 
+def test_evaluation_results_aggregation_and_metrics() -> None:
+    import importlib
+    import os
+
+    # Enable message content inclusion for this test
+    os.environ["SPLUNK_EVALUATION_RESULTS_MESSAGE_CONTENT"] = "true"
+    try:
+        # Reload module to pick up environment variable
+        from opentelemetry.util.genai.emitters import splunk as splunk_module
+
+        importlib.reload(splunk_module)
+
+        logger = _CapturingLogger()
+        meter = _FakeMeter()
+        specs = splunk_module.splunk_emitters()
+        evaluation_spec = next(
+            spec for spec in specs if spec.category == "evaluation"
+        )
+        context = EmitterFactoryContext(
+            tracer=None,
+            meter=meter,
+            event_logger=logger,
+            content_logger=None,
+            evaluation_histogram=None,
+            capture_span_content=False,
+            capture_event_content=True,
+        )
+        emitter = evaluation_spec.factory(context)
+        invocation = _build_invocation()
+
+        results = [
+            EvaluationResult(
+                metric_name="accuracy",
+                score=3.0,
+                label="medium",
+                explanation="Normalized via range",
+                attributes={"range": [0, 4], "judge_model": "llama3"},
+            ),
+            EvaluationResult(
+                metric_name="toxicity/v1",
+                score=0.2,
+                label="low",
+            ),
+            EvaluationResult(
+                metric_name="readability",
+                score=5.0,
+                label="high",
+            ),
+        ]
+
+        emitter.on_evaluation_results(results, invocation)
+
+        # Metrics emission has been removed from Splunk emitters
+        # (canonical metrics are handled by core evaluation metrics emitter)
+        # So we no longer check for histograms
+
+        assert len(logger.records) == 1
+        record = logger.records[0]
+        # Updated event name to match current implementation
+        assert record.attributes["event.name"] == "gen_ai.evaluation.results"
+        # Updated body structure to match current implementation
+        evaluations = record.body["gen_ai.evaluations"]
+        assert len(evaluations) == 3
+
+        accuracy_entry = next(
+            e
+            for e in evaluations
+            if e.get("gen_ai.evaluation.name") == "accuracy"
+        )
+        assert accuracy_entry["gen_ai.evaluation.score.value"] == 3.0
+        assert accuracy_entry["gen_ai.evaluation.score.label"] == "medium"
 
-    conversation = record.body["conversation"]
-    assert conversation["inputs"][0]["parts"][0]["content"] == "Hello"
-    assert conversation["system_instructions"] == ["be nice"]
+        toxicity_entry = next(
+            e
+            for e in evaluations
+            if e.get("gen_ai.evaluation.name") == "toxicity/v1"
+        )
+        assert toxicity_entry["gen_ai.evaluation.score.value"] == 0.2
+        assert toxicity_entry["gen_ai.evaluation.score.label"] == "low"
 
-    assert record.attributes["event.name"] == "gen_ai.splunk.evaluations"
-    assert record.attributes["gen_ai.request.model"] == "gpt-test"
-    assert record.attributes["gen_ai.provider.name"] == "openai"
+        readability_entry = next(
+            e
+            for e in evaluations
+            if e.get("gen_ai.evaluation.name") == "readability"
+        )
+        assert readability_entry["gen_ai.evaluation.score.value"] == 5.0
+
+        # Updated body structure for message content (when env var is set)
+        input_messages = record.body["gen_ai.input.messages"]
+        assert input_messages[0]["parts"][0]["content"] == "Hello"
+        system_instructions = record.body["gen_ai.system_instructions"]
+        assert system_instructions == ["be nice"]
+
+        assert record.attributes["event.name"] == "gen_ai.evaluation.results"
+        assert record.attributes["gen_ai.request.model"] == "gpt-test"
+        assert record.attributes["gen_ai.provider.name"] == "openai"
+    finally:
+        # Clean up environment variable and reload module
+        os.environ.pop("SPLUNK_EVALUATION_RESULTS_MESSAGE_CONTENT", None)
+        from opentelemetry.util.genai.emitters import splunk as splunk_module
+
+        importlib.reload(splunk_module)
diff --git a/util/opentelemetry-util-genai-evals-deepeval/pyproject.toml b/util/opentelemetry-util-genai-evals-deepeval/pyproject.toml
index 9f76a29..1a78773 100644
--- a/util/opentelemetry-util-genai-evals-deepeval/pyproject.toml
+++ b/util/opentelemetry-util-genai-evals-deepeval/pyproject.toml
@@ -27,7 +27,7 @@ classifiers = [
 dependencies = [
     "splunk-otel-util-genai>=0.1.0",
     "splunk-otel-util-genai-evals>=0.1.0",
-    "deepeval>=0.21.0",
+    "deepeval>=3.7.0",
     "openai>=1.0.0",
 ]
 
@@ -46,10 +46,7 @@ Repository = "https://github.com/open-telemetry/opentelemetry-python-contrib"
 path = "src/opentelemetry/util/evaluator/version.py"
 
 [tool.hatch.build.targets.sdist]
-include = [
-    "/src",
-    "/tests",
-]
+include = ["/src", "/tests"]
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/opentelemetry"]
diff --git a/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_evaluator.py b/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_evaluator.py
index 13f2304..0b40f6c 100644
--- a/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_evaluator.py
+++ b/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_evaluator.py
@@ -106,7 +106,12 @@ def __init__(self, show_indicator=False, print_results=False):
     eval_cfg_mod.AsyncConfig = AsyncConfig
     eval_cfg_mod.DisplayConfig = DisplayConfig
 
-    def evaluate(test_cases, metrics, async_config=None, display_config=None):
+    def evaluate(
+        test_cases,
+        metrics,
+        async_config=None,
+        display_config=None,
+    ):
         class _Eval:
             test_results = []
 
@@ -222,7 +227,6 @@ def test_default_metrics_covered() -> None:
         "bias",
         "toxicity",
         "answer_relevancy",
-        "faithfulness",
         "hallucination",
         "sentiment",
     }
@@ -259,14 +263,12 @@ def test_evaluator_converts_results(monkeypatch):
     )
 
     monkeypatch.setattr(
-        plugin.DeepevalEvaluator,
-        "_instantiate_metrics",
-        lambda self, specs, test_case: ([object()], []),
+        "opentelemetry.util.evaluator.deepeval._instantiate_metrics",
+        lambda specs, test_case, model: ([object()], []),
     )
     monkeypatch.setattr(
-        plugin.DeepevalEvaluator,
-        "_run_deepeval",
-        lambda self, case, metrics: fake_result,
+        "opentelemetry.util.evaluator.deepeval._run_deepeval",
+        lambda case, metrics, debug_log: fake_result,
     )
 
     results = evaluator.evaluate(invocation)
@@ -274,7 +276,7 @@
     result = results[0]
     assert result.metric_name == "bias"
     assert result.score == 0.8
-    assert result.label == "pass"
+    assert result.label == "Not Biased"
     assert result.explanation == "looks good"
     assert result.attributes["deepeval.threshold"] == 0.7
     assert result.attributes["deepeval.success"] is True
@@ -290,7 +292,7 @@ def test_metric_options_coercion(monkeypatch):
 
     captured = {}
 
-    def fake_instantiate(self, specs, test_case):
+    def fake_instantiate(specs, test_case, model):
         captured.update(specs[0].options)
         return [object()], []
 
@@ -315,21 +317,19 @@ def fake_instantiate(self, specs, test_case):
     )
 
     monkeypatch.setattr(
-        plugin.DeepevalEvaluator,
-        "_instantiate_metrics",
+        "opentelemetry.util.evaluator.deepeval._instantiate_metrics",
         fake_instantiate,
     )
     monkeypatch.setattr(
-        plugin.DeepevalEvaluator,
-        "_run_deepeval",
-        lambda self, case, metrics: fake_result,
+        "opentelemetry.util.evaluator.deepeval._run_deepeval",
+        lambda case, metrics, debug_log: fake_result,
     )
 
     results = evaluator.evaluate(invocation)
     assert captured["threshold"] == 0.9
     assert captured["strict_mode"] is True
     assert captured.get("model", evaluator._default_model()) == "gpt-4o-mini"
-    assert results[0].label == "fail"
+    assert results[0].label == "Biased"
 
 
 def test_evaluator_handles_instantiation_error(monkeypatch):
@@ -338,10 +338,12 @@
         ("bias",), invocation_type="LLMInvocation"
     )
 
-    def boom(self, specs, test_case):
+    def boom(specs, test_case, model):
         raise RuntimeError("boom")
 
-    monkeypatch.setattr(plugin.DeepevalEvaluator, "_instantiate_metrics", boom)
+    monkeypatch.setattr(
+        "opentelemetry.util.evaluator.deepeval._instantiate_metrics", boom
+    )
 
     results = evaluator.evaluate(invocation)
     assert len(results) == 1
@@ -398,7 +400,7 @@ def test_retrieval_context_extracted_from_attributes(monkeypatch):
 
     captured = {}
 
-    def fake_instantiate(self, specs, test_case):
+    def fake_instantiate(specs, test_case, model):
         captured["retrieval_context"] = getattr(
             test_case, "retrieval_context", None
         )
@@ -425,12 +427,12 @@ def fake_instantiate(self, specs, test_case):
     )
 
     monkeypatch.setattr(
-        plugin.DeepevalEvaluator, "_instantiate_metrics", fake_instantiate
+        "opentelemetry.util.evaluator.deepeval._instantiate_metrics",
+        fake_instantiate,
     )
     monkeypatch.setattr(
-        plugin.DeepevalEvaluator,
-        "_run_deepeval",
-        lambda self, case, metrics: fake_result,
+        "opentelemetry.util.evaluator.deepeval._run_deepeval",
+        lambda case, metrics, debug_log: fake_result,
    )
 
     results = evaluator.evaluate(invocation)
diff --git a/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_metric_name_variants.py b/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_metric_name_variants.py
index c7aeb50..c51197c 100644
--- a/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_metric_name_variants.py
+++ b/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_metric_name_variants.py
@@ -49,25 +49,24 @@ def _build_invocation():
 def test_answer_relevancy_variants_normalize(variant, expected_key):
     captured = {}
 
-    def fake_instantiate(self, specs, test_case):
+    def fake_instantiate(specs, test_case, model):
         # capture the normalized internal spec names
         captured["spec_names"] = [s.name for s in specs]
         # return a dummy metric instance so evaluation proceeds to conversion path (which will produce no data)
         return [object()], []
 
     with (
-        patch.object(
-            plugin.DeepevalEvaluator, "_instantiate_metrics", fake_instantiate
+        patch(
+            "opentelemetry.util.evaluator.deepeval._instantiate_metrics",
+            fake_instantiate,
         ),
-        patch.object(
-            plugin.DeepevalEvaluator,
-            "_build_test_case",
-            lambda self, inv, t: object(),
+        patch(
+            "opentelemetry.util.evaluator.deepeval._build_llm_test_case",
+            lambda inv: object(),
         ),
-        patch.object(
-            plugin.DeepevalEvaluator,
-            "_run_deepeval",
-            lambda self, case, metrics: type(
+        patch(
+            "opentelemetry.util.evaluator.deepeval._run_deepeval",
+            lambda case, metrics, debug_log: type(
                 "_DummyEval", (), {"test_results": []}
             )(),
         ),
@@ -85,17 +84,17 @@ def test_unknown_metric_produces_error():
     invalid = "nonexistent-metric"
 
     # Patch _instantiate_metrics to raise the same ValueError pattern used by evaluator for unknown metric registry key
-    def fake_instantiate(self, specs, test_case):
+    def fake_instantiate(specs, test_case, model):
         raise ValueError(f"Unknown Deepeval metric '{invalid}'")
 
     with (
-        patch.object(
-            plugin.DeepevalEvaluator, "_instantiate_metrics", fake_instantiate
+        patch(
+            "opentelemetry.util.evaluator.deepeval._instantiate_metrics",
+            fake_instantiate,
        ),
-        patch.object(
-            plugin.DeepevalEvaluator,
-            "_build_test_case",
-            lambda self, inv, t: object(),
+        patch(
+            "opentelemetry.util.evaluator.deepeval._build_llm_test_case",
+            lambda inv: object(),
         ),
     ):
         evaluator = plugin.DeepevalEvaluator(
diff --git a/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_sentiment_metric.py b/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_sentiment_metric.py
index ff92a74..151037e 100644
--- a/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_sentiment_metric.py
+++ b/util/opentelemetry-util-genai-evals-deepeval/tests/test_deepeval_sentiment_metric.py
@@ -98,7 +98,12 @@ def __init__(self, show_indicator=False, print_results=False):
     eval_cfg_mod.AsyncConfig = AsyncConfig
     eval_cfg_mod.DisplayConfig = DisplayConfig
 
-    def evaluate(test_cases, metrics, async_config=None, display_config=None):
+    def evaluate(
+        test_cases,
+        metrics,
+        async_config=None,
+        display_config=None,
+    ):
         class _Eval:
             test_results = []
 
@@ -181,14 +186,12 @@ def test_sentiment_metric_result_attributes(monkeypatch):
 
     # Bypass instantiation logic to avoid real deepeval dependency usage
     monkeypatch.setattr(
-        plugin.DeepevalEvaluator,
-        "_instantiate_metrics",
-        lambda self, specs, test_case: ([object()], []),
+        "opentelemetry.util.evaluator.deepeval._instantiate_metrics",
+        lambda specs, test_case, model: ([object()], []),
     )
     monkeypatch.setattr(
-        plugin.DeepevalEvaluator,
-        "_run_deepeval",
-        lambda self, case, metrics: fake_result,
+        "opentelemetry.util.evaluator.deepeval._run_deepeval",
+        lambda case, metrics, debug_log: fake_result,
     )
 
     results = evaluator.evaluate(invocation)
diff --git a/util/opentelemetry-util-genai-evals/tests/test_evaluation_dynamic_aggregation.py b/util/opentelemetry-util-genai-evals/tests/test_evaluation_dynamic_aggregation.py
index dab71ec..19b7b6c 100644
--- a/util/opentelemetry-util-genai-evals/tests/test_evaluation_dynamic_aggregation.py
+++ b/util/opentelemetry-util-genai-evals/tests/test_evaluation_dynamic_aggregation.py
@@ -32,10 +32,10 @@ def test_dynamic_aggregation_env_toggle(monkeypatch):  # type: ignore[no-untyped
         [EvaluationResult(metric_name="bias", score=0.1)],
         [EvaluationResult(metric_name="toxicity", score=0.3)],
     ]
-    # Disable internal aggregate flag
-    manager._aggregate_results = False
+    # Set internal aggregate flag to None to enable dynamic env var reading
+    manager._aggregate_results = None
     manager._publish_results(invocation, buckets)
-    assert len(handler.calls) == 2  # two separate batches
+    assert len(handler.calls) == 2  # two separate batches (env var not set)
 
     # Now enable aggregation via env and emit again -> should aggregate
     monkeypatch.setenv(  # type: ignore[attr-defined]
diff --git a/util/opentelemetry-util-genai-evals/tests/test_evaluators.py b/util/opentelemetry-util-genai-evals/tests/test_evaluators.py
index 445b6c9..6ab9e4a 100644
--- a/util/opentelemetry-util-genai-evals/tests/test_evaluators.py
+++ b/util/opentelemetry-util-genai-evals/tests/test_evaluators.py
@@ -237,7 +237,7 @@ def _build_invocation(self) -> LLMInvocation:
     )
     def test_handler_registers_manager(self) -> None:
         with patch(
-            "opentelemetry.util.genai.handler._load_completion_callbacks",
+            "opentelemetry.util.genai.utils.load_completion_callbacks",
             side_effect=_mock_load_callbacks,
         ):
             handler = get_telemetry_handler()
@@ -263,7 +263,7 @@
     )
     def test_handler_evaluate_llm_returns_results(self) -> None:
         with patch(
-            "opentelemetry.util.genai.handler._load_completion_callbacks",
+            "opentelemetry.util.genai.utils.load_completion_callbacks",
             side_effect=_mock_load_callbacks,
         ):
             handler = get_telemetry_handler()
@@ -287,7 +287,7 @@ def test_handler_auto_enables_when_env_missing(self) -> None:
                 return_value={"LLMInvocation": ("static_metric",)},
             ),
             patch(
-                "opentelemetry.util.genai.handler._load_completion_callbacks",
+                "opentelemetry.util.genai.utils.load_completion_callbacks",
                 side_effect=_mock_load_callbacks,
             ),
         ):
@@ -305,7 +305,7 @@
     def test_handler_disables_when_none(self) -> None:
         with patch(
-            "opentelemetry.util.genai.handler._load_completion_callbacks",
+            "opentelemetry.util.genai.utils.load_completion_callbacks",
             side_effect=_mock_load_callbacks,
         ):
             handler = get_telemetry_handler()
 
diff --git a/util/opentelemetry-util-genai/tests/test_async_evaluation.py b/util/opentelemetry-util-genai/tests/test_async_evaluation.py
index b0b2fad..e4635d1 100644
--- a/util/opentelemetry-util-genai/tests/test_async_evaluation.py
+++ b/util/opentelemetry-util-genai/tests/test_async_evaluation.py
@@ -49,7 +49,7 @@ def _mock_load_callbacks(_selected):
         }
 
     with patch(
-        "opentelemetry.util.genai.handler._load_completion_callbacks",
+        "opentelemetry.util.genai.utils.load_completion_callbacks",
         side_effect=_mock_load_callbacks,
     ):
         handler = get_telemetry_handler()
diff --git a/util/opentelemetry-util-genai/tests/test_handler_evaluations.py b/util/opentelemetry-util-genai/tests/test_handler_evaluations.py
index 7fd4554..0236b43 100644
--- a/util/opentelemetry-util-genai/tests/test_handler_evaluations.py
+++ b/util/opentelemetry-util-genai/tests/test_handler_evaluations.py
@@ -66,7 +66,7 @@ def _mock_load_callbacks(_selected):
         }
 
     with patch(
-        "opentelemetry.util.genai.handler._load_completion_callbacks",
+        "opentelemetry.util.genai.utils.load_completion_callbacks",
         side_effect=_mock_load_callbacks,
     ):
         handler = get_telemetry_handler()
diff --git a/util/opentelemetry-util-genai/tests/test_utils.py b/util/opentelemetry-util-genai/tests/test_utils.py
index 644e7e3..86645f2 100644
--- a/util/opentelemetry-util-genai/tests/test_utils.py
+++ b/util/opentelemetry-util-genai/tests/test_utils.py
@@ -173,7 +173,7 @@ def test_llm_start_and_stop_creates_span(self):  # pylint: disable=no-self-use
         assert span_attrs.get("gen_ai.provider.name") == "test-provider"
         assert span.start_time is not None
         assert span.end_time is not None
-        assert span.end_time > span.start_time
+        assert span.end_time >= span.start_time
         assert invocation.attributes.get("custom_attr") == "value"
         assert invocation.attributes.get("extra") == "info"
 