Add metrics to the Python OpenAI instrumentation (#3180)

Drew Robbins 2025-01-16 07:24:35 +09:00 committed by GitHub
parent 07c97eac38
commit a716949d1c
10 changed files with 763 additions and 9 deletions

View File

@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add example to `opentelemetry-instrumentation-openai-v2`
([#3006](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/3006))
- Support for `AsyncOpenAI/AsyncCompletions` ([#2984](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/2984))
- Add metrics ([#3180](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/3180))
## Version 2.0b0 (2024-11-08)

View File

@@ -7,7 +7,8 @@ OpenTelemetry OpenAI Instrumentation
:target: https://pypi.org/project/opentelemetry-instrumentation-openai-v2/
This library allows tracing LLM requests and logging of messages made by the
`OpenAI Python API library <https://pypi.org/project/openai/>`_.
`OpenAI Python API library <https://pypi.org/project/openai/>`_. It also captures
the duration of operations and the number of tokens used as metrics.
Installation
@@ -74,6 +75,48 @@ To uninstrument clients, call the uninstrument method:
# Uninstrument all clients
OpenAIInstrumentor().uninstrument()
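Once a client is instrumented, each chat completion records one duration
measurement and two token usage measurements (one for input and one for
output tokens). A minimal sketch of an instrumented call (the model name and
prompt are illustrative):

.. code-block:: python

    from openai import OpenAI

    from opentelemetry.instrumentation.openai_v2 import OpenAIInstrumentor

    # Instrument before creating clients; the global tracer, event logger,
    # and meter providers are used unless others are passed to instrument().
    OpenAIInstrumentor().instrument()

    client = OpenAI()
    client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Say this is a test"}],
    )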
Bucket Boundaries
-----------------
This section lists the explicit bucket boundaries that the GenAI semantic conventions recommend for the token usage and operation duration metrics, and shows how to apply them with Views.
The bucket boundaries are defined as follows:
- For `gen_ai.client.token.usage`: [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864]
- For `gen_ai.client.operation.duration`: [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92]
To implement these bucket boundaries, you can create Views in your OpenTelemetry SDK setup. Here is an example:
.. code-block:: python

    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
    from opentelemetry.sdk.metrics.view import (
        ExplicitBucketHistogramAggregation,
        View,
    )
    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
    from opentelemetry.metrics import set_meter_provider

    views = [
        View(
            instrument_name="gen_ai.client.token.usage",
            aggregation=ExplicitBucketHistogramAggregation(
                [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864]
            ),
        ),
        View(
            instrument_name="gen_ai.client.operation.duration",
            aggregation=ExplicitBucketHistogramAggregation(
                [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92]
            ),
        ),
    ]

    metric_exporter = OTLPMetricExporter(endpoint="http://localhost:4317")
    metric_reader = PeriodicExportingMetricReader(metric_exporter)
    provider = MeterProvider(metric_readers=[metric_reader], views=views)
    set_meter_provider(provider)
For more details, refer to the `OpenTelemetry GenAI Metrics documentation <https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/>`_.
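To confirm the Views take effect, you can collect metrics with the SDK's
in-memory reader and inspect the bucket boundaries on the exported data
points. A minimal sketch, reusing the ``views`` list from above:

.. code-block:: python

    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.metrics.export import InMemoryMetricReader

    reader = InMemoryMetricReader()
    provider = MeterProvider(metric_readers=[reader], views=views)

    # ... instrument and make a chat completion request, then:
    data = reader.get_metrics_data()
    for resource_metrics in data.resource_metrics:
        for scope_metrics in resource_metrics.scope_metrics:
            for metric in scope_metrics.metrics:
                # Histogram data points expose the boundaries that were applied.
                print(metric.name, metric.data.data_points[0].explicit_bounds)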
References
----------
* `OpenTelemetry OpenAI Instrumentation <https://opentelemetry-python-contrib.readthedocs.io/en/latest/instrumentation/openai/openai.html>`_

View File

@@ -49,13 +49,18 @@ from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
from opentelemetry.instrumentation.openai_v2.package import _instruments
from opentelemetry.instrumentation.openai_v2.utils import is_content_enabled
from opentelemetry.instrumentation.utils import unwrap
from opentelemetry.metrics import get_meter
from opentelemetry.semconv.schemas import Schemas
from opentelemetry.trace import get_tracer
from .instruments import Instruments
from .patch import async_chat_completions_create, chat_completions_create
class OpenAIInstrumentor(BaseInstrumentor):
def __init__(self):
self._meter = None
def instrumentation_dependencies(self) -> Collection[str]:
return _instruments
@@ -75,12 +80,21 @@ class OpenAIInstrumentor(BaseInstrumentor):
schema_url=Schemas.V1_28_0.value,
event_logger_provider=event_logger_provider,
)
meter_provider = kwargs.get("meter_provider")
self._meter = get_meter(
__name__,
"",
meter_provider,
schema_url=Schemas.V1_28_0.value,
)
instruments = Instruments(self._meter)
wrap_function_wrapper(
module="openai.resources.chat.completions",
name="Completions.create",
wrapper=chat_completions_create(
tracer, event_logger, is_content_enabled()
tracer, event_logger, instruments, is_content_enabled()
),
)
@@ -88,7 +102,7 @@ class OpenAIInstrumentor(BaseInstrumentor):
module="openai.resources.chat.completions",
name="AsyncCompletions.create",
wrapper=async_chat_completions_create(
tracer, event_logger, is_content_enabled()
tracer, event_logger, instruments, is_content_enabled()
),
)

View File

@@ -0,0 +1,11 @@
from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
class Instruments:
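"""Container for the GenAI client metric instruments defined by the semantic conventions."""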
def __init__(self, meter):
self.operation_duration_histogram = (
gen_ai_metrics.create_gen_ai_client_operation_duration(meter)
)
self.token_usage_histogram = (
gen_ai_metrics.create_gen_ai_client_token_usage(meter)
)

View File

@@ -13,6 +13,7 @@
# limitations under the License.
from timeit import default_timer
from typing import Optional
from openai import Stream
@@ -21,8 +22,12 @@ from opentelemetry._events import Event, EventLogger
from opentelemetry.semconv._incubating.attributes import (
gen_ai_attributes as GenAIAttributes,
)
from opentelemetry.semconv._incubating.attributes import (
server_attributes as ServerAttributes,
)
from opentelemetry.trace import Span, SpanKind, Tracer
from .instruments import Instruments
from .utils import (
choice_to_event,
get_llm_request_attributes,
@@ -34,7 +39,10 @@
def chat_completions_create(
tracer: Tracer, event_logger: EventLogger, capture_content: bool
tracer: Tracer,
event_logger: EventLogger,
instruments: Instruments,
capture_content: bool,
):
"""Wrap the `create` method of the `ChatCompletion` class to trace it."""
@@ -54,6 +62,9 @@
message_to_event(message, capture_content)
)
start = default_timer()
result = None
error_type = None
try:
result = wrapped(*args, **kwargs)
if is_streaming(kwargs):
@@ -69,14 +80,27 @@
return result
except Exception as error:
error_type = type(error).__qualname__
handle_span_exception(span, error)
raise
finally:
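# Metrics are recorded whether the call succeeded or raised; on failure,
# error_type (captured above) is attached to the data points.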
duration = max((default_timer() - start), 0)
_record_metrics(
instruments,
duration,
result,
span_attributes,
error_type,
)
return traced_method
def async_chat_completions_create(
tracer: Tracer, event_logger: EventLogger, capture_content: bool
tracer: Tracer,
event_logger: EventLogger,
instruments: Instruments,
capture_content: bool,
):
"""Wrap the `create` method of the `AsyncChatCompletion` class to trace it."""
@@ -96,6 +120,9 @@
message_to_event(message, capture_content)
)
start = default_timer()
result = None
error_type = None
try:
result = await wrapped(*args, **kwargs)
if is_streaming(kwargs):
@@ -111,12 +138,88 @@
return result
except Exception as error:
error_type = type(error).__qualname__
handle_span_exception(span, error)
raise
finally:
duration = max((default_timer() - start), 0)
_record_metrics(
instruments,
duration,
result,
span_attributes,
error_type,
)
return traced_method
def _record_metrics(
instruments: Instruments,
duration: float,
result,
span_attributes: dict,
error_type: Optional[str],
):
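# Attributes shared by the duration and token usage data points.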
common_attributes = {
GenAIAttributes.GEN_AI_OPERATION_NAME: GenAIAttributes.GenAiOperationNameValues.CHAT.value,
GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value,
GenAIAttributes.GEN_AI_REQUEST_MODEL: span_attributes[
GenAIAttributes.GEN_AI_REQUEST_MODEL
],
}
if error_type:
common_attributes["error.type"] = error_type
if result and getattr(result, "model", None):
common_attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL] = result.model
if result and getattr(result, "service_tier", None):
common_attributes[
GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER
] = result.service_tier
if result and getattr(result, "system_fingerprint", None):
common_attributes["gen_ai.openai.response.system_fingerprint"] = (
result.system_fingerprint
)
if ServerAttributes.SERVER_ADDRESS in span_attributes:
common_attributes[ServerAttributes.SERVER_ADDRESS] = span_attributes[
ServerAttributes.SERVER_ADDRESS
]
if ServerAttributes.SERVER_PORT in span_attributes:
common_attributes[ServerAttributes.SERVER_PORT] = span_attributes[
ServerAttributes.SERVER_PORT
]
instruments.operation_duration_histogram.record(
duration,
attributes=common_attributes,
)
if result and getattr(result, "usage", None):
input_attributes = {
**common_attributes,
GenAIAttributes.GEN_AI_TOKEN_TYPE: GenAIAttributes.GenAiTokenTypeValues.INPUT.value,
}
instruments.token_usage_histogram.record(
result.usage.prompt_tokens,
attributes=input_attributes,
)
completion_attributes = {
**common_attributes,
GenAIAttributes.GEN_AI_TOKEN_TYPE: GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value,
}
instruments.token_usage_histogram.record(
result.usage.completion_tokens,
attributes=completion_attributes,
)
def _set_response_attributes(
span, result, event_logger: EventLogger, capture_content: bool
):

View File

@@ -0,0 +1,133 @@
interactions:
- request:
body: |-
{
"messages": [
{
"role": "user",
"content": "Say this is a test"
}
],
"model": "gpt-4o-mini",
"stream": false
}
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
authorization:
- Bearer test_openai_api_key
connection:
- keep-alive
content-length:
- '106'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.26.0
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.26.0
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.5
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: |-
{
"id": "chatcmpl-ASv9R2E7Yhb2e7bj4Xl0qm9s3J42Y",
"object": "chat.completion",
"created": 1731456237,
"model": "gpt-4o-mini-2024-07-18",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "This is a test. How can I assist you further?",
"refusal": null
},
"logprobs": null,
"finish_reason": "stop"
}
],
"service_tier": "default",
"usage": {
"prompt_tokens": 12,
"completion_tokens": 12,
"total_tokens": 24,
"prompt_tokens_details": {
"cached_tokens": 0,
"audio_tokens": 0
},
"completion_tokens_details": {
"reasoning_tokens": 0,
"audio_tokens": 0,
"accepted_prediction_tokens": 0,
"rejected_prediction_tokens": 0
}
},
"system_fingerprint": "fp_0ba0d124f1"
}
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8e1a80679a8311a6-MRS
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Wed, 13 Nov 2024 00:03:58 GMT
Server:
- cloudflare
Set-Cookie: test_set_cookie
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
content-length:
- '796'
openai-organization: test_openai_org_id
openai-processing-ms:
- '359'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999978'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_41ea134c1fc450d4ca4cf8d0c6a7c53a
status:
code: 200
message: OK
version: 1

View File

@@ -0,0 +1,135 @@
interactions:
- request:
body: |-
{
"messages": [
{
"role": "user",
"content": "Say this is a test"
}
],
"model": "gpt-4o-mini",
"stream": false
}
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
authorization:
- Bearer test_openai_api_key
connection:
- keep-alive
content-length:
- '106'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.54.3
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.54.3
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.6
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: |-
{
"id": "chatcmpl-ASYMQRl3A3DXL9FWCK9tnGRcKIO7q",
"object": "chat.completion",
"created": 1731368630,
"model": "gpt-4o-mini-2024-07-18",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "This is a test.",
"refusal": null
},
"logprobs": null,
"finish_reason": "stop"
}
],
"service_tier": "default",
"usage": {
"prompt_tokens": 12,
"completion_tokens": 5,
"total_tokens": 17,
"prompt_tokens_details": {
"cached_tokens": 0,
"audio_tokens": 0
},
"completion_tokens_details": {
"reasoning_tokens": 0,
"audio_tokens": 0,
"accepted_prediction_tokens": 0,
"rejected_prediction_tokens": 0
}
},
"system_fingerprint": "fp_0ba0d124f1"
}
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8e122593ff368bc8-SIN
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Mon, 11 Nov 2024 23:43:50 GMT
Server:
- cloudflare
Set-Cookie: test_set_cookie
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
content-length:
- '765'
openai-organization: test_openai_org_id
openai-processing-ms:
- '287'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '200000'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '199977'
x-ratelimit-reset-requests:
- 8.64s
x-ratelimit-reset-tokens:
- 6ms
x-request-id:
- req_58cff97afd0e7c0bba910ccf0b044a6f
status:
code: 200
message: OK
version: 1

View File

@@ -17,6 +17,17 @@ from opentelemetry.sdk._logs.export import (
InMemoryLogExporter,
SimpleLogRecordProcessor,
)
from opentelemetry.sdk.metrics import (
Histogram,
MeterProvider,
)
from opentelemetry.sdk.metrics.export import (
InMemoryMetricReader,
)
from opentelemetry.sdk.metrics.view import (
ExplicitBucketHistogramAggregation,
View,
)
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
@@ -36,6 +47,12 @@ def fixture_log_exporter():
yield exporter
@pytest.fixture(scope="function", name="metric_reader")
def fixture_metric_reader():
reader = InMemoryMetricReader()
yield reader
@pytest.fixture(scope="function", name="tracer_provider")
def fixture_tracer_provider(span_exporter):
provider = TracerProvider()
@@ -52,6 +69,62 @@ def fixture_event_logger_provider(log_exporter):
return event_logger_provider
@pytest.fixture(scope="function", name="meter_provider")
def fixture_meter_provider(metric_reader):
token_usage_histogram_view = View(
instrument_type=Histogram,
instrument_name="gen_ai.client.token.usage",
aggregation=ExplicitBucketHistogramAggregation(
boundaries=[
1,
4,
16,
64,
256,
1024,
4096,
16384,
65536,
262144,
1048576,
4194304,
16777216,
67108864,
]
),
)
duration_histogram_view = View(
instrument_type=Histogram,
instrument_name="gen_ai.client.operation.duration",
aggregation=ExplicitBucketHistogramAggregation(
boundaries=[
0.01,
0.02,
0.04,
0.08,
0.16,
0.32,
0.64,
1.28,
2.56,
5.12,
10.24,
20.48,
40.96,
81.92,
]
),
)
meter_provider = MeterProvider(
metric_readers=[metric_reader],
views=[token_usage_histogram_view, duration_histogram_view],
)
return meter_provider
@pytest.fixture(autouse=True)
def environment():
if not os.getenv("OPENAI_API_KEY"):
@@ -83,7 +156,9 @@ def vcr_config():
@pytest.fixture(scope="function")
def instrument_no_content(tracer_provider, event_logger_provider):
def instrument_no_content(
tracer_provider, event_logger_provider, meter_provider
):
os.environ.update(
{OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "False"}
)
@@ -92,6 +167,7 @@ def instrument_no_content(tracer_provider, event_logger_provider):
instrumentor.instrument(
tracer_provider=tracer_provider,
event_logger_provider=event_logger_provider,
meter_provider=meter_provider,
)
yield instrumentor
@@ -100,7 +176,9 @@ def instrument_no_content(tracer_provider, event_logger_provider):
@pytest.fixture(scope="function")
def instrument_with_content(tracer_provider, event_logger_provider):
def instrument_with_content(
tracer_provider, event_logger_provider, meter_provider
):
os.environ.update(
{OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "True"}
)
@@ -108,6 +186,7 @@ def instrument_with_content(tracer_provider, event_logger_provider):
instrumentor.instrument(
tracer_provider=tracer_provider,
event_logger_provider=event_logger_provider,
meter_provider=meter_provider,
)
yield instrumentor

View File

@@ -32,6 +32,7 @@ from opentelemetry.semconv._incubating.attributes import (
from opentelemetry.semconv._incubating.attributes import (
server_attributes as ServerAttributes,
)
from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
@pytest.mark.vcr()
@@ -94,7 +95,9 @@ def test_chat_completion_no_content(
assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0])
def test_chat_completion_bad_endpoint(span_exporter, instrument_no_content):
def test_chat_completion_bad_endpoint(
span_exporter, metric_reader, instrument_no_content
):
llm_model_value = "gpt-4o-mini"
messages_value = [{"role": "user", "content": "Say this is a test"}]
@@ -116,10 +119,31 @@ def test_chat_completion_bad_endpoint(span_exporter, instrument_no_content):
"APIConnectionError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE]
)
metrics = metric_reader.get_metrics_data().resource_metrics
assert len(metrics) == 1
metric_data = metrics[0].scope_metrics[0].metrics
duration_metric = next(
(
m
for m in metric_data
if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION
),
None,
)
assert duration_metric is not None
assert duration_metric.data.data_points[0].sum > 0
assert (
duration_metric.data.data_points[0].attributes[
ErrorAttributes.ERROR_TYPE
]
== "APIConnectionError"
)
@pytest.mark.vcr()
def test_chat_completion_404(
span_exporter, openai_client, instrument_no_content
span_exporter, openai_client, metric_reader, instrument_no_content
):
llm_model_value = "this-model-does-not-exist"
messages_value = [{"role": "user", "content": "Say this is a test"}]
@@ -135,6 +159,27 @@
assert_all_attributes(spans[0], llm_model_value)
assert "NotFoundError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE]
metrics = metric_reader.get_metrics_data().resource_metrics
assert len(metrics) == 1
metric_data = metrics[0].scope_metrics[0].metrics
duration_metric = next(
(
m
for m in metric_data
if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION
),
None,
)
assert duration_metric is not None
assert duration_metric.data.data_points[0].sum > 0
assert (
duration_metric.data.data_points[0].attributes[
ErrorAttributes.ERROR_TYPE
]
== "NotFoundError"
)
@pytest.mark.vcr()
def test_chat_completion_extra_params(

View File

@@ -0,0 +1,190 @@
import pytest
from opentelemetry.semconv._incubating.attributes import (
gen_ai_attributes as GenAIAttributes,
)
from opentelemetry.semconv._incubating.attributes import (
server_attributes as ServerAttributes,
)
from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
def assert_all_metric_attributes(data_point):
assert GenAIAttributes.GEN_AI_OPERATION_NAME in data_point.attributes
assert (
data_point.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME]
== GenAIAttributes.GenAiOperationNameValues.CHAT.value
)
assert GenAIAttributes.GEN_AI_SYSTEM in data_point.attributes
assert (
data_point.attributes[GenAIAttributes.GEN_AI_SYSTEM]
== GenAIAttributes.GenAiSystemValues.OPENAI.value
)
assert GenAIAttributes.GEN_AI_REQUEST_MODEL in data_point.attributes
assert (
data_point.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
== "gpt-4o-mini"
)
assert GenAIAttributes.GEN_AI_RESPONSE_MODEL in data_point.attributes
assert (
data_point.attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL]
== "gpt-4o-mini-2024-07-18"
)
assert "gen_ai.openai.response.system_fingerprint" in data_point.attributes
assert (
data_point.attributes["gen_ai.openai.response.system_fingerprint"]
== "fp_0ba0d124f1"
)
assert (
GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER
in data_point.attributes
)
assert (
data_point.attributes[
GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER
]
== "default"
)
assert (
data_point.attributes[ServerAttributes.SERVER_ADDRESS]
== "api.openai.com"
)
@pytest.mark.vcr()
def test_chat_completion_metrics(
metric_reader, openai_client, instrument_with_content
):
llm_model_value = "gpt-4o-mini"
messages_value = [{"role": "user", "content": "Say this is a test"}]
openai_client.chat.completions.create(
messages=messages_value, model=llm_model_value, stream=False
)
metrics = metric_reader.get_metrics_data().resource_metrics
assert len(metrics) == 1
metric_data = metrics[0].scope_metrics[0].metrics
assert len(metric_data) == 2
duration_metric = next(
(
m
for m in metric_data
if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION
),
None,
)
assert duration_metric is not None
assert duration_metric.data.data_points[0].sum > 0
assert_all_metric_attributes(duration_metric.data.data_points[0])
token_usage_metric = next(
(
m
for m in metric_data
if m.name == gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE
),
None,
)
assert token_usage_metric is not None
input_token_usage = next(
(
d
for d in token_usage_metric.data.data_points
if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE]
== GenAIAttributes.GenAiTokenTypeValues.INPUT.value
),
None,
)
assert input_token_usage is not None
assert input_token_usage.sum == 12
# assert against buckets [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864]
assert input_token_usage.bucket_counts[2] == 1
assert_all_metric_attributes(input_token_usage)
output_token_usage = next(
(
d
for d in token_usage_metric.data.data_points
if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE]
== GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value
),
None,
)
assert output_token_usage is not None
assert output_token_usage.sum == 5
# assert against buckets [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864]
assert output_token_usage.bucket_counts[2] == 1
assert_all_metric_attributes(output_token_usage)
@pytest.mark.vcr()
@pytest.mark.asyncio()
async def test_async_chat_completion_metrics(
metric_reader, async_openai_client, instrument_with_content
):
llm_model_value = "gpt-4o-mini"
messages_value = [{"role": "user", "content": "Say this is a test"}]
await async_openai_client.chat.completions.create(
messages=messages_value, model=llm_model_value, stream=False
)
metrics = metric_reader.get_metrics_data().resource_metrics
assert len(metrics) == 1
metric_data = metrics[0].scope_metrics[0].metrics
assert len(metric_data) == 2
duration_metric = next(
(
m
for m in metric_data
if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION
),
None,
)
assert duration_metric is not None
assert duration_metric.data.data_points[0].sum > 0
assert_all_metric_attributes(duration_metric.data.data_points[0])
token_usage_metric = next(
(
m
for m in metric_data
if m.name == gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE
),
None,
)
assert token_usage_metric is not None
input_token_usage = next(
(
d
for d in token_usage_metric.data.data_points
if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE]
== GenAIAttributes.GenAiTokenTypeValues.INPUT.value
),
None,
)
assert input_token_usage is not None
assert input_token_usage.sum == 12
assert_all_metric_attributes(input_token_usage)
output_token_usage = next(
(
d
for d in token_usage_metric.data.data_points
if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE]
== GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value
),
None,
)
assert output_token_usage is not None
assert output_token_usage.sum == 12
assert_all_metric_attributes(output_token_usage)