diff --git a/METRICS.md b/METRICS.md new file mode 100644 index 0000000..ab64ea6 --- /dev/null +++ b/METRICS.md @@ -0,0 +1,84 @@ +# Aggregated Metrics Endpoint + +The model-runner now exposes an aggregated `/metrics` endpoint that collects and labels metrics from all active llama.cpp runners. + +## Overview + +When llama.cpp models are running, each server automatically exposes Prometheus-compatible metrics at its `/metrics` endpoint. The model-runner now aggregates these metrics from all active runners, adds identifying labels, and serves them through a unified `/metrics` endpoint. This provides a comprehensive view of all running models with proper Prometheus labeling. + +## Aggregated Metrics Format + +Instead of exposing metrics from a single runner, the endpoint now aggregates metrics from all active runners and adds labels to identify the source: + +### Example Output + +```prometheus +# HELP llama_prompt_tokens_total Total number of prompt tokens processed +# TYPE llama_prompt_tokens_total counter +llama_prompt_tokens_total{backend="llama.cpp",model="llama3.2:latest",mode="completion"} 4934 +llama_prompt_tokens_total{backend="llama.cpp",model="ai/mxbai-embed-large:335M-F16",mode="embedding"} 4525 + +# HELP llama_generation_tokens_total Total number of tokens generated +# TYPE llama_generation_tokens_total counter +llama_generation_tokens_total{backend="llama.cpp",model="llama3.2:latest",mode="completion"} 2156 + +# HELP llama_requests_total Total number of requests processed +# TYPE llama_requests_total counter +llama_requests_total{backend="llama.cpp",model="llama3.2:latest",mode="completion"} 127 +llama_requests_total{backend="llama.cpp",model="ai/mxbai-embed-large:335M-F16",mode="embedding"} 89 +``` + +### Labels Added + +Each metric is automatically labeled with: +- **`backend`**: The inference backend (e.g., "llama.cpp") +- **`model`**: The model name (e.g., "llama3.2:latest") +- **`mode`**: The operation mode ("completion" or "embedding") + +## Usage + 
+### Enabling Metrics (Default) + +By default, the aggregated metrics endpoint is enabled. Once the model-runner is running, you can access metrics at the path below; while no runners are active, the response contains only the comment `# No active runners`: + +``` +GET /metrics +``` + +### Disabling Metrics + +To disable the metrics endpoint, set the `DISABLE_METRICS` environment variable to `1`: + +```bash +export DISABLE_METRICS=1 +``` + +### TCP Port Access + +If you're running the model-runner with a TCP port (using `MODEL_RUNNER_PORT`), you can access metrics via HTTP: + +```bash +# If MODEL_RUNNER_PORT=8080 +curl http://localhost:8080/metrics +``` + +### Unix Socket Access + +If using Unix sockets (default), you'll need to use a tool that supports Unix socket HTTP requests: + +```bash +# Using curl with Unix socket +curl --unix-socket model-runner.sock http://localhost/metrics +``` + +## Metrics Available + +The aggregated endpoint exposes all metrics from active llama.cpp runners, typically including: + +- **Request metrics**: Total requests, request duration, queue statistics +- **Token metrics**: Prompt tokens, generation tokens, tokens per second +- **Memory metrics**: Memory usage, cache statistics +- **Model metrics**: Model loading status, context usage +- **Performance metrics**: Processing latency, throughput + +All metrics retain their original names and types but gain the additional identifying labels. 
diff --git a/README.md b/README.md index dcfa54b..7de6e17 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,9 @@ curl http://localhost:8080/engines/llama.cpp/v1/chat/completions -X POST -d '{ # Delete a model curl http://localhost:8080/models/ai/smollm2 -X DELETE + +# Get metrics +curl http://localhost:8080/metrics ``` The response will contain the model's reply: @@ -122,3 +125,22 @@ The response will contain the model's reply: } } ``` + +## Metrics + +The Model Runner collects [the metrics endpoint](https://github.com/ggml-org/llama.cpp/tree/master/tools/server#get-metrics-prometheus-compatible-metrics-exporter) of every active llama.cpp server, labels each metric with its backend, model, and mode, and serves the aggregated result at the `/metrics` endpoint. This allows you to monitor model performance, request statistics, and resource usage. + +### Accessing Metrics + +```sh +# Get metrics in Prometheus format +curl http://localhost:8080/metrics +``` + +### Configuration + +- **Enable metrics (default)**: Metrics are enabled by default +- **Disable metrics**: Set the `DISABLE_METRICS=1` environment variable +- **Monitoring integration**: Add the endpoint to your Prometheus configuration + +Check [METRICS.md](./METRICS.md) for more details. 
diff --git a/go.mod b/go.mod index bf3e67d..f122d22 100644 --- a/go.mod +++ b/go.mod @@ -8,11 +8,14 @@ require ( github.com/docker/model-distribution v0.0.0-20250512190053-b3792c042d57 github.com/google/go-containerregistry v0.20.3 github.com/jaypipes/ghw v0.16.0 + github.com/mattn/go-shellwords v1.0.12 github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.1.1 + github.com/prometheus/client_model v0.6.2 + github.com/prometheus/common v0.64.0 github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.10.0 - golang.org/x/sync v0.12.0 + golang.org/x/sync v0.14.0 ) require ( @@ -21,7 +24,7 @@ require ( github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/log v0.1.0 // indirect github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/distribution/reference v0.6.0 // indirect github.com/docker/cli v27.5.0+incompatible // indirect github.com/docker/distribution v2.8.3+incompatible // indirect @@ -34,28 +37,31 @@ require ( github.com/henvic/httpretty v0.1.4 // indirect github.com/jaypipes/pcidb v1.0.1 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.17.11 // indirect - github.com/mattn/go-shellwords v1.0.12 // indirect + github.com/klauspost/compress v1.18.0 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/moby/locker v1.0.1 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect 
github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect github.com/vbatts/tar-split v0.11.6 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/metric v1.35.0 // indirect go.opentelemetry.io/otel/trace v1.35.0 // indirect - golang.org/x/crypto v0.35.0 // indirect - golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect - golang.org/x/mod v0.22.0 // indirect - golang.org/x/sys v0.31.0 // indirect - golang.org/x/tools v0.29.0 // indirect + golang.org/x/crypto v0.37.0 // indirect + golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect + golang.org/x/mod v0.24.0 // indirect + golang.org/x/sys v0.33.0 // indirect + golang.org/x/tools v0.32.0 // indirect gonum.org/v1/gonum v0.15.1 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e // indirect + google.golang.org/grpc v1.72.0 // indirect + google.golang.org/protobuf v1.36.6 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect howett.net/plist v1.0.0 // indirect ) diff --git a/go.sum b/go.sum index 8f2290d..a8fc0d8 100644 --- a/go.sum +++ b/go.sum @@ -27,8 +27,9 @@ github.com/containerd/stargz-snapshotter/estargz v0.16.3/go.mod h1:uyr4BfYfOj3G9 github.com/containerd/typeurl/v2 v2.2.3 h1:yNA/94zxWdvYACdYO8zofhrTVuQY73fFU1y++dYSw40= github.com/containerd/typeurl/v2 v2.2.3/go.mod h1:95ljDnPfD3bAbDJRugOiShd/DlAAsxGtUBhJxIn7SCk= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= 
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/docker/cli v27.5.0+incompatible h1:aMphQkcGtpHixwwhAXJT1rrK/detk2JIvDaFkLctbGM= @@ -58,6 +59,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/go-containerregistry v0.20.3 h1:oNx7IdTI936V8CQRveCjaxOiegWwvM7kqkbXTpyiovI= github.com/google/go-containerregistry v0.20.3/go.mod h1:w00pIgBRDVUDFM6bq+Qx8lwNWK+cxgCuX1vd3PIBDNI= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gpustack/gguf-parser-go v0.14.1 h1:tmz2eTnSEFfE52V10FESqo9oAUquZ6JKQFntWC/wrEg= github.com/gpustack/gguf-parser-go v0.14.1/go.mod h1:GvHh1Kvvq5ojCOsJ5UpwiJJmIjFw3Qk5cW7R+CZ3IJo= github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU= @@ -69,8 +72,8 @@ github.com/jaypipes/pcidb v1.0.1/go.mod h1:6xYUz/yYEyOkIkUt2t2J2folIuZ4Yg6uByCGF github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= -github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 
h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -90,14 +93,21 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= 
+github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4= +github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8= @@ -117,39 +127,43 @@ go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= +go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= +go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= +go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= +go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod 
h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -golang.org/x/crypto v0.35.0 h1:b15kiHdrGCHrP6LvwaQ3c03kgNhhiMgvlhxHQhmg2Xs= -golang.org/x/crypto v0.35.0/go.mod h1:dy7dXNW32cAb/6/PRuTNsix8T+vJAqvuIy5Bli/x0YQ= -golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo= -golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak= -golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= -golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= +golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 h1:yqrTHse8TCMW1M1ZCP+VAR/l0kKxwaAIqN/il7x4voA= +golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8/go.mod h1:tujkw807nyEEAamNbDrEGzRav+ilXA7PCRAd6xsmwiU= +golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= +golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= +golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= 
-golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= -golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= -golang.org/x/tools v0.29.0 h1:Xx0h3TtM9rzQpQuR4dKLrdglAmCEN5Oi+P74JdhdzXE= -golang.org/x/tools v0.29.0/go.mod h1:KMQVMRsVxU6nHCFXrBPhDB8XncLNLM0lIy/F14RP588= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= +golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= +golang.org/x/tools v0.32.0 h1:Q7N1vhpkQv7ybVzLFtTjvQya2ewbwNDZzUgfXGqtMWU= +golang.org/x/tools v0.32.0/go.mod h1:ZxrU41P/wAbZD8EDa6dDCa6XfpkhJ7HFMjHJXfBDu8s= gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= -google.golang.org/genproto/googleapis/rpc v0.0.0-20241021214115-324edc3d5d38 h1:zciRKQ4kBpFgpfC5QQCVtnnNAcLIqweL7plyZRQHVpI= -google.golang.org/genproto/googleapis/rpc v0.0.0-20241021214115-324edc3d5d38/go.mod h1:GX3210XPVPUjJbTUbvwI8f2IpZDMZuPJWDzDuebbviI= -google.golang.org/grpc v1.68.1 h1:oI5oTa11+ng8r8XMMN7jAOmWfPZWbYpCFaMUTACxkM0= -google.golang.org/grpc v1.68.1/go.mod h1:+q1XYFJjShcqn0QZHvCyeR4CXPA+llXIeUIfIe00waw= -google.golang.org/protobuf v1.36.3 h1:82DV7MYdb8anAVi3qge1wSnMDrnKK7ebr+I0hHRN1BU= -google.golang.org/protobuf v1.36.3/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e h1:ztQaXfzEXTmCBvbtWYRhJxW+0iJcz2qXfd38/e9l7bA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/grpc v1.72.0 h1:S7UkcVa60b5AAQTaO6ZKamFp1zMZSU0fGDK2WZLbBnM= +google.golang.org/grpc 
v1.72.0/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/main.go b/main.go index 35a165f..ccf730e 100644 --- a/main.go +++ b/main.go @@ -112,6 +112,18 @@ func main() { router.Handle(route, scheduler) } + // Add metrics endpoint if enabled + if os.Getenv("DISABLE_METRICS") != "1" { + metricsHandler := metrics.NewAggregatedMetricsHandler( + log.WithField("component", "metrics"), + scheduler, + ) + router.Handle("/metrics", metricsHandler) + log.Info("Metrics endpoint enabled at /metrics") + } else { + log.Info("Metrics endpoint disabled") + } + server := &http.Server{Handler: router} serverErrors := make(chan error, 1) diff --git a/pkg/inference/backends/llamacpp/llamacpp_config.go b/pkg/inference/backends/llamacpp/llamacpp_config.go index 767cbb1..c7990ce 100644 --- a/pkg/inference/backends/llamacpp/llamacpp_config.go +++ b/pkg/inference/backends/llamacpp/llamacpp_config.go @@ -15,7 +15,7 @@ type Config struct { // NewDefaultLlamaCppConfig creates a new LlamaCppConfig with default values. 
func NewDefaultLlamaCppConfig() *Config { - args := append([]string{"--jinja", "-ngl", "100"}) + args := append([]string{"--jinja", "-ngl", "100", "--metrics"}) // Special case for Windows ARM64 if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" { diff --git a/pkg/inference/backends/llamacpp/llamacpp_config_test.go b/pkg/inference/backends/llamacpp/llamacpp_config_test.go index c427a4e..d1b937b 100644 --- a/pkg/inference/backends/llamacpp/llamacpp_config_test.go +++ b/pkg/inference/backends/llamacpp/llamacpp_config_test.go @@ -81,6 +81,7 @@ func TestGetArgs(t *testing.T) { expected: []string{ "--jinja", "-ngl", "100", + "--metrics", "--model", modelPath, "--host", socket, }, @@ -91,6 +92,7 @@ func TestGetArgs(t *testing.T) { expected: []string{ "--jinja", "-ngl", "100", + "--metrics", "--model", modelPath, "--host", socket, "--embeddings", diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go index 8fbe721..9b9e3fc 100644 --- a/pkg/inference/scheduling/scheduler.go +++ b/pkg/inference/scheduling/scheduler.go @@ -405,6 +405,84 @@ func (s *Scheduler) Configure(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusAccepted) } +// GetAllActiveRunners returns information about all active runners +func (s *Scheduler) GetAllActiveRunners() []metrics.ActiveRunner { + runningBackends := s.getLoaderStatus(context.Background()) + var activeRunners []metrics.ActiveRunner + + if !s.loader.lock(context.Background()) { + return activeRunners + } + defer s.loader.unlock() + + for _, backend := range runningBackends { + // Find the runner slot for this backend/model combination + key := runnerKey{ + backend: backend.BackendName, + model: backend.ModelName, + mode: parseBackendMode(backend.Mode), + } + + if slot, exists := s.loader.runners[key]; exists { + socket, err := RunnerSocketPath(slot) + if err != nil { + s.log.Warnf("Failed to get socket path for runner %s/%s: %v", backend.BackendName, backend.ModelName, err) + 
continue + } + + activeRunners = append(activeRunners, metrics.ActiveRunner{ + BackendName: backend.BackendName, + ModelName: backend.ModelName, + Mode: backend.Mode, + Socket: socket, + }) + } + } + + return activeRunners +} + +// GetLlamaCppSocket returns the Unix socket path for an active llama.cpp runner +func (s *Scheduler) GetLlamaCppSocket() (string, error) { + runningBackends := s.getLoaderStatus(context.Background()) + + if !s.loader.lock(context.Background()) { + return "", errors.New("failed to acquire loader lock") + } + defer s.loader.unlock() + + // Look for an active llama.cpp backend + for _, backend := range runningBackends { + if backend.BackendName == "llama.cpp" { + // Find the runner slot for this backend/model combination + key := runnerKey{ + backend: backend.BackendName, + model: backend.ModelName, + mode: parseBackendMode(backend.Mode), + } + + if slot, exists := s.loader.runners[key]; exists { + // Use the RunnerSocketPath function to get the socket path + return RunnerSocketPath(slot) + } + } + } + + return "", errors.New("no active llama.cpp backend found") +} + +// parseBackendMode converts a string mode to BackendMode +func parseBackendMode(mode string) inference.BackendMode { + switch mode { + case "completion": + return inference.BackendModeCompletion + case "embedding": + return inference.BackendModeEmbedding + default: + return inference.BackendModeCompletion + } +} + // ServeHTTP implements net/http.Handler.ServeHTTP. 
func (s *Scheduler) ServeHTTP(w http.ResponseWriter, r *http.Request) { s.lock.Lock() diff --git a/pkg/metrics/aggregated_handler.go b/pkg/metrics/aggregated_handler.go new file mode 100644 index 0000000..adaa40c --- /dev/null +++ b/pkg/metrics/aggregated_handler.go @@ -0,0 +1,171 @@ +package metrics + +import ( + "context" + "fmt" + "io" + "net" + "net/http" + "strings" + "sync" + "time" + + "github.com/docker/model-runner/pkg/logging" + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" +) + +// AggregatedMetricsHandler collects metrics from all active runners and aggregates them with labels +type AggregatedMetricsHandler struct { + log logging.Logger + scheduler SchedulerInterface +} + +// NewAggregatedMetricsHandler creates a new aggregated metrics handler +func NewAggregatedMetricsHandler(log logging.Logger, scheduler SchedulerInterface) *AggregatedMetricsHandler { + return &AggregatedMetricsHandler{ + log: log, + scheduler: scheduler, + } +} + +// ServeHTTP implements http.Handler for aggregated metrics +func (h *AggregatedMetricsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, http.StatusText(http.StatusMethodNotAllowed), http.StatusMethodNotAllowed) + return + } + + runners := h.scheduler.GetAllActiveRunners() + if len(runners) == 0 { + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + w.WriteHeader(http.StatusOK) + fmt.Fprintf(w, "# No active runners\n") + return + } + + // Collect and aggregate metrics from all runners + allFamilies := h.collectAndAggregateMetrics(r.Context(), runners) + + // Write aggregated response using Prometheus encoder + h.writeAggregatedMetrics(w, allFamilies) +} + +// collectAndAggregateMetrics fetches metrics from all runners and aggregates them +func (h *AggregatedMetricsHandler) collectAndAggregateMetrics(ctx context.Context, runners []ActiveRunner) map[string]*dto.MetricFamily { + var wg sync.WaitGroup + 
var mu sync.Mutex + allFamilies := make(map[string]*dto.MetricFamily) + + for _, runner := range runners { + wg.Add(1) + go func(runner ActiveRunner) { + defer wg.Done() + + families, err := h.fetchRunnerMetrics(ctx, runner) + if err != nil { + h.log.Warnf("Failed to fetch metrics from runner %s/%s: %v", runner.BackendName, runner.ModelName, err) + return + } + + // Add labels to metrics and merge into allFamilies + labels := map[string]string{ + "backend": runner.BackendName, + "model": runner.ModelName, + "mode": runner.Mode, + } + + mu.Lock() + h.addLabelsAndMerge(families, labels, allFamilies) + mu.Unlock() + }(runner) + } + + wg.Wait() + return allFamilies +} + +// fetchRunnerMetrics fetches and parses metrics from a single runner +func (h *AggregatedMetricsHandler) fetchRunnerMetrics(ctx context.Context, runner ActiveRunner) (map[string]*dto.MetricFamily, error) { + // Create HTTP client for Unix socket communication + client := &http.Client{ + Transport: &http.Transport{ + DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) { + return net.DialTimeout("unix", runner.Socket, 5*time.Second) + }, + }, + Timeout: 10 * time.Second, + } + + // Create request to the runner's metrics endpoint + req, err := http.NewRequestWithContext(ctx, "GET", "http://unix/metrics", nil) + if err != nil { + return nil, fmt.Errorf("failed to create metrics request: %w", err) + } + + // Make the request + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch metrics: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("metrics endpoint returned status %d", resp.StatusCode) + } + + // Read response body + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read metrics response: %w", err) + } + + // Parse metrics using official Prometheus parser + parser := expfmt.TextParser{} + families, err := 
parser.TextToMetricFamilies(strings.NewReader(string(body))) + if err != nil { + return nil, fmt.Errorf("failed to parse metrics: %w", err) + } + + return families, nil +} + +// addLabelsAndMerge adds labels to metrics and merges them into the aggregated families +func (h *AggregatedMetricsHandler) addLabelsAndMerge(families map[string]*dto.MetricFamily, labels map[string]string, allFamilies map[string]*dto.MetricFamily) { + for name, family := range families { + // Add labels to each metric in the family + for _, metric := range family.GetMetric() { + // Add our labels to the existing label pairs + for key, value := range labels { + metric.Label = append(metric.Label, &dto.LabelPair{ + Name: &key, + Value: &value, + }) + } + } + + // Merge into allFamilies + if existingFamily, exists := allFamilies[name]; exists { + // Append metrics to existing family + existingFamily.Metric = append(existingFamily.Metric, family.GetMetric()...) + } else { + // Create new family + allFamilies[name] = family + } + } +} + +// writeAggregatedMetrics writes the aggregated metrics using Prometheus encoder +func (h *AggregatedMetricsHandler) writeAggregatedMetrics(w http.ResponseWriter, families map[string]*dto.MetricFamily) { + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + w.WriteHeader(http.StatusOK) + + // Use Prometheus encoder to write metrics + encoder := expfmt.NewEncoder(w, expfmt.NewFormat(expfmt.TypeTextPlain)) + for _, family := range families { + if err := encoder.Encode(family); err != nil { + h.log.Errorf("Failed to encode metric family %s: %v", *family.Name, err) + continue + } + } +} diff --git a/pkg/metrics/scheduler_proxy.go b/pkg/metrics/scheduler_proxy.go new file mode 100644 index 0000000..fd55658 --- /dev/null +++ b/pkg/metrics/scheduler_proxy.go @@ -0,0 +1,99 @@ +package metrics + +import ( + "io" + "net" + "net/http" + "time" + + "github.com/docker/model-runner/pkg/logging" +) + +// SchedulerMetricsHandler handles metrics requests 
by finding active llama.cpp runners +type SchedulerMetricsHandler struct { + log logging.Logger + scheduler SchedulerInterface +} + +// SchedulerInterface defines the methods we need from the scheduler +type SchedulerInterface interface { + GetRunningBackends(w http.ResponseWriter, r *http.Request) + GetLlamaCppSocket() (string, error) + GetAllActiveRunners() []ActiveRunner +} + +// ActiveRunner contains information about an active runner +type ActiveRunner struct { + BackendName string + ModelName string + Mode string + Socket string +} + +// ServeHTTP implements http.Handler for metrics proxying via scheduler +func (h *SchedulerMetricsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, http.StatusText(http.StatusMethodNotAllowed), http.StatusMethodNotAllowed) + return + } + + // Get the socket path for the active llama.cpp runner + socket, err := h.scheduler.GetLlamaCppSocket() + if err != nil { + h.log.Errorf("Failed to get llama.cpp socket: %v", err) + http.Error(w, "Metrics endpoint not available", http.StatusServiceUnavailable) + return + } + + // Create HTTP client for Unix socket communication + client := &http.Client{ + Transport: &http.Transport{ + Dial: func(network, addr string) (net.Conn, error) { + return net.DialTimeout("unix", socket, 5*time.Second) + }, + }, + Timeout: 10 * time.Second, + } + + // Create request to the backend metrics endpoint + req, err := http.NewRequestWithContext(r.Context(), "GET", "http://unix/metrics", nil) + if err != nil { + h.log.Errorf("Failed to create metrics request: %v", err) + http.Error(w, "Failed to create metrics request", http.StatusInternalServerError) + return + } + + // Forward relevant headers + for key, values := range r.Header { + for _, value := range values { + req.Header.Add(key, value) + } + } + + // Make the request to the backend + resp, err := client.Do(req) + if err != nil { + h.log.Errorf("Failed to fetch metrics from backend: %v", err) + 
http.Error(w, "Backend metrics unavailable", http.StatusServiceUnavailable) + return + } + defer resp.Body.Close() + + // Copy response headers + for key, values := range resp.Header { + for _, value := range values { + w.Header().Add(key, value) + } + } + + // Set status code + w.WriteHeader(resp.StatusCode) + + // Copy response body + if _, err := io.Copy(w, resp.Body); err != nil { + h.log.Errorf("Failed to copy metrics response: %v", err) + return + } + + h.log.Debugf("Successfully proxied metrics request") +}