# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.

# -- Image configuration
image:
  # -- Image repository
  repository: "vllm/vllm-openai"
  # -- Image tag
  tag: "latest"
  # -- Container launch command
  command:
    - "vllm"
    - "serve"
    - "/data/"
    - "--served-model-name"
    - "opt-125m"
    - "--dtype"
    - "float32"
    - "--block-size"
    - "16"
    - "--host"
    - "0.0.0.0"
    - "--port"
    - "8000"

# -- Container port
containerPort: 8000
# -- Service name
# NOTE(review): left empty on purpose here, so it parses as null (not "");
# confirm how the templates treat an unset service name.
serviceName:
# -- Service port
servicePort: 80
# -- Additional ports configuration
extraPorts: []

# -- Number of replicas
replicaCount: 1

# -- Deployment strategy configuration
deploymentStrategy: {}

# -- Resource configuration
resources:
  # Resource requests; values mirror the limits below.
  requests:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of gpus used
    nvidia.com/gpu: 1
  # Resource limits; values mirror the requests above.
  limits:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of gpus used
    nvidia.com/gpu: 1

# -- Type of gpu used
# "TYPE_GPU_USED" is a placeholder value; replace it with the real GPU model.
gpuModels:
  - "TYPE_GPU_USED"

# -- Autoscaling configuration
autoscaling:
  # -- Enable autoscaling
  enabled: false
  # -- Minimum replicas
  minReplicas: 1
  # -- Maximum replicas
  maxReplicas: 100
  # -- Target CPU utilization for autoscaling
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80

# -- Configmap
configs: {}

# -- Secrets configuration
secrets: {}

# -- External configuration
externalConfigs: []

# -- Custom Objects configuration
customObjects: []

# -- Disruption Budget Configuration
maxUnavailablePodDisruptionBudget: ""

# -- Additional configuration for the init container
extraInit:
  # -- Path of the model on the s3 which hosts model weights and config files
  s3modelpath: "relative_s3_model_path/opt-125m"
  # -- Storage size of the s3
  pvcStorage: "1Gi"
  # NOTE(review): presumably disables AWS EC2 instance-metadata lookups in the
  # init container; confirm against the chart templates.
  awsEc2MetadataDisabled: true

# -- Additional containers configuration
extraContainers: []

# -- Readiness probe configuration
readinessProbe:
  # -- Number of seconds after the container has started before readiness
  # probe is initiated
  initialDelaySeconds: 5
  # -- How often (in seconds) to perform the readiness probe
  periodSeconds: 5
  # -- Number of times after which if a probe fails in a row, Kubernetes
  # considers that the overall check has failed: the container is not ready
  failureThreshold: 3
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the
    # server is listening
    port: 8000

# -- Liveness probe configuration
livenessProbe:
  # -- Number of seconds after the container has started before liveness
  # probe is initiated
  initialDelaySeconds: 15
  # -- Number of times after which if a probe fails in a row, Kubernetes
  # considers that the overall check has failed: the container is not alive
  failureThreshold: 3
  # -- How often (in seconds) to perform the liveness probe
  periodSeconds: 10
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the
    # server is listening
    port: 8000

# Labels for this release.
# NOTE(review): which resources receive these labels is not visible here;
# confirm against the chart templates.
labels:
  environment: "test"
  release: "test"