# Source: production-stack/tutorials/assets/values-02-basic-config.yaml (26 lines, 567 B, YAML)

# Helm values describing one serving-engine model entry ("llama3").
# NOTE(review): the nesting below was reconstructed; in the flattened form every
# key sat at column 0, which parses as a flat top-level mapping instead of a
# servingEngineSpec containing a modelSpec list. Confirm against the chart's
# values schema before deploying.
servingEngineSpec:
  # Empty string; presumably means "use the cluster's default runtime class" — verify.
  runtimeClassName: ""
  modelSpec:
    - name: "llama3"
      # Container image (repository + tag) and the model to serve.
      repository: "vllm/vllm-openai"
      tag: "latest"
      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
      replicaCount: 1

      # Resource requests for this model entry.
      requestCPU: 10
      requestMemory: "16Gi"
      requestGPU: 1

      # Persistent volume claim size and access mode.
      pvcStorage: "50Gi"
      pvcAccessMode:
        - ReadWriteOnce

      # Engine settings for this model.
      vllmConfig:
        enableChunkedPrefill: false
        enablePrefixCaching: false
        maxModelLen: 4096
        dtype: "bfloat16"
        # "0.8" stays quoted so it remains a string argument, like the flags around it.
        extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]

      # Placeholder: replace with a real Hugging Face token at deploy time.
      # Do not commit the real token to version control.
      hf_token: "<YOUR HF TOKEN>"