examples/AI/vllm-deployment/vllm-deployment.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        # Labels for better functionality within GKE.
        # ai.gke.io/model: gemma-3-1b-it
        # ai.gke.io/inference-server: vllm
        # examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - name: inference-server
        # vllm/vllm-openai:v0.10.0
        image: vllm/vllm-openai@sha256:05a31dc4185b042e91f4d2183689ac8a87bd845713d5c3f987563c5899878271
        resources:
          requests:
            cpu: "2"
            memory: "10Gi"
            ephemeral-storage: "10Gi"
            nvidia.com/gpu: "1"
          limits:
            cpu: "2"
            memory: "10Gi"
            ephemeral-storage: "10Gi"
            nvidia.com/gpu: "1"
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - --model=$(MODEL_ID)
        - --tensor-parallel-size=1
        - --host=0.0.0.0
        - --port=8080
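        # Single-GPU setup: --tensor-parallel-size=1 keeps the whole model on
        # one device, and the OpenAI-compatible API listens on port 8080
        # inside the pod.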
        env:
        # 1 billion parameter model (the smallest Gemma model)
        - name: MODEL_ID
          value: google/gemma-3-1b-it
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_token
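        # Assumes a Secret named hf-secret with key hf_token already exists in
        # this namespace and holds a Hugging Face token that can download the
        # gated google/gemma-3-1b-it weights, for example:
        #   kubectl create secret generic hf-secret --from-literal=hf_token=<your-token>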
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
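      # The dshm volume backs /dev/shm with node memory; the container runtime
      # default of 64Mi of shared memory is typically too small for PyTorch's
      # inter-process communication (e.g. tensor-parallel workers).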
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      # Node selectors are the main difference among the cloud providers,
      # making sure vLLM pods land on Nodes with the correct GPU. The
      # following are node selector examples for three cloud providers.
      #
      # - GKE
      #   nodeSelector:
      #     cloud.google.com/gke-accelerator: nvidia-l4
      #     cloud.google.com/gke-gpu-driver-version: default
      #
      # - EKS
      #   nodeSelector:
      #     node.kubernetes.io/instance-type: p4d.24xlarge
      #
      # - AKS
      #   nodeSelector:
      #     agentpiscasi.com/gpu: "true" # Common label for AKS GPU nodes
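#
# Once the Deployment is Ready, one way to smoke-test the server (no Service
# required) is to port-forward to the pod and call the OpenAI-compatible API:
#
#   kubectl port-forward deployment/vllm-gemma-deployment 8080:8080
#   curl http://localhost:8080/v1/models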