examples/demos/yelp_demo/ks_app/components/t2tgpu.jsonnet

198 lines
5.1 KiB
Plaintext

// Read the environment and the "t2tgpu" component parameters from the
// `__ksonnet/environments` and `__ksonnet/params` external variables.
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2tgpu"];
local k = import "k.libsonnet";
// NOTE(review): `name` is not referenced anywhere else in this file.
local name = params.name;
// Namespace from the environment; used as the TFJob's metadata.namespace.
local namespace = env.namespace;
// Effective settings for this component: the defaults below, with any field
// that is also present in `params` taking its value from `params`.
local updatedParams =
  local defaults = {
    jobName: "t2tgpu",
    cloud: "gke",

    // Input data, user module directory and output location.
    dataDir: "gs://kubeflow-demo-base/featurization/yelp-data",
    usrDir: "./yelp_sentiment",
    outputGCSPath: "gs://kubeflow-demo-base/kubeflow-demo-base-demo/GPU/training/yelp-model",

    // Problem / model selection.
    problem: "yelp_sentiment",
    model: "transformer_encoder",
    hparams: "transformer_yelp_sentiment",
    // NOTE(review): no other line in this file reads `hparamsSet`; the
    // `--hparams_set` flag is built from `hparams`.
    hparamsSet: "transformer_yelp_sentiment",
    sync: "0",

    // Container images.
    gpuImage: "gcr.io/kubeflow-demo-base/kubeflow-yelp-demo-gpu:latest",
    cpuImage: "gcr.io/kubeflow-demo-base/kubeflow-yelp-demo-cpu:latest",

    // Step counts.
    trainSteps: 1000,
    evalSteps: 10,

    // Replica counts and GPU settings.
    masters: 1,
    workers: 3,
    ps: 1,
    workerGpu: 1,
    psGpu: 0,
  };
  defaults + params;
// Launcher invocation followed by the flags that every replica command
// (ps, worker, master) starts with.
local baseCommand =
  local launcher = [
    "bash",
    "/home/jovyan/yelp_sentiment/worker_launcher.sh",
  ];
  // Each entry is [flag prefix, value]; the two are concatenated below.
  local sharedFlags = [
    ["--train_steps=", updatedParams.trainSteps],
    // NOTE(review): this reads `hparams`, not `hparamsSet` — confirm which
    // parameter is meant to drive --hparams_set.
    ["--hparams_set=", updatedParams.hparams],
    ["--model=", updatedParams.model],
    ["--problem=", updatedParams.problem],
    ["--t2t_usr_dir=", updatedParams.usrDir],
    ["--data_dir=", updatedParams.dataDir],
    ["--output_dir=", updatedParams.outputGCSPath],
  ];
  launcher + [flag[0] + flag[1] for flag in sharedFlags];
// Command for the Ps replicas: the shared flags plus the run_std_server schedule.
local psCommand = baseCommand + ["--schedule=run_std_server"];
// Value reported through --worker_replicas: workers plus masters.
local totalWorkerReplicas = updatedParams.workers + updatedParams.masters;

// Flags common to the Worker and Master commands, appended to baseCommand.
local workerBaseCommand =
  // Each entry is [flag prefix, value]; the two are concatenated below.
  local trainFlags = [
    ["--sync=", updatedParams.sync],
    ["--ps_gpu=", updatedParams.psGpu],
    ["--worker_gpu=", updatedParams.workerGpu],
    ["--worker_replicas=", totalWorkerReplicas],
    ["--ps_replicas=", updatedParams.ps],
    ["--eval_steps=", updatedParams.evalSteps],
  ];
  baseCommand + ["--schedule=train"] + [pair[0] + pair[1] for pair in trainFlags];

// Worker and Master differ only in the --worker_job value.
local workerCommand = workerBaseCommand + ["--worker_job=/job:worker"];
local masterCommand = workerBaseCommand + ["--worker_job=/job:master"];
// Container resources block that sets the "nvidia.com/gpu" limit to `workerGpu`.
local gpuResources =
  local gpuCount = updatedParams.workerGpu;
  { limits: { "nvidia.com/gpu": gpuCount } };
// Cloud identifier as a string; compared against "gke" when building the TFJob.
local cloud = std.toString(updatedParams.cloud);

// Environment variables set on every container.
local baseEnv =
  local pythonPath = { name: "PYTHONPATH", value: "/home/jovyan" };
  [pythonPath];

// Environment used when cloud != "gke": baseEnv plus the path of the
// credentials key file.
local nonGkeEnv =
  local credentialsPath = {
    name: "GOOGLE_APPLICATION_CREDENTIALS",
    value: "/secret/gcp-credentials/key.json",
  };
  baseEnv + [credentialsPath];
// Pod-level additions applied only when cloud != "gke".

// Volume backed by the "gcp-credentials" secret.
local nonGkeVolumes = [
  { name: "gcp-credentials", secret: { secretName: "gcp-credentials" } },
];

// Image pull secret attached to the pod spec.
local nonGkeImagePullSecrets = [
  { name: "gcp-registry-credentials" },
];

// Mounts the "gcp-credentials" volume into the container.
local nonGkeVolumeMounts = [
  { mountPath: "/secret/gcp-credentials", name: "gcp-credentials" },
];
// TFJob manifest with Master, Worker and Ps replica specs.
//
// The three replica specs share one pod template and differ only in replica
// count, command, image and whether GPU resources are requested, so they are
// produced by a single local helper instead of three copies.
local tfjob =
  // When not on GKE, the pods additionally get the credentials env var,
  // the secret volume + mount, and the image pull secret.
  local offGke = cloud != "gke";

  // Master and Worker run the GPU image with GPU limits when workerGpu > 0,
  // otherwise the CPU image without a resources block.
  local useGpu = updatedParams.workerGpu > 0;
  local trainerImage = if useGpu then updatedParams.gpuImage else updatedParams.cpuImage;

  // Builds one entry of tfReplicaSpecs.
  //   replicas: number of pods for this replica type
  //   command:  container command (array of strings)
  //   image:    container image
  //   gpu:      when true, attach gpuResources to the container
  local replicaSpec(replicas, command, image, gpu) = {
    replicas: replicas,
    template: {
      spec: {
        containers: [
          {
            command: command,
            env: if offGke then nonGkeEnv else baseEnv,
            image: image,
            name: "tensorflow",
            // A computed field name that evaluates to null omits the field.
            [if gpu then "resources"]: gpuResources,
            [if offGke then "volumeMounts"]: nonGkeVolumeMounts,
          },
        ],
        [if offGke then "imagePullSecrets"]: nonGkeImagePullSecrets,
        restartPolicy: "OnFailure",
        [if offGke then "volumes"]: nonGkeVolumes,
      },
    },
  };

  {
    apiVersion: "kubeflow.org/v1alpha2",
    kind: "TFJob",
    metadata: {
      name: updatedParams.jobName,
      namespace: namespace,
    },
    spec: {
      tfReplicaSpecs: {
        // NOTE(review): the Master replica count is fixed at 1 while
        // --worker_replicas is computed from `updatedParams.masters`;
        // confirm `masters` is never meant to differ from 1.
        Master: replicaSpec(1, masterCommand, trainerImage, useGpu),
        Worker: replicaSpec(updatedParams.workers, workerCommand, trainerImage, useGpu),
        // NOTE(review): Ps always uses the CPU image and never requests
        // GPUs, even though `psGpu` is forwarded as --ps_gpu to the
        // trainers; confirm this is intended when psGpu > 0.
        Ps: replicaSpec(updatedParams.ps, psCommand, updatedParams.cpuImage, false),
      },
    },
  };
// The component's output: a list wrapping the single TFJob.
k.core.v1.list.new([tfjob])