mirror of https://github.com/kubeflow/examples.git
Remove v1alpha1 TFJobs from the GH issue summarization example. (#264)
* We should be using v1alpha2 exclusively now.
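For context on that switch (illustrative, not part of this commit): the v1alpha1 manifests removed below describe replicas as a replicaSpecs list whose entries are tagged with tfReplicaType, plus a terminationPolicy block naming the chief. In v1alpha2 the replicas are instead keyed by role under spec.tfReplicaSpecs (Worker, PS, and a chief role spelled Master or Chief depending on the operator version), restartPolicy moves onto each replica spec, and no terminationPolicy is needed. A minimal jsonnet sketch of the v1alpha2 shape; the name, namespace, and image below are placeholders:

local params = {
  namespace: "kubeflow",                    // placeholder namespace
  image: "gcr.io/example/trainer:latest",   // placeholder training image
};

{
  apiVersion: "kubeflow.org/v1alpha2",
  kind: "TFJob",
  metadata: {
    name: "example-tfjob",
    namespace: params.namespace,
  },
  spec: {
    tfReplicaSpecs: {
      // Replica roles are map keys rather than list entries
      // tagged with tfReplicaType.
      Master: {
        replicas: 1,
        restartPolicy: "OnFailure",
        template: {
          spec: {
            containers: [
              {
                name: "tensorflow",   // the operator looks for this container name
                image: params.image,
              },
            ],
          },
        },
      },
    },
  },
}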
parent 4ea761630d
commit 90044d24c4
@@ -24,10 +24,6 @@
       namespace: "null",
       replicas: 2,
     },
-    tensor2tensor: {
-      cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180428-9da5cb7-dirty-4e1f35",
-      namespace: "null",
-    },
     tensorboard: {
       image: "tensorflow/tensorflow:1.7.0",
       // logDir needs to be overwritten based on where the data is
@@ -35,22 +31,6 @@
       logDir: "",
       name: "gh",
     },
-    tfjob: {
-      image: "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888",
-      input_data_gcs_bucket: "kubeflow-examples",
-      input_data_gcs_path: "github-issue-summarization-data/github-issues.zip",
-      namespace: "null",
-      output_model_gcs_bucket: "kubeflow-examples",
-      output_model_gcs_path: "github-issue-summarization-data/output_model.h5",
-      sample_size: "100000",
-    },
-    "tfjob-pvc": {
-      image: "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888",
-      input_data: "/data/github_issues.csv",
-      namespace: "null",
-      output_model: "/data/model.h5",
-      sample_size: "2000000",
-    },
     ui: {
       namespace: "null",
       githubToken: "",
@@ -1,7 +0,0 @@
-local env = std.extVar("__ksonnet/environments");
-local params = std.extVar("__ksonnet/params").components.tensor2tensor;
-local k = import "k.libsonnet";
-
-local tensor2tensor = import "tensor2tensor.libsonnet";
-
-std.prune(k.core.v1.list.new([tensor2tensor.parts(params, env).job]))
@@ -1,150 +0,0 @@
-{
-  parts(params, env):: {
-    // Define some defaults.
-    local updatedParams = {
-      sync: "0",
-
-      dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
-      usrDir: "./github",
-      problem: "github_issue_summarization_problem",
-
-      model: "transformer_encoder",
-      hparams: "transformer_github_issues",
-      hparamsSet: "transformer_github_issues",
-      outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
-
-      gpuImage: null,
-      cpuImage: null,
-
-      trainSteps: 20000,
-      evalSteps: 10,
-
-      psGpu: 0,
-      workerGpu: 0,
-
-      workers: 3,
-      masters: 1,
-      ps: 1,
-
-      jobName: "tensor2tensor",
-    } + params,
-
-    local containerEnv = [
-      {
-        name: "PYTHONPATH",
-        value: "/home/jovyan",
-      }
-    ],
-    local baseCommand = [
-      "/home/jovyan/github/t2t_launcher.sh",
-      "--train_steps=" + updatedParams.trainSteps,
-      "--hparams_set=" + updatedParams.hparams,
-      "--model=" + updatedParams.model,
-      "--problem=" + updatedParams.problem,
-      "--t2t_usr_dir=" + updatedParams.usrDir,
-      "--data_dir=" + updatedParams.dataDir,
-      "--output_dir=" + updatedParams.outputGCSPath,
-    ],
-    local psCommand = baseCommand + [
-      "--schedule=run_std_server",
-    ],
-    local totalWorkerReplicas = updatedParams.workers + updatedParams.masters,
-    local workerBaseCommand = baseCommand + [
-      "--schedule=train",
-      "--sync=" + updatedParams.sync,
-      "--ps_gpu=" + updatedParams.psGpu,
-      "--worker_gpu=" + updatedParams.workerGpu,
-      // We explicitly want to add worker and
-      "--worker_replicas=" + totalWorkerReplicas,
-      "--ps_replicas=" + updatedParams.ps,
-      "--eval_steps=" + updatedParams.evalSteps,
-    ],
-    local workerCommand = workerBaseCommand + [
-      "--worker_job=/job:worker",
-    ],
-    local masterCommand = workerBaseCommand + [
-      "--worker_job=/job:master",
-    ],
-    local namespace = env.namespace,
-
-    job:: {
-      apiVersion: "kubeflow.org/v1alpha1",
-      kind: "TFJob",
-      metadata: {
-        name: updatedParams.jobName,
-        namespace: env.namespace,
-      },
-      spec: {
-        replicaSpecs: [
-          {
-            replicas: 1,
-            template: {
-              spec: {
-                containers: [
-                  {
-                    image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
-                    name: "tensorflow",
-                    command: masterCommand,
-                    env: containerEnv,
-                    [if updatedParams.workerGpu > 0 then "resources"]: {
-                      limits: {
-                        "nvidia.com/gpu": updatedParams.workerGpu,
-                      },
-                    },
-                  },
-                ],
-                restartPolicy: "OnFailure",
-              },
-            },
-            tfReplicaType: "MASTER",
-          },
-          {
-            replicas: updatedParams.workers,
-            template: {
-              spec: {
-                containers: [
-                  {
-                    image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
-                    name: "tensorflow",
-                    command: workerCommand,
-                    env: containerEnv,
-                    [if updatedParams.workerGpu > 0 then "resources"]: {
-                      limits: {
-                        "nvidia.com/gpu": updatedParams.workerGpu,
-                      },
-                    },
-                  },
-                ],
-                restartPolicy: "OnFailure",
-              },
-            },
-            tfReplicaType: "WORKER",
-          },
-          {
-            replicas: updatedParams.ps,
-            template: {
-              spec: {
-                containers: [
-                  {
-                    image: updatedParams.cpuImage,
-                    name: "tensorflow",
-                    command: psCommand,
-                    env: containerEnv,
-                  },
-                ],
-                restartPolicy: "OnFailure",
-              },
-            },
-            tfReplicaType: "PS",
-          },
-        ],
-        terminationPolicy: {
-          chief: {
-            replicaIndex: 0,
-            replicaName: "MASTER",
-          },
-        },
-      },
-    }, // job
-  }, //parts
-}
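Under v1alpha2, the three role-tagged replicaSpecs entries in the file above would instead become named keys of tfReplicaSpecs. A rough sketch of that mapping (replica-type keys assumed to be Master, Worker, and PS), written as a small hypothetical helper that takes the updatedParams, command, and containerEnv locals from the deleted file as arguments; the conditional GPU resource limits are omitted for brevity:

{
  // Hypothetical helper: one v1alpha2 replica spec per role.
  local replica(replicas, image, command, containerEnv) = {
    replicas: replicas,
    restartPolicy: "OnFailure",
    template: {
      spec: {
        containers: [
          {
            name: "tensorflow",
            image: image,
            command: command,
            env: containerEnv,
          },
        ],
      },
    },
  },

  v1alpha2Spec(updatedParams, masterCommand, workerCommand, psCommand, containerEnv):: {
    tfReplicaSpecs: {
      Master: replica(1, updatedParams.cpuImage, masterCommand, containerEnv),
      Worker: replica(updatedParams.workers, updatedParams.cpuImage, workerCommand, containerEnv),
      PS: replica(updatedParams.ps, updatedParams.cpuImage, psCommand, containerEnv),
    },
    // No terminationPolicy: the chief role is inferred from the replica type.
  },
}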
@@ -1,61 +0,0 @@
-// Train the model reading & writing the data from a PVC.
-local env = std.extVar("__ksonnet/environments");
-local params = std.extVar("__ksonnet/params").components["tfjob-pvc"];
-local k = import "k.libsonnet";
-
-local tfjob = {
-  apiVersion: "kubeflow.org/v1alpha1",
-  kind: "TFJob",
-  metadata: {
-    name: "tf-job-issue-summarization-pvc",
-    namespace: env.namespace,
-  },
-  spec: {
-    replicaSpecs: [
-      {
-        replicas: 1,
-        template: {
-          spec: {
-            containers: [
-              {
-                image: params.image,
-                name: "tensorflow",
-                volumeMounts: [
-                  {
-                    name: "data",
-                    mountPath: "/data",
-                  },
-                ],
-                command: [
-                  "python",
-                  "/workdir/train.py",
-                  "--sample_size=" + std.toString(params.sample_size),
-                  "--input_data=" + params.input_data,
-                  "--output_model=" + params.output_model,
-                ],
-              },
-            ],
-            volumes: [
-              {
-                name: "data",
-                persistentVolumeClaim: {
-                  claimName: "data-pvc",
-                },
-              },
-            ],
-            restartPolicy: "OnFailure",
-          },
-        },
-        tfReplicaType: "MASTER",
-      },
-    ],
-    terminationPolicy: {
-      chief: {
-        replicaIndex: 0,
-        replicaName: "MASTER",
-      },
-    },
-  },
-};
-
-std.prune(k.core.v1.list.new([tfjob]))
@@ -1,7 +0,0 @@
-local env = std.extVar("__ksonnet/environments");
-local params = std.extVar("__ksonnet/params").components.tfjob;
-local k = import "k.libsonnet";
-
-local tfjob = import "tfjob.libsonnet";
-
-std.prune(k.core.v1.list.new([tfjob.parts(params)]))
@@ -1,67 +0,0 @@
-{
-  parts(params):: {
-    apiVersion: "kubeflow.org/v1alpha1",
-    kind: "TFJob",
-    metadata: {
-      name: "tf-job-issue-summarization",
-      namespace: params.namespace,
-    },
-    spec: {
-      replicaSpecs: [
-        {
-          replicas: 1,
-          template: {
-            spec: {
-              containers: [
-                {
-                  image: params.image,
-                  name: "tensorflow",
-                  volumeMounts: [
-                    {
-                      name: "gcp-credentials",
-                      mountPath: "/secret/gcp-credentials",
-                      readOnly: true,
-                    },
-                  ],
-                  command: [
-                    "python",
-                  ],
-                  args: [
-                    "/workdir/train.py",
-                    "--sample_size=" + std.toString(params.sample_size),
-                    "--input_data_gcs_bucket=" + params.input_data_gcs_bucket,
-                    "--input_data_gcs_path=" + params.input_data_gcs_path,
-                    "--output_model_gcs_bucket=" + params.output_model_gcs_bucket,
-                    "--output_model_gcs_path=" + params.output_model_gcs_path,
-                  ],
-                  env: [
-                    {
-                      name: "GOOGLE_APPLICATION_CREDENTIALS",
-                      value: "/secret/gcp-credentials/key.json",
-                    },
-                  ],
-                },
-              ],
-              volumes: [
-                {
-                  name: "gcp-credentials",
-                  secret: {
-                    secretName: "gcp-credentials",
-                  },
-                },
-              ],
-              restartPolicy: "OnFailure",
-            },
-          },
-          tfReplicaType: "MASTER",
-        },
-      ],
-      terminationPolicy: {
-        chief: {
-          replicaIndex: 0,
-          replicaName: "MASTER",
-        },
-      },
-    },
-  },
-}
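For comparison, a rough v1alpha2 counterpart of the GCS-based component deleted above might look like the following. This is only a sketch reusing the same params fields, not the manifest that actually replaced it; with a single Master replica the chief is implicit, so there is no terminationPolicy or tfReplicaType:

{
  parts(params):: {
    apiVersion: "kubeflow.org/v1alpha2",
    kind: "TFJob",
    metadata: {
      name: "tf-job-issue-summarization",
      namespace: params.namespace,
    },
    spec: {
      tfReplicaSpecs: {
        Master: {
          replicas: 1,
          restartPolicy: "OnFailure",
          template: {
            spec: {
              containers: [
                {
                  name: "tensorflow",
                  image: params.image,
                  command: ["python"],
                  args: [
                    "/workdir/train.py",
                    "--sample_size=" + std.toString(params.sample_size),
                    "--input_data_gcs_bucket=" + params.input_data_gcs_bucket,
                    "--input_data_gcs_path=" + params.input_data_gcs_path,
                    "--output_model_gcs_bucket=" + params.output_model_gcs_bucket,
                    "--output_model_gcs_path=" + params.output_model_gcs_path,
                  ],
                  env: [
                    {
                      name: "GOOGLE_APPLICATION_CREDENTIALS",
                      value: "/secret/gcp-credentials/key.json",
                    },
                  ],
                  volumeMounts: [
                    {
                      name: "gcp-credentials",
                      mountPath: "/secret/gcp-credentials",
                      readOnly: true,
                    },
                  ],
                },
              ],
              volumes: [
                {
                  name: "gcp-credentials",
                  secret: { secretName: "gcp-credentials" },
                },
              ],
            },
          },
        },
      },
    },
  },
}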