GIS E2E test verifying that the TFJob runs successfully (#456)

* Create a test for submitting the TFJob for the GitHub issue summarization example.

* This test needs to be run manually right now. In a follow on PR we will
  integrate it into CI.

* We use the image built from Dockerfile.estimator because that is the image
  we are running train_test.py in.

  * Note: The current version of the code now requires Python3 (I think this
    is due to an earlier PR which refactored the code into a shared
    implementation for using TF estimator and not TF estimator).

* Create a TFJob component for TFJob v1beta1; this is the version
  in KF 0.4.

TFJob component
  * Upgrade to v1beta1 to work with KF 0.4
  * Update command line arguments to match the versions in the current code
      * input & output are now single parameters rather than separate parameters
        for bucket and name

  * change default input to a CSV file because the current version of the
    code doesn't handle unzipping it.

* Use ks_util from kubeflow/testing

* Address comments.
This commit is contained in:
Jeremy Lewi 2019-01-08 15:06:49 -08:00 committed by Kubernetes Prow Robot
parent 959d072e68
commit 1cc4550b7d
39 changed files with 191 additions and 343404 deletions

View File

@ -1,8 +0,0 @@
// Environment entry point generated by ksonnet.
// Extends the app-wide base with environment-specific overrides.
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base + {
  // Insert user-specified overrides here. For example if a component is
  // named "nginx-deployment", you might have something like:
  //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,17 +0,0 @@
// Environment-level ksonnet parameter overrides.
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";

// Override the tfjob-v1alpha2 component's output location for this
// environment; all other component parameters keep their app defaults.
local envParams = params + {
  components+: {
    "tfjob-v1alpha2"+: {
      output_model_gcs_bucket: "kubecon-gh-demo",
      output_model_gcs_path: "gh-demo/20181008/output",
    },
  },
};

// Merge the environment globals into every component's parameters.
{
  components: {
    [component]: envParams.components[component] + globals
    for component in std.objectFields(envParams.components)
  },
}

View File

@ -1,8 +0,0 @@
// Environment entry point generated by ksonnet.
// Extends the app-wide base with environment-specific overrides.
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base + {
  // Insert user-specified overrides here. For example if a component is
  // named "nginx-deployment", you might have something like:
  //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,19 +0,0 @@
// Environment-level ksonnet parameter overrides.
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";

// Pin the tfjob-v1beta1 component to a short training run in the
// kubeflow namespace for this environment.
local envParams = params + {
  components+: {
    "tfjob-v1beta1"+: {
      name: "jlewi-gis-test",
      namespace: "kubeflow",
      num_epochs: 1,
      sample_size: 10,
    },
  },
};

// Merge the environment globals into every component's parameters.
{
  components: {
    [component]: envParams.components[component] + globals
    for component in std.objectFields(envParams.components)
  },
}

View File

@ -1,3 +0,0 @@
// Global ksonnet parameters shared by every component in this environment.
{
  // GCS location where training output is written.
  outputGCSPath: "gs://cloud-ml-dev_jlewi/gh-t2t-out/temp",
}

View File

@ -1,8 +0,0 @@
// Environment entry point generated by ksonnet.
// Extends the app-wide base with environment-specific overrides.
// NOTE: `base { ... }` is jsonnet object-application, equivalent to
// `base + { ... }` here.
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base {
  // Insert user-specified overrides here. For example if a component is
  // named "nginx-deployment", you might have something like:
  //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,12 +0,0 @@
// Environment-level ksonnet parameters; this environment defines no
// component overrides of its own.
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";

local envParams = params {
  components+: {},
};

// Merge the environment globals into every component's parameters.
{
  components: {
    [c]: envParams.components[c] + globals
    for c in std.objectFields(envParams.components)
  },
}

View File

@ -1,8 +0,0 @@
// Environment entry point generated by ksonnet.
// Extends the app-wide base with environment-specific overrides.
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base + {
  // Insert user-specified overrides here. For example if a component is
  // named "nginx-deployment", you might have something like:
  //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,23 +0,0 @@
// Environment-level ksonnet parameter overrides.
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";

// Overrides for both TFJob component versions: the alpha component runs
// in the jlewi namespace; the beta component runs a short training job
// in the kubeflow namespace.
local envParams = params + {
  components+: {
    "tfjob-v1alpha2"+: {
      name: "jlewi-gis-test",
      namespace: "jlewi",
    },
    "tfjob-v1beta1"+: {
      name: "jlewi-gis-test",
      namespace: "kubeflow",
      num_epochs: 1,
      sample_size: 10,
    },
  },
};

// Merge the environment globals into every component's parameters.
{
  components: {
    [component]: envParams.components[component] + globals
    for component in std.objectFields(envParams.components)
  },
}

View File

@ -1,8 +0,0 @@
// Environment entry point generated by ksonnet.
// Extends the app-wide base with environment-specific overrides.
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base + {
  // Insert user-specified overrides here. For example if a component is
  // named "nginx-deployment", you might have something like:
  //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,19 +0,0 @@
// Environment-level ksonnet parameter overrides.
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";

// Short training run for the tfjob-v1beta1 component in the kubeflow
// namespace.
local envParams = params + {
  components+: {
    "tfjob-v1beta1"+: {
      name: "jlewi-gis-test",
      namespace: "kubeflow",
      num_epochs: 1,
      sample_size: 10,
    },
  },
};

// Merge the environment globals into every component's parameters.
{
  components: {
    [name]: envParams.components[name] + globals
    for name in std.objectFields(envParams.components)
  },
}

View File

@ -1,8 +0,0 @@
// Environment entry point generated by ksonnet.
// Extends the app-wide base with environment-specific overrides.
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base + {
  // Insert user-specified overrides here. For example if a component is
  // named "nginx-deployment", you might have something like:
  //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,17 +0,0 @@
// Environment-level ksonnet parameter overrides.
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";

// Run the tfjob-v1beta1 component in the jlewi namespace.
local envParams = params + {
  components+: {
    "tfjob-v1beta1"+: {
      name: "jlewi-gis-test",
      namespace: "jlewi",
    },
  },
};

// Merge the environment globals into every component's parameters.
{
  components: {
    [component]: envParams.components[component] + globals
    for component in std.objectFields(envParams.components)
  },
}

View File

@ -1,8 +0,0 @@
// Environment entry point generated by ksonnet.
// Extends the app-wide base with environment-specific overrides.
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base + {
  // Insert user-specified overrides here. For example if a component is
  // named "nginx-deployment", you might have something like:
  //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,17 +0,0 @@
// Environment-level ksonnet parameter overrides.
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";

// Run the tfjob-v1alpha2 component in the jlewi namespace.
local envParams = params + {
  components+: {
    "tfjob-v1alpha2"+: {
      name: "jlewi-gis-test",
      namespace: "jlewi",
    },
  },
};

// Merge the environment globals into every component's parameters.
{
  components: {
    [c]: envParams.components[c] + globals
    for c in std.objectFields(envParams.components)
  },
}

View File

@ -1,8 +0,0 @@
// Environment entry point generated by ksonnet.
// Extends the app-wide base with environment-specific overrides.
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base + {
  // Insert user-specified overrides here. For example if a component is
  // named "nginx-deployment", you might have something like:
  //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,19 +0,0 @@
// Environment-level ksonnet parameter overrides.
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";

// Short training run for the tfjob-v1beta1 component in the kubeflow
// namespace.
local envParams = params + {
  components+: {
    "tfjob-v1beta1"+: {
      name: "jlewi-gis-test",
      namespace: "kubeflow",
      num_epochs: 1,
      sample_size: 10,
    },
  },
};

// Merge the environment globals into every component's parameters.
{
  components: {
    [name]: envParams.components[name] + globals
    for name in std.objectFields(envParams.components)
  },
}

View File

@ -1,129 +0,0 @@
// k.libsonnet: extensions over the generated k8s.libsonnet API bindings.
// Adds container-mapping helpers to every workload kind so callers can
// transform the containers in a pod template, e.g.
//   deployment.mapContainers(function(c) c + { env+: [...] })
local k8s = import 'k8s.libsonnet';

// Helper mixins shared by all workload kinds below.
local fn = {
  // Return a mixin that applies f to every container in the pod template.
  mapContainers(f):: {
    local podContainers = super.spec.template.spec.containers,
    spec+: {
      template+: {
        spec+: {
          containers: std.map(f, podContainers),
        },
      },
    },
  },
  // Like mapContainers, but only rewrites containers whose name appears
  // in `names` (a single string or an array of strings).
  mapContainersWithName(names, f)::
    local nameSet = if std.type(names) == 'array' then std.set(names) else std.set([names]);
    local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
    self.mapContainers(function(c) if std.objectHas(c, 'name') && inNameSet(c.name) then f(c) else c),
};

// Mix the helpers into every API group/version/kind that has a pod template.
k8s + {
  apps:: k8s.apps + {
    v1:: k8s.apps.v1 + {
      daemonSet:: k8s.apps.v1.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.apps.v1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.apps.v1.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta1:: k8s.apps.v1beta1 + {
      deployment:: k8s.apps.v1beta1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1beta1.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta2:: k8s.apps.v1beta2 + {
      daemonSet:: k8s.apps.v1beta2.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.apps.v1beta2.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.apps.v1beta2.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1beta2.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  batch:: k8s.batch + {
    v1:: k8s.batch.v1 + {
      job:: k8s.batch.v1.job + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta1:: k8s.batch.v1beta1 + {
      cronJob:: k8s.batch.v1beta1.cronJob + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v2alpha1:: k8s.batch.v2alpha1 + {
      cronJob:: k8s.batch.v2alpha1.cronJob + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  core:: k8s.core + {
    v1:: k8s.core.v1 + {
      // v1 List helper: wraps a single object or an array of objects.
      list:: {
        new(items):: {
          apiVersion: 'v1',
        } + {
          kind: 'List',
        } + self.items(items),
        items(items):: if std.type(items) == 'array' then { items+: items } else { items+: [items] },
      },
      pod:: k8s.core.v1.pod + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      podTemplate:: k8s.core.v1.podTemplate + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicationController:: k8s.core.v1.replicationController + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  extensions:: k8s.extensions + {
    v1beta1:: k8s.extensions.v1beta1 + {
      daemonSet:: k8s.extensions.v1beta1.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.extensions.v1beta1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.extensions.v1beta1.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
}

View File

@ -1,129 +0,0 @@
// k.libsonnet: extensions over the generated k8s.libsonnet API bindings.
// Adds container-mapping helpers to every workload kind so callers can
// transform the containers in a pod template, e.g.
//   deployment.mapContainers(function(c) c + { env+: [...] })
local k8s = import 'k8s.libsonnet';

// Helper mixins shared by all workload kinds below.
local fn = {
  // Return a mixin that applies f to every container in the pod template.
  mapContainers(f):: {
    local podContainers = super.spec.template.spec.containers,
    spec+: {
      template+: {
        spec+: {
          containers: std.map(f, podContainers),
        },
      },
    },
  },
  // Like mapContainers, but only rewrites containers whose name appears
  // in `names` (a single string or an array of strings).
  mapContainersWithName(names, f)::
    local nameSet = if std.type(names) == 'array' then std.set(names) else std.set([names]);
    local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
    self.mapContainers(function(c) if std.objectHas(c, 'name') && inNameSet(c.name) then f(c) else c),
};

// Mix the helpers into every API group/version/kind that has a pod template.
k8s + {
  apps:: k8s.apps + {
    v1:: k8s.apps.v1 + {
      daemonSet:: k8s.apps.v1.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.apps.v1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.apps.v1.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta1:: k8s.apps.v1beta1 + {
      deployment:: k8s.apps.v1beta1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1beta1.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta2:: k8s.apps.v1beta2 + {
      daemonSet:: k8s.apps.v1beta2.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.apps.v1beta2.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.apps.v1beta2.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1beta2.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  batch:: k8s.batch + {
    v1:: k8s.batch.v1 + {
      job:: k8s.batch.v1.job + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta1:: k8s.batch.v1beta1 + {
      cronJob:: k8s.batch.v1beta1.cronJob + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v2alpha1:: k8s.batch.v2alpha1 + {
      cronJob:: k8s.batch.v2alpha1.cronJob + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  core:: k8s.core + {
    v1:: k8s.core.v1 + {
      // v1 List helper: wraps a single object or an array of objects.
      list:: {
        new(items):: {
          apiVersion: 'v1',
        } + {
          kind: 'List',
        } + self.items(items),
        items(items):: if std.type(items) == 'array' then { items+: items } else { items+: [items] },
      },
      pod:: k8s.core.v1.pod + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      podTemplate:: k8s.core.v1.podTemplate + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicationController:: k8s.core.v1.replicationController + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  extensions:: k8s.extensions + {
    v1beta1:: k8s.extensions.v1beta1 + {
      daemonSet:: k8s.extensions.v1beta1.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.extensions.v1beta1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.extensions.v1beta1.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,129 +0,0 @@
// k.libsonnet: extensions over the generated k8s.libsonnet API bindings.
// Adds container-mapping helpers to every workload kind so callers can
// transform the containers in a pod template, e.g.
//   deployment.mapContainers(function(c) c + { env+: [...] })
local k8s = import 'k8s.libsonnet';

// Helper mixins shared by all workload kinds below.
local fn = {
  // Return a mixin that applies f to every container in the pod template.
  mapContainers(f):: {
    local podContainers = super.spec.template.spec.containers,
    spec+: {
      template+: {
        spec+: {
          containers: std.map(f, podContainers),
        },
      },
    },
  },
  // Like mapContainers, but only rewrites containers whose name appears
  // in `names` (a single string or an array of strings).
  mapContainersWithName(names, f)::
    local nameSet = if std.type(names) == 'array' then std.set(names) else std.set([names]);
    local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
    self.mapContainers(function(c) if std.objectHas(c, 'name') && inNameSet(c.name) then f(c) else c),
};

// Mix the helpers into every API group/version/kind that has a pod template.
k8s + {
  apps:: k8s.apps + {
    v1:: k8s.apps.v1 + {
      daemonSet:: k8s.apps.v1.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.apps.v1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.apps.v1.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta1:: k8s.apps.v1beta1 + {
      deployment:: k8s.apps.v1beta1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1beta1.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta2:: k8s.apps.v1beta2 + {
      daemonSet:: k8s.apps.v1beta2.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.apps.v1beta2.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.apps.v1beta2.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      statefulSet:: k8s.apps.v1beta2.statefulSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  batch:: k8s.batch + {
    v1:: k8s.batch.v1 + {
      job:: k8s.batch.v1.job + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v1beta1:: k8s.batch.v1beta1 + {
      cronJob:: k8s.batch.v1beta1.cronJob + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
    v2alpha1:: k8s.batch.v2alpha1 + {
      cronJob:: k8s.batch.v2alpha1.cronJob + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  core:: k8s.core + {
    v1:: k8s.core.v1 + {
      // v1 List helper: wraps a single object or an array of objects.
      list:: {
        new(items):: {
          apiVersion: 'v1',
        } + {
          kind: 'List',
        } + self.items(items),
        items(items):: if std.type(items) == 'array' then { items+: items } else { items+: [items] },
      },
      pod:: k8s.core.v1.pod + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      podTemplate:: k8s.core.v1.podTemplate + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicationController:: k8s.core.v1.replicationController + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
  extensions:: k8s.extensions + {
    v1beta1:: k8s.extensions.v1beta1 + {
      daemonSet:: k8s.extensions.v1beta1.daemonSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      deployment:: k8s.extensions.v1beta1.deployment + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
      replicaSet:: k8s.extensions.v1beta1.replicaSet + {
        mapContainers(f):: fn.mapContainers(f),
        mapContainersWithName(names, f):: fn.mapContainersWithName(names, f),
      },
    },
  },
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -25,12 +25,11 @@
},
"tfjob": {
name: "tfjob-issue-summarization",
image: "gcr.io/kubeflow-examples/tf-job-issue-summarization:v20180629-v0.1-2-g98ed4b4-dirty-182929",
input_data_gcs_bucket: "kubeflow-examples",
input_data_gcs_path: "github-issue-summarization-data/github-issues.zip",
output_model_gcs_bucket: "kubeflow-examples",
output_model_gcs_path: "github-issue-summarization-data",
image: "gcr.io/kubeflow-examples/github-issue-summarization/trainer-estimator:v20181229-v0.2-131-g662c666-dirty-312900",
input_data: "gs://kubeflow-examples/github-issue-summarization-data/github_issues_sample.csv",
output_model: "/tmp/model.h5",
sample_size: "100000",
num_epochs: "7",
gcpSecretName: "user-gcp-sa",
gcpSecretFile: "user-gcp-sa.json",
},

View File

@ -14,6 +14,7 @@ local tfjob = {
namespace: namespace,
},
spec: {
tTLSecondsAfterFinished: 60 * 60 * 24 * 7,
tfReplicaSpecs: {
Master: {
replicas: 1,
@ -32,15 +33,13 @@ local tfjob = {
],
command: [
"python",
],
args: [
"/workdir/train.py",
"train.py",
"--num_epochs=" + std.toString(params.num_epochs),
"--sample_size=" + std.toString(params.sample_size),
"--input_data_gcs_bucket=" + params.input_data_gcs_bucket,
"--input_data_gcs_path=" + params.input_data_gcs_path,
"--output_model_gcs_bucket=" + params.output_model_gcs_bucket,
"--output_model_gcs_path=" + params.output_model_gcs_path,
"--input_data=" + params.input_data,
"--output_model=" + params.output_model,
],
workingDir: "/issues",
env: [
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
@ -68,3 +67,4 @@ local tfjob = {
k.core.v1.list.new([
tfjob,
])

View File

@ -1,7 +1,9 @@
# TODO(jlewi): Can we merge with Dockerfile?
# This Dockerfile is used for training with TF.Estimator.
# This Dockerfile is used for training.
# We can probably use the same notebook Docker image if
# we just upgrade the notebook version.
# we just upgrade the notebook version. The conda environments
# however complicate things so it might be simpler just to
# have a separate image.
FROM python:3.6
# TODO(jlewi): We should probably pin version of TF and other libraries.

View File

@ -0,0 +1,99 @@
"""Test training using TFJob.
This file tests that we can submit the job from ksonnet
and that the job runs to completion.
It is an integration test as it depends on having access to
a Kubeflow deployment to submit the TFJob to.
Python Path Requirements:
kubeflow/tf-operator/py - https://github.com/kubeflow/tf-operator
* Provides utilities for testing TFJobs
kubeflow/testing/py - https://github.com/kubeflow/testing/tree/master/py
* Provides utilities for testing
Manually running the test
1. Configure your KUBECONFIG file to point to the desired cluster
2. Set --params=name=${NAME},namespace=${NAMESPACE}
* name should be the name for your job
* namespace should be the namespace to use
3. To test a new image set the parameter image e.g
--params=name=${NAME},namespace=${NAMESPACE},image=${IMAGE}
4. To control how long it trains set sample_size and num_epochs
--params=num_epochs=1,sample_size=10,...
"""
import json
import logging
import os
from kubernetes import client as k8s_client
from py import tf_job_client
from py import test_runner
from kubeflow.testing import ks_util
from kubeflow.testing import test_util
from kubeflow.testing import util
class TFJobTest(test_util.TestCase):
  """E2E test: submit the GIS TFJob via ksonnet and wait for completion.

  Fixes applied in this revision:
    * "namespaces" -> "namespace" typo in the post-apply log message.
    * Removed a stale comment claiming the test repeats multiple times;
      the body submits the job exactly once.
  (Scrape note: original indentation was lost; restored to 2-space style.)
  """

  def __init__(self, args):
    """Configure the test from parsed runtime args.

    Args:
      args: argparse-style namespace; must provide app_dir and params,
        plus whatever test_runner.parse_runtime_params consumes.
    """
    namespace, name, env = test_runner.parse_runtime_params(args)
    self.app_dir = args.app_dir

    if not self.app_dir:
      # Default to the ks_app directory that ships alongside this test.
      self.app_dir = os.path.join(os.path.dirname(__file__), "..",
                                  "ks_app")
      self.app_dir = os.path.abspath(self.app_dir)
      logging.info("--app_dir not set defaulting to: %s", self.app_dir)

    self.env = env
    self.namespace = namespace
    self.params = args.params
    self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
    super(TFJobTest, self).__init__(class_name="TFJobTest", name=name)

  def test_train(self):
    """Apply the tfjob component and assert the TFJob succeeds.

    Sets self.failure (picked up by the test runner) when the job ends
    in a non-succeeded state.
    """
    api_client = k8s_client.ApiClient()
    component = "tfjob"

    # Setup the ksonnet app (creates/updates the env and sets params).
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                         self.params)

    # Create the TF job.
    util.run([self.ks_cmd, "apply", self.env, "-c", component],
             cwd=self.app_dir)
    logging.info("Created job %s in namespace %s", self.name, self.namespace)

    # Wait for the job to complete, logging status transitions as we go.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
        api_client,
        self.namespace,
        self.name,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # Check for errors creating pods and services. Can potentially
    # help debug failed test runs.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
        api_client, self.namespace, results)
    if creation_failures:
      logging.warning(creation_failures)

    if not tf_job_client.job_succeeded(results):
      self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
          self.name, self.namespace, results.get("status", {}))
      logging.error(self.failure)
      return

    # We don't delete the jobs. We rely on TTLSecondsAfterFinished
    # to delete old jobs. Leaving jobs around should make it
    # easier to debug.
if __name__ == "__main__":
  # Delegate argument parsing, execution, and JUnit reporting to the
  # shared kubeflow test runner.
  test_runner.main(module=__name__)

View File

@ -15,13 +15,23 @@ local defaultParams = {
dataVolume: "kubeflow-test-volume",
// Default step image:
stepImage: "gcr.io/kubeflow-ci/test-worker:v20181017-bfeaaf5-dirty-4adcd0",
stepImage: "gcr.io/kubeflow-ci/test-worker:v20190104-f2a1cdf-e3b0c4",
// Which Kubeflow cluster to use for running TFJobs on.
kfProject: "kubeflow-ci",
kfZone: "us-east1-d",
kfCluster: "kf-v0-4-n00",
};
local params = defaultParams + overrides;
local prowEnv = util.parseEnv(params.prow_env);
// Workflow template is the name of the workflow template; typically the name of the ks component.
// This is used as a label to make it easy to identify all Argo workflows created from a given
// template.
local workflow_template = "gis";
// Create a dictionary of the different prow variables so we can refer to them in the workflow.
//
// Important: We want to initialize all variables we reference to some value. If we don't
@ -56,11 +66,14 @@ local srcRootDir = testDir + "/src";
// The directory containing the kubeflow/kubeflow repo
local srcDir = srcRootDir + "/" + prowDict.REPO_OWNER + "/" + prowDict.REPO_NAME;
// value of KUBECONFIG environment variable. This should be a full path.
local kubeConfig = testDir + "/.kube/kubeconfig";
// These variables control where the docker images get pushed and what
// tag to use
local imageBase = "gcr.io/kubeflow-ci/github-issue-summarization";
local imageTag = "build-" + prowDict["BUILD_ID"];
local trainerImage = imageBase + "/trainer-estimator:" + imageTag;
// Build template is a template for constructing Argo step templates.
//
@ -89,9 +102,18 @@ local buildTemplate = {
// py scripts to use.
local kubeflowTestingPy = srcRootDir + "/kubeflow/testing/py",
local tfOperatorPy = srcRootDir + "/kubeflow/tf-operator",
// Actual template for Argo
argoTemplate: {
name: template.name,
metadata: {
labels: prowDict + {
workflow: params.name,
workflow_template: workflow_template,
step_name: + template.name,
},
},
container: {
command: template.command,
name: template.name,
@ -101,7 +123,7 @@ local buildTemplate = {
{
// Add the source directories to the python path.
name: "PYTHONPATH",
value: kubeflowTestingPy,
value: kubeflowTestingPy + ":" + tfOperatorPy,
},
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
@ -116,6 +138,12 @@ local buildTemplate = {
},
},
},
{
// We use a directory in our NFS share to store our kube config.
// This way we can configure it on a single step and reuse it on subsequent steps.
name: "KUBECONFIG",
value: kubeConfig,
},
] + prowEnv + template.env_vars,
volumeMounts: [
{
@ -135,7 +163,6 @@ local buildTemplate = {
},
}; // buildTemplate
// Create a list of dictionary.
// Each item is a dictionary describing one step in the graph.
local dagTemplates = [
@ -147,7 +174,9 @@ local dagTemplates = [
env_vars: [{
name: "EXTRA_REPOS",
value: "kubeflow/testing@HEAD",
// tf-operator has utilities needed for testing TFJobs.
// TODO(jlewi): Update extra repos once kubeflow/testing#271 are merged.
value: "kubeflow/testing@HEAD:274;kubeflow/tf-operator@HEAD",
}],
},
dependencies: null,
@ -199,11 +228,50 @@ local dagTemplates = [
"train_test.py",
],
// Use the newly built image.
image: imageBase + "/trainer-estimator:" + imageTag,
image: trainerImage,
workingDir: "/issues",
},
dependencies: ["build-images"],
}, // train-test
{
// Configure KUBECONFIG
template: buildTemplate {
name: "get-kubeconfig",
command: util.buildCommand([
[
"gcloud",
"auth",
"activate-service-account",
"--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
],
[
"gcloud",
"--project=" + params.kfProject,
"container",
"clusters",
"get-credentials",
"--zone=" + params.kfZone,
params.kfCluster,
]]
),
workingDir: srcDir + "/github_issue_summarization",
},
dependencies: ["checkout"],
}, // get-kubeconfig
{
// Run the python test for TFJob
template: buildTemplate {
name: "tfjob-test",
command: [
"python",
"tfjob_test.py",
"--params=name=gis-test-" + prowDict["BUILD_ID"] + ",namespace=kubeflow,num_epochs=1,sample_size=10,image=" + trainerImage,
"--artifacts_path=" + artifactsDir,
],
workingDir: srcDir + "/github_issue_summarization/testing",
},
dependencies: ["build-images", "get-kubeconfig"],
}, // tfjob-test
];
// Dag defines the tasks in the graph
@ -282,11 +350,8 @@ local workflow = {
metadata: {
name: params.name,
namespace: env.namespace,
labels: {
org: prowDict.REPO_OWNER,
repo: prowDict.REPO_NAME,
workflow: "gis",
[if std.objectHas(prowDict, "PULL_NUMBER") then "pr"]: prowDict.PULL_NUMBER,
labels: prowDict + {
workflow_template: workflow_template,
},
},
spec: {

View File

@ -9,8 +9,8 @@ local envParams = params + {
},
gis+: {
namespace: 'kubeflow-test-infra',
name: 'jlewi-gis-search-test-449-1228-184223',
prow_env: 'JOB_NAME=gis-search-test,JOB_TYPE=presubmit,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=1228-184223,BUILD_ID=1228-184223,PULL_NUMBER=449',
name: 'jlewi-gis-search-test-456-0105-104058',
prow_env: 'JOB_NAME=gis-search-test,JOB_TYPE=presubmit,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=0105-104058,BUILD_ID=0105-104058,PULL_NUMBER=456',
},
},
};