Update ksonnet for datagen (#309)

* Update the datagen component.

* We should use a K8s job rather than a TFJob. We can also simplify the
  ksonnet by just putting the spec into the jsonnet file rather than trying
  to share various bits of the spec with the TFJob for training.

Related to kubeflow/examples#308: use globals to allow parameters
(e.g. the working directory) to be shared across components.

* Update the README with information about data.

* Fix table markdown.
This commit is contained in:
Jeremy Lewi 2018-11-07 14:28:16 -08:00 committed by k8s-ci-robot
parent 11879e2ff1
commit d01b76b6f9
4 changed files with 91 additions and 12 deletions

View File

@ -3,4 +3,32 @@
This directory contains assets for setting up a demo of the code search example.
It is primarily intended for use by Kubeflow contributors working on the shared demo.
Users looking to run the example should follow the README.md in the parent directory.
Users looking to run the example should follow the README.md in the parent directory.
# GCP Resources
We are using the following GCP project:
* **org**: kubeflow.org
* **project**: code-search-demo
* **[code-search-team@kubeflow.org](https://github.com/kubeflow/internal-acls/blob/master/code-search-team.members.txt)** Google group administering access
# Results
## 2018-11-05
jlewi@ ran experiments that produced the following results
| What | location | Description
|------|----------|-------------------------
| Preprocessed data| gs://code-search-demo/20181104/data/func-doc-pairs-00???-of-00100.csv | This is the output of the Dataflow preprocessing job
| Training data | gs://code-search-demo/20181104/data/kf_github_function_docstring-train-00???-of-00100 | TFRecord files produced by running T2T datagen

View File

@ -35,6 +35,7 @@
"t2t-code-search-datagen": {
jobType: "datagen",
name: "t2t-code-search-datagen",
image: $.components["t2t-job"].image,
problem: $.components["t2t-code-search"].problem,
dataDir: $.components["t2t-code-search"].workingDir + "/data",
},

View File

@ -1,7 +1,66 @@
// A K8s job to run datagen using T2T.
local k = import "k.libsonnet";
local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2t-code-search-datagen"];
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
// Kubernetes batch/v1 Job that runs `t2t-datagen` for the configured problem
// and writes its output under params.dataDir.
//
// Reads from the surrounding file:
//   params.name    - used as the Job name and as the `app` label value.
//   params.image   - container image that provides t2t-entrypoint.
//   params.problem - passed through as --problem.
//   params.dataDir - passed through as --data_dir.
//   env.namespace  - namespace the Job is created in.
local jobSpec = {
  apiVersion: "batch/v1",
  kind: "Job",
  metadata: {
    name: params.name,
    namespace: env.namespace,
    labels: {
      app: params.name,
    },
  },
  spec: {
    // NOTE: `replicas` is not a field of batch/v1 JobSpec (it belongs to
    // Deployments/ReplicaSets/TFJob replica specs), so it has been dropped.
    // A Job's `parallelism` and `completions` both default to 1, which keeps
    // the intended single-pod behavior.
    template: {
      metadata: {
        labels: {
          app: params.name,
        },
      },
      spec: {
        // Restart the container in place if datagen exits with an error.
        restartPolicy: "OnFailure",
        containers: [
          {
            name: "t2t-datagen",
            image: params.image,
            command: [
              "/usr/local/sbin/t2t-entrypoint",
              "t2t-datagen",
              "--problem=" + params.problem,
              "--data_dir=" + params.dataDir,
            ],
            env: [
              {
                // Points at the key file inside the secret volume mounted
                // below.
                name: "GOOGLE_APPLICATION_CREDENTIALS",
                value: "/secret/gcp-credentials/user-gcp-sa.json",
              },
            ],
            workingDir: "/src",
            volumeMounts: [
              {
                mountPath: "/secret/gcp-credentials",
                name: "gcp-credentials",
              },
            ],  //volumeMounts
          },
        ],  // containers
        volumes: [
          {
            // Backed by the `user-gcp-sa` secret, which must already exist
            // in the target namespace.
            name: "gcp-credentials",
            secret: {
              secretName: "user-gcp-sa",
            },
          },
        ],
      },  // spec
    },
  },
};
std.prune(k.core.v1.list.new([jobSpec]))

View File

@ -1,14 +1,6 @@
local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
{
getDatagenCmd(params)::
[
"/usr/local/sbin/t2t-entrypoint",
"t2t-datagen",
"--problem=" + params.problem,
"--data_dir=" + params.dataDir,
],
{
getExporterCmd(params)::
[
"/usr/local/sbin/t2t-entrypoint",
@ -104,7 +96,6 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
local cmd = $.getTrainerCmd(params),
local workerCmd = if params.jobType == "exporter" then $.getExporterCmd(params)
else if params.jobType == "datagen" then $.getDatagenCmd(params)
else cmd.worker,
job:: {