Fix model file upload (#160)

* Add component parameters
  Add model_url & port arguments to flask app
  Add service_type, image, and model_url parameters to ui component
  Fix problem argument in tensor2tensor component
* Fix broken UI component
  Fix broken UI component structure by adding all, service, & deployment parts
  Add parameter defaults for tfjob to resolve failures deploying other components
* Add missing imports in flask app
  Fix syntax error in argument parsing
  Remove underscores from parameter names to workaround ksonnet bug #554: https://github.com/ksonnet/ksonnet/issues/554
* Fix syntax errors in t2t instructions
  Add CPU image build arg to docker build command for t2t-training
  Fix link to ksonnet app dir
  Correct param names for tensor2tensor component
  Add missing params for tensor2tensor component
  Fix apply command syntax
  Swap out log view pod for t2t-master instead of tf-operator
  Fix link to training with tfjob
* Fix model file upload
  Update default params for tfjob-v1alpha2
  Fix build directory path in Makefile
* Resolve lint issues
  Lines too long
* Add specific image tag to tfjob-v1alpha2 default
* Fix defaults for training output files
  Update image tag
  Add UI image tag
* Revert service account secret details
  Update associated readme
2018-06-29 18:41:20 -07:00 · 2018-06-29 18:41:20 -07:00 · 836ad70421
parent 98ed4b4a69
commit 836ad70421
4 changed files with 19 additions and 15 deletions
--- a/github_issue_summarization/02_training_the_model_tfjob.md
+++ b/github_issue_summarization/02_training_the_model_tfjob.md
@@ -53,7 +53,7 @@ and the resulting model.

 * Give the storage account `roles/storage.admin` role so that it can access GCS Buckets.

-* Download its key as a json file and create a secret named `gcp-credentials` with the key `key.json`
+* Download its key as a json file and create a secret named `user-gcp-sa` with the key `user-gcp-sa.json`

 ```commandline
 SERVICE_ACCOUNT=github-issue-summarization
@@ -68,7 +68,7 @@ KEY_FILE=/home/agwl/secrets/${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.co
 gcloud iam service-accounts keys create ${KEY_FILE} \
  --iam-account ${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com

-kubectl --namespace=${NAMESPACE} create secret generic gcp-credentials --from-file=key.json="${KEY_FILE}"
+kubectl --namespace=${NAMESPACE} create secret generic user-gcp-sa --from-file=user-gcp-sa.json="${KEY_FILE}"
 ```


--- a/github_issue_summarization/ks-kubeflow/components/params.libsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/params.libsonnet
@@ -53,14 +53,15 @@
    ui: {
      namespace: "null",
      githubToken: "",
+      image: "gcr.io/kubeflow-examples/issue-summarization-ui:v20180629-v0.1-2-g98ed4b4-dirty-182929",
    },
    "tfjob-v1alpha2": {
-      name: "tfjob-v1alpha2",
-      image: "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888",
+      name: "tfjob-issue-summarization",
+      image: "gcr.io/kubeflow-examples/tf-job-issue-summarization:v20180629-v0.1-2-g98ed4b4-dirty-182929",
      input_data_gcs_bucket: "kubeflow-examples",
      input_data_gcs_path: "github-issue-summarization-data/github-issues.zip",
      output_model_gcs_bucket: "kubeflow-examples",
-      output_model_gcs_path: "github-issue-summarization-data/output_model.h5",
+      output_model_gcs_path: "github-issue-summarization-data",
      sample_size: "100000",
      gcpSecretName: "user-gcp-sa",
      gcpSecretFile: "user-gcp-sa.json",
--- a/github_issue_summarization/notebooks/Makefile
+++ b/github_issue_summarization/notebooks/Makefile
@@ -35,7 +35,7 @@ DIR := $(shell pwd)

 # Use a subdirectory of the root directory
 # this way it will be excluded by git diff-files
-BUILD_DIR := $(shell cd ../build/notebook_build && pwd)
+BUILD_DIR := $(pwd)

 MODEL_GCS := gs://kubeflow-examples-data/gh_issue_summarization/model/v20180426
 # You can override this on the command line as
--- a/github_issue_summarization/notebooks/train.py
+++ b/github_issue_summarization/notebooks/train.py
@@ -72,28 +72,26 @@ def main():  # pylint: disable=too-many-statements
    default="",
    help="The output location for the model GCS or local file path.")

-  # TODO(jlewi): We should get rid of the following arguments and just use
-  # --output_model_h5. If the output is a gs:// location we should use
-  # a local file and then upload it to GCS.
  parser.add_argument("--output_model_gcs_bucket", type=str, default="")
  parser.add_argument(
    "--output_model_gcs_path",
    type=str,
-    default="github-issue-summarization-data/output_model.h5")
+    default="github-issue-summarization-data")

  parser.add_argument(
    "--output_body_preprocessor_dpkl",
    type=str,
-    default="body_preprocessor.dpkl")
+    default="body_pp.dpkl")
  parser.add_argument(
    "--output_title_preprocessor_dpkl",
    type=str,
-    default="title_preprocessor.dpkl")
+    default="title_pp.dpkl")
  parser.add_argument(
    "--output_train_title_vecs_npy", type=str, default="train_title_vecs.npy")
  parser.add_argument(
    "--output_train_body_vecs_npy", type=str, default="train_body_vecs.npy")
-  parser.add_argument("--output_model_h5", type=str, default="output_model.h5")
+  parser.add_argument(
+    "--output_model_h5", type=str, default="seq2seq_model_tutorial.h5")

  args = parser.parse_args()

@@ -273,11 +271,16 @@ def main():  # pylint: disable=too-many-statements
        args.output_model)

  if output_model_gcs_bucket:
-    logging.info("Uploading model to bucket %s path %s.",
+    logging.info("Uploading model files to bucket %s path %s.",
                 output_model_gcs_bucket, output_model_gcs_path)
    bucket = storage.Bucket(storage.Client(), output_model_gcs_bucket)
-    storage.Blob(output_model_gcs_path, bucket).upload_from_filename(
+    storage.Blob(
+      output_model_gcs_path + "/" + args.output_model_h5, bucket).upload_from_filename(
      args.output_model_h5)
+    storage.Blob(output_model_gcs_path + "/" + args.output_body_preprocessor_dpkl,
+                 bucket).upload_from_filename(args.output_body_preprocessor_dpkl)
+    storage.Blob(output_model_gcs_path + "/" + args.output_title_preprocessor_dpkl,
+                 bucket).upload_from_filename(args.output_title_preprocessor_dpkl)


 if __name__ == '__main__':