Add kubecon demo to xgboost_ames_housing directory (#589)

* Add the xgboost-ames-housing demo from KubeCon EU 2019.

* fix links in the .ipynb in the xgboost-ames-housing demo

* update to the xgboost demo example from kubecon
- move example to its own directory
- remove unnecessary files
- modify util and update notebook

* change the names related to kubecon and update readme

* use fairing instead of our own fairing_util in the notebook

* remove fairing_util and move the remaining helpers into util.py

* update the synthetic data example per review comments
- generalize yaml
- remove updating github procedures
- update readme
- rename files

* fix pylint.

* fix pylint.
Chun-Hsiang Wang 2019-07-16 10:33:25 -07:00 committed by Kubernetes Prow Robot
parent 567998cb4e
commit ac9f2f1238
9 changed files with 1293 additions and 0 deletions

xgboost_synthetic/.gitignore

@@ -0,0 +1,7 @@
**/.build
**/.ipynb_checkpoints
**/.pipeline_build
**/__pycache__
*.zip
mlpipeline-metrics.json
mlpipeline-ui-metadata.json

xgboost_synthetic/Dockerfile

@@ -0,0 +1,23 @@
# Create the docker image for the demo.
#
# This image is based on the existing Kubeflow notebook image; it adds the
# dependencies required for training and deploying, so it can serve as the
# base image for the notebook server.
FROM gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0

USER root

COPY requirements.txt .
RUN pip3 --no-cache-dir install -r requirements.txt

RUN apt-get update -y
RUN apt-get install -y emacs

RUN pip3 install https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz

# Checkout kubeflow/testing because we use some of its utilities
RUN mkdir -p /src/kubeflow && \
    cd /src/kubeflow && \
    git clone https://github.com/kubeflow/testing.git testing

USER jovyan

xgboost_synthetic/Makefile

@@ -0,0 +1,55 @@
# IMG is the base path for images.
# Individual images will be $(IMG)/$(NAME):$(TAG)
PROJECT ?= code-search-demo

IMG ?= gcr.io/${PROJECT}/xgboost-synthetic/notebook
GITOPS_IMAGE ?= gcr.io/${PROJECT}/xgboost-synthetic/gitops

# List any changed files so we can mark the build dirty when the working tree
# has uncommitted changes.
CHANGED_FILES := $(shell git diff-files)

# Whether to use cached images with GCB
USE_IMAGE_CACHE ?= true

ifeq ($(strip $(CHANGED_FILES)),)
# Changed files is empty; not dirty.
# Don't include --dirty because the tree could be dirty due to files outside
# the ones we care about.
GIT_VERSION := $(shell git describe --always)
else
GIT_VERSION := $(shell git describe --always)-dirty-$(shell git diff | shasum -a256 | cut -c -6)
endif

TAG := $(shell date +v%Y%m%d)-$(GIT_VERSION)

all: build

# To build without the cache set the environment variable
# export DOCKER_BUILD_OPTS=--no-cache
.PHONY: build
build-dir: ./Dockerfile ./requirements.txt
	rm -rf ./.build
	mkdir -p ./.build
	cp ./requirements.txt ./.build/
	cp ./Dockerfile* ./.build/

build: build-dir
	cd .build && docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) -f Dockerfile . \
		--label=git-version=$(GIT_VERSION)
	docker tag $(IMG):$(TAG) $(IMG):latest
	@echo Built $(IMG):latest
	@echo Built $(IMG):$(TAG)

build-gcb: build-dir
	gcloud builds submit --machine-type=n1-highcpu-32 --project=$(PROJECT) --tag=$(IMG):$(TAG) \
		--timeout=3600 ./.build
	@echo Built $(IMG):$(TAG)

# Build the image and push both the versioned and latest tags.
push: build
	docker push $(IMG):$(TAG)
	docker push $(IMG):latest
	@echo Pushed $(IMG):$(TAG)

xgboost_synthetic/README.md

@@ -0,0 +1,9 @@
# xgboost-synthetic
A demo of Kubeflow Fairing and Kubeflow Pipelines using synthetic data.
1. Launch a notebook
```
kubectl apply -f notebook.xgboost-synthetic.yaml
```
1. Attach an extra data volume named

File diff suppressed because it is too large.

xgboost_synthetic/notebook.xgboost-synthetic.yaml

@@ -0,0 +1,30 @@
apiVersion: kubeflow.org/v1alpha1
kind: Notebook
metadata:
  labels:
    app: notebook
  name: xgboost-synthetic
  namespace: kubeflow
spec:
  template:
    spec:
      containers:
      - env: []
        image: gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0
        name: tf-cpu
        resources:
          limits:
            cpu: 8
            memory: 16Gi
          requests:
            cpu: 1
            memory: 1Gi
        volumeMounts:
        - mountPath: /home/jovyan
          name: xgboost-synthetic
      serviceAccountName: jupyter-notebook
      ttlSecondsAfterFinished: 300
      volumes:
      - name: xgboost-synthetic
        persistentVolumeClaim:
          claimName: xgboost-synthetic

xgboost_synthetic/requirements.txt

@@ -0,0 +1,11 @@
fairing
fire
gitpython
google-cloud-storage
joblib
numpy
pandas
retrying
seldon-core
sklearn
xgboost

xgboost_synthetic/util.py

@@ -0,0 +1,45 @@
import json
import logging
import os
import shutil
import subprocess

import numpy as np
import requests
from retrying import retry

KFP_PACKAGE = 'https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz'


def notebook_setup():
    """Install the KFP SDK and configure gcloud/docker auth for the notebook."""
    # Install the Kubeflow Pipelines SDK
    subprocess.check_call(["pip3", "install", KFP_PACKAGE, "--upgrade"])

    logging.basicConfig(format='%(message)s')
    logging.getLogger().setLevel(logging.INFO)

    subprocess.check_call(["gcloud", "auth", "configure-docker", "--quiet"])
    subprocess.check_call(["gcloud", "auth", "activate-service-account",
                           "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS"),
                           "--quiet"])


def copy_data_to_nfs(nfs_path, model_dir):
    """Copy the ames_dataset directory onto the NFS volume and create the model dir."""
    if not os.path.exists(nfs_path):
        shutil.copytree("ames_dataset", nfs_path)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)


@retry(wait_exponential_multiplier=1000, wait_exponential_max=5000,
       stop_max_delay=2*60*1000)
def predict_nparray(url, data, feature_names=None):
    """Send a numpy array to a Seldon prediction endpoint and return the response."""
    pdata = {
        "data": {
            "names": feature_names,
            "tensor": {
                "shape": np.asarray(data.shape).tolist(),
                "values": data.flatten().tolist(),
            },
        }
    }
    serialized_data = json.dumps(pdata)
    r = requests.post(url, data={'json': serialized_data}, timeout=5)
    return r
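The helpers above are meant to be called from the demo notebook. Below is a minimal usage sketch; the Seldon endpoint URL and the input array are hypothetical placeholders, not values taken from this commit:

```python
import numpy as np

import util

# Install the KFP SDK and set up gcloud/docker credentials inside the notebook pod.
util.notebook_setup()

# Hypothetical Seldon prediction endpoint for the deployed model (replace with your own).
PREDICTION_URL = "http://xgboost-synthetic.kubeflow.svc.cluster.local:8000/api/v0.1/predictions"

# A single example with hypothetical feature values; shape is (1, n_features).
features = np.array([[1.0, 2.0, 3.0]])

# predict_nparray serializes the array into Seldon's tensor payload and POSTs it.
response = util.predict_nparray(PREDICTION_URL, features)
print(response.json())
```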

@@ -0,0 +1,111 @@
import logging
import re

import joblib
import pandas as pd
from google.cloud import storage
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


def read_input(file_name, test_size=0.25):
    """Read input data and split it into train and test."""
    if file_name.startswith("gs://"):
        gcs_path = file_name
        train_bucket_name, train_path = split_gcs_uri(gcs_path)
        storage_client = storage.Client()
        train_bucket = storage_client.get_bucket(train_bucket_name)
        train_blob = train_bucket.blob(train_path)
        file_name = "/tmp/data.csv"
        train_blob.download_to_filename(file_name)

    data = pd.read_csv(file_name)
    data.dropna(axis=0, subset=['SalePrice'], inplace=True)

    y = data.SalePrice
    X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

    train_X, test_X, train_y, test_y = train_test_split(X.values,
                                                        y.values,
                                                        test_size=test_size,
                                                        shuffle=False)

    imputer = SimpleImputer()
    train_X = imputer.fit_transform(train_X)
    test_X = imputer.transform(test_X)

    return (train_X, train_y), (test_X, test_y)


def load_model(model_path):
    """Load a model from a local path or a GCS URI."""
    local_model_path = model_path
    if model_path.startswith("gs://"):
        gcs_path = model_path
        train_bucket_name, train_path = split_gcs_uri(gcs_path)
        storage_client = storage.Client()
        train_bucket = storage_client.get_bucket(train_bucket_name)
        train_blob = train_bucket.blob(train_path)
        local_model_path = "/tmp/model.dat"
        logging.info("Downloading model to %s", local_model_path)
        train_blob.download_to_filename(local_model_path)

    model = joblib.load(local_model_path)
    return model


def train_model(train_X,
                train_y,
                test_X,
                test_y,
                n_estimators,
                learning_rate):
    """Train the model using XGBRegressor."""
    model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate)

    model.fit(train_X,
              train_y,
              early_stopping_rounds=40,
              eval_set=[(test_X, test_y)])

    logging.info("Best RMSE on eval: %.2f with %d rounds",
                 model.best_score,
                 model.best_iteration+1)
    return model


def eval_model(model, test_X, test_y):
    """Evaluate the model performance."""
    predictions = model.predict(test_X)
    logging.info("mean_absolute_error=%.2f", mean_absolute_error(predictions, test_y))


def save_model(model, model_file):
    """Save the XGBoost model for serving."""
    gcs_path = None
    if model_file.startswith("gs://"):
        gcs_path = model_file
        model_file = "/tmp/model.dat"

    joblib.dump(model, model_file)
    logging.info("Model export success: %s", model_file)

    if gcs_path:
        model_bucket_name, model_path = split_gcs_uri(gcs_path)
        storage_client = storage.Client()
        model_bucket = storage_client.get_bucket(model_bucket_name)
        model_blob = model_bucket.blob(model_path)
        logging.info("Uploading model to %s", gcs_path)
        model_blob.upload_from_filename(model_file)


def split_gcs_uri(gcs_uri):
    """Split a GCS URI into bucket and path."""
    GCS_REGEX = re.compile("gs://([^/]*)(/.*)?")
    m = GCS_REGEX.match(gcs_uri)
    bucket = m.group(1)
    path = ""
    if m.group(2):
        path = m.group(2).lstrip("/")
    return bucket, path
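For completeness, here is a minimal end-to-end sketch of how the training helpers above fit together. The module name `trainer`, the CSV path, the GCS output path, and the hyperparameter values are hypothetical placeholders; the file name of this module is not shown in this view.

```python
import logging

# Placeholder import for the module in the last diff above.
import trainer

logging.basicConfig(level=logging.INFO)

# Hypothetical local CSV with a SalePrice column (a gs:// URI also works,
# in which case read_input downloads it via split_gcs_uri first).
(train_X, train_y), (test_X, test_y) = trainer.read_input("train.csv")

# Hypothetical hyperparameters; early stopping uses the test split as eval_set.
model = trainer.train_model(train_X, train_y, test_X, test_y,
                            n_estimators=50, learning_rate=0.1)

# Log mean absolute error on the held-out split.
trainer.eval_model(model, test_X, test_y)

# Save locally or to GCS; the gs:// case uploads the joblib dump with the storage client.
trainer.save_model(model, "gs://my-bucket/xgboost-synthetic/model.dat")
```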