mirror of https://github.com/kubeflow/examples.git
Add kubecon demo to xgboost_ames_housing directory (#589)
* Add the xgboost-ames-housing demo from KubeCon EU 2019.
* Fix links in the .ipynb in the xgboost-ames-housing demo.
* Update the xgboost demo example from KubeCon:
  - move the example to its own directory
  - remove unnecessary files
  - modify util and update the notebook
* Change the KubeCon-related names and update the README.
* Use fairing instead of the demo's own fairing_util in the notebook.
* Remove fairing_util and move the remaining helpers into util instead.
* Update the synthetic data example per review comments:
  - generalize the YAML
  - remove the GitHub update procedures
  - update the README
  - rename files
* Fix pylint issues.
parent 567998cb4e
commit ac9f2f1238
@@ -0,0 +1,7 @@
**/.build
**/.ipynb_checkpoints
**/.pipeline_build
**/__pycache__
*.zip
mlpipeline-metrics.json
mlpipeline-ui-metadata.json
@@ -0,0 +1,23 @@
# Create the docker image for the demo
#
# This docker image is based on an existing notebook image.
# It also includes the dependencies required for training and deploying;
# this way we can use it as the base image.
FROM gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0

USER root

COPY requirements.txt .
RUN pip3 --no-cache-dir install -r requirements.txt

RUN apt-get update -y
RUN apt-get install -y emacs

RUN pip3 install https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz

# Checkout kubeflow/testing because we use some of its utilities
RUN mkdir -p /src/kubeflow && \
    cd /src/kubeflow && \
    git clone https://github.com/kubeflow/testing.git testing

USER jovyan
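Once this image is built and pushed (see the Makefile below), it can presumably be substituted for the stock `gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0` in the `image:` field of `notebook.xgboost-synthetic.yaml`, so the notebook pod already carries the training and serving dependencies; that swap is implied by the comments above rather than spelled out in the commit.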
@@ -0,0 +1,55 @@
# IMG is the base path for images.
# Individual images will be
# $(IMG)/$(NAME):$(TAG)

PROJECT ?= code-search-demo
IMG ?= gcr.io/${PROJECT}/xgboost-synthetic/notebook
GITOPS_IMAGE ?= gcr.io/${PROJECT}/xgboost-synthetic/gitops

# List any changed files. We only include files in the notebooks directory
# because that is the code in the docker image.
# In particular we exclude changes to the ksonnet configs.
CHANGED_FILES := $(shell git diff-files)

# Whether to use cached images with GCB
USE_IMAGE_CACHE ?= true

ifeq ($(strip $(CHANGED_FILES)),)
# Changed files is empty; not dirty.
# Don't include --dirty because it could be dirty if files outside the ones we care
# about changed.
GIT_VERSION := $(shell git describe --always)
else
GIT_VERSION := $(shell git describe --always)-dirty-$(shell git diff | shasum -a256 | cut -c -6)
endif

TAG := $(shell date +v%Y%m%d)-$(GIT_VERSION)

all: build

# To build without the cache set the environment variable
# export DOCKER_BUILD_OPTS=--no-cache
.PHONY: build
build-dir: ./Dockerfile ./requirements.txt
	rm -rf ./.build
	mkdir -p ./.build
	cp ./requirements.txt ./.build/
	cp ./Dockerfile* ./.build/

build: build-dir
	cd .build && docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) -f Dockerfile . \
		--label=git-version=$(GIT_VERSION)
	docker tag $(IMG):$(TAG) $(IMG):latest
	@echo Built $(IMG):latest
	@echo Built $(IMG):$(TAG)

build-gcb: build-dir
	gcloud builds submit --machine-type=n1-highcpu-32 --project=$(PROJECT) --tag=$(IMG):$(TAG) \
		--timeout=3600 ./.build
	@echo Built $(IMG):$(TAG)

# Push the image; building it first allows manual testing/inspection of the image.
push: build
	docker push $(IMG):$(TAG)
	docker push $(IMG):latest
	@echo Pushed $(IMG):$(TAG)
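As a concrete illustration of the tagging scheme (the values here are hypothetical): on a clean working tree built on 2019-05-20, `make build` would produce an image tag along the lines of `gcr.io/code-search-demo/xgboost-synthetic/notebook:v20190520-ac9f2f1`, combining the date prefix with the `git describe --always` output; if tracked files have uncommitted changes, a `-dirty-<first 6 hex chars of the diff hash>` suffix is appended instead.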
@@ -0,0 +1,9 @@
# xgboost-synthetic
Kubeflow Fairing and Pipelines demo using synthetic data

1. Launch a notebook

```
kubectl apply -f notebook.xgboost-synthetic.yaml
```
1. Attach an extra data volume named

File diff suppressed because it is too large
@@ -0,0 +1,30 @@
apiVersion: kubeflow.org/v1alpha1
kind: Notebook
metadata:
  labels:
    app: notebook
  name: xgboost-synthetic
  namespace: kubeflow
spec:
  template:
    spec:
      containers:
      - env: []
        image: gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0
        name: tf-cpu
        resources:
          limits:
            cpu: 8
            memory: 16Gi
          requests:
            cpu: 1
            memory: 1Gi
        volumeMounts:
        - mountPath: /home/jovyan
          name: xgboost-synthetic
      serviceAccountName: jupyter-notebook
      ttlSecondsAfterFinished: 300
      volumes:
      - name: xgboost-synthetic
        persistentVolumeClaim:
          claimName: xgboost-synthetic
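The Notebook spec above mounts a PersistentVolumeClaim named `xgboost-synthetic` that the manifest itself does not create. Below is a minimal sketch of creating it with the Kubernetes Python client; the 10Gi size, the ReadWriteOnce access mode, and the use of the Python client rather than a YAML manifest are assumptions, not part of the commit.

```python
# Hypothetical helper: create the PVC that notebook.xgboost-synthetic.yaml expects.
# Assumes local kubectl credentials for the cluster and the `kubernetes` package.
from kubernetes import client, config

config.load_kube_config()

pvc = client.V1PersistentVolumeClaim(
    metadata=client.V1ObjectMeta(name="xgboost-synthetic", namespace="kubeflow"),
    spec=client.V1PersistentVolumeClaimSpec(
        access_modes=["ReadWriteOnce"],  # assumption: a single notebook pod mounts it
        resources=client.V1ResourceRequirements(requests={"storage": "10Gi"}),  # assumed size
    ),
)

client.CoreV1Api().create_namespaced_persistent_volume_claim(
    namespace="kubeflow", body=pvc)
```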
@@ -0,0 +1,11 @@
fairing
fire
gitpython
google-cloud-storage
joblib
numpy
pandas
retrying
seldon-core
sklearn
xgboost
@@ -0,0 +1,45 @@
import json
import logging
import os
import shutil
import subprocess

import numpy as np
import requests
from retrying import retry

KFP_PACKAGE = 'https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz'


def notebook_setup():
  """Install the KFP SDK and configure gcloud and docker auth for the notebook."""
  # Install the SDK
  subprocess.check_call(["pip3", "install", KFP_PACKAGE, "--upgrade"])

  logging.basicConfig(format='%(message)s')
  logging.getLogger().setLevel(logging.INFO)

  subprocess.check_call(["gcloud", "auth", "configure-docker", "--quiet"])
  subprocess.check_call(["gcloud", "auth", "activate-service-account",
                         "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS"),
                         "--quiet"])


def copy_data_to_nfs(nfs_path, model_dir):
  """Copy the ames_dataset to the NFS path and make sure model_dir exists."""
  if not os.path.exists(nfs_path):
    shutil.copytree("ames_dataset", nfs_path)

  if not os.path.exists(model_dir):
    os.makedirs(model_dir)


@retry(wait_exponential_multiplier=1000, wait_exponential_max=5000,
       stop_max_delay=2*60*1000)
def predict_nparray(url, data, feature_names=None):
  """POST a numpy array to a Seldon-style prediction endpoint and return the response."""
  pdata = {
    "data": {
      "names": feature_names,
      "tensor": {
        "shape": np.asarray(data.shape).tolist(),
        "values": data.flatten().tolist(),
      },
    }
  }
  serialized_data = json.dumps(pdata)
  r = requests.post(url, data={'json': serialized_data}, timeout=5)
  return r
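A hedged usage sketch (not part of the commit) of how these helpers might be combined from the notebook, assuming a Seldon model has already been deployed at a hypothetical in-cluster URL and that the `ames_dataset` directory sits next to the notebook:

```python
import numpy as np
import util  # the module defined above

util.notebook_setup()  # install the KFP SDK and configure gcloud/docker auth
util.copy_data_to_nfs("/mnt/nfs/ames_dataset", "/mnt/nfs/models")  # hypothetical NFS paths

# Hypothetical Seldon endpoint; the real URL depends on how the model is deployed.
URL = "http://xgboost-synthetic.kubeflow.svc.cluster.local:8000/api/v0.1/predictions"
features = np.random.rand(1, 37)  # one row of feature values; the shape is illustrative
response = util.predict_nparray(URL, features)
print(response.json())
```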
@@ -0,0 +1,111 @@
import logging
import re

from google.cloud import storage
import joblib
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import pandas as pd
from xgboost import XGBRegressor


def read_input(file_name, test_size=0.25):
  """Read input data and split it into train and test."""

  if file_name.startswith("gs://"):
    gcs_path = file_name
    train_bucket_name, train_path = split_gcs_uri(gcs_path)

    storage_client = storage.Client()
    train_bucket = storage_client.get_bucket(train_bucket_name)
    train_blob = train_bucket.blob(train_path)

    file_name = "/tmp/data.csv"
    train_blob.download_to_filename(file_name)

  data = pd.read_csv(file_name)
  data.dropna(axis=0, subset=['SalePrice'], inplace=True)

  y = data.SalePrice
  X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

  train_X, test_X, train_y, test_y = train_test_split(X.values,
                                                      y.values,
                                                      test_size=test_size,
                                                      shuffle=False)

  imputer = SimpleImputer()
  train_X = imputer.fit_transform(train_X)
  test_X = imputer.transform(test_X)

  return (train_X, train_y), (test_X, test_y)


def load_model(model_path):
  """Load a model from a local path or a GCS URI."""
  local_model_path = model_path

  if model_path.startswith("gs://"):
    gcs_path = model_path
    train_bucket_name, train_path = split_gcs_uri(gcs_path)

    storage_client = storage.Client()
    train_bucket = storage_client.get_bucket(train_bucket_name)
    train_blob = train_bucket.blob(train_path)

    local_model_path = "/tmp/model.dat"
    logging.info("Downloading model to %s", local_model_path)
    train_blob.download_to_filename(local_model_path)

  model = joblib.load(local_model_path)
  return model


def train_model(train_X,
                train_y,
                test_X,
                test_y,
                n_estimators,
                learning_rate):
  """Train the model using XGBRegressor."""
  model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate)

  model.fit(train_X,
            train_y,
            early_stopping_rounds=40,
            eval_set=[(test_X, test_y)])

  logging.info("Best RMSE on eval: %.2f with %d rounds",
               model.best_score,
               model.best_iteration+1)
  return model


def eval_model(model, test_X, test_y):
  """Evaluate the model performance."""
  predictions = model.predict(test_X)
  logging.info("mean_absolute_error=%.2f", mean_absolute_error(predictions, test_y))


def save_model(model, model_file):
  """Save XGBoost model for serving."""

  gcs_path = None
  if model_file.startswith("gs://"):
    gcs_path = model_file
    model_file = "/tmp/model.dat"
  joblib.dump(model, model_file)
  logging.info("Model export success: %s", model_file)

  if gcs_path:
    model_bucket_name, model_path = split_gcs_uri(gcs_path)
    storage_client = storage.Client()
    model_bucket = storage_client.get_bucket(model_bucket_name)
    model_blob = model_bucket.blob(model_path)

    logging.info("Uploading model to %s", gcs_path)
    model_blob.upload_from_filename(model_file)


def split_gcs_uri(gcs_uri):
  """Split a GCS URI into bucket and path."""
  GCS_REGEX = re.compile("gs://([^/]*)(/.*)?")
  m = GCS_REGEX.match(gcs_uri)
  bucket = m.group(1)
  path = ""
  if m.group(2):
    path = m.group(2).lstrip("/")
  return bucket, path
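A hedged end-to-end sketch (not part of the commit) showing how these helpers compose; the GCS paths and hyperparameter values are placeholders, and the module is assumed to be importable here as `xgboost_util` since the file name is not shown in this diff:

```python
# Hypothetical driver tying the helpers above together.
import xgboost_util as xu

(train_X, train_y), (test_X, test_y) = xu.read_input(
    "gs://my-bucket/ames/train.csv", test_size=0.25)  # placeholder GCS path

model = xu.train_model(train_X, train_y, test_X, test_y,
                       n_estimators=50, learning_rate=0.1)  # assumed hyperparameters
xu.eval_model(model, test_X, test_y)

xu.save_model(model, "gs://my-bucket/ames/model.dat")   # placeholder output path
reloaded = xu.load_model("gs://my-bucket/ames/model.dat")
```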