Add kubecon demo to xgboost_ames_housing directory (#589)

* Add the xgboost-ames-housing demo from KubeCon EU 2019.

* fix links in the .ipynb in the xgboost-ames-housing demo

* update to the xgboost demo example from kubecon
- move example to its own directory
- remove unnecessary files
- modify util and update notebook

* change the names related to kubecon and update readme

* use fairing instead of our own fairing_util in the notebook

* remove fairing_util and move the remaining helpers into util.py

* update the synthetic data example per review comments
- generalize yaml
- remove updating github procedures
- update readme
- rename files

* fix pylint.

* fix pylint.
Chun-Hsiang Wang 2019-07-16 10:33:25 -07:00 committed by Kubernetes Prow Robot
parent 567998cb4e
commit ac9f2f1238
9 changed files with 1293 additions and 0 deletions

xgboost_synthetic/.gitignore

@@ -0,0 +1,7 @@
**/.build
**/.ipynb_checkpoints
**/.pipeline_build
**/__pycache__
*.zip
mlpipeline-metrics.json
mlpipeline-ui-metadata.json

xgboost_synthetic/Dockerfile

@@ -0,0 +1,23 @@
# Create the docker image for the demo.
#
# This image is based on the existing Kubeflow notebook image; it adds the
# dependencies required for training and deploying, so it can serve as the
# base image for the notebook server.
FROM gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0

USER root

COPY requirements.txt .
RUN pip3 --no-cache-dir install -r requirements.txt

RUN apt-get update -y
RUN apt-get install -y emacs

RUN pip3 install https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz

# Checkout kubeflow/testing because we use some of its utilities
RUN mkdir -p /src/kubeflow && \
    cd /src/kubeflow && \
    git clone https://github.com/kubeflow/testing.git testing

USER jovyan

xgboost_synthetic/Makefile

@@ -0,0 +1,55 @@
# IMG is the base path for images.
# Individual images will be $(IMG)/$(NAME):$(TAG)
PROJECT ?= code-search-demo

IMG ?= gcr.io/${PROJECT}/xgboost-synthetic/notebook
GITOPS_IMAGE ?= gcr.io/${PROJECT}/xgboost-synthetic/gitops

# List any changed files so we can mark the build dirty when the working tree
# has uncommitted changes.
CHANGED_FILES := $(shell git diff-files)

# Whether to use cached images with GCB
USE_IMAGE_CACHE ?= true

ifeq ($(strip $(CHANGED_FILES)),)
# Changed files is empty; not dirty.
# Don't include --dirty because the tree could be dirty due to files outside
# the ones we care about.
GIT_VERSION := $(shell git describe --always)
else
GIT_VERSION := $(shell git describe --always)-dirty-$(shell git diff | shasum -a256 | cut -c -6)
endif

TAG := $(shell date +v%Y%m%d)-$(GIT_VERSION)

all: build

# To build without the cache set the environment variable
# export DOCKER_BUILD_OPTS=--no-cache
.PHONY: build
build-dir: ./Dockerfile ./requirements.txt
	rm -rf ./.build
	mkdir -p ./.build
	cp ./requirements.txt ./.build/
	cp ./Dockerfile* ./.build/

build: build-dir
	cd .build && docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) -f Dockerfile . \
		--label=git-version=$(GIT_VERSION)
	docker tag $(IMG):$(TAG) $(IMG):latest
	@echo Built $(IMG):latest
	@echo Built $(IMG):$(TAG)

build-gcb: build-dir
	gcloud builds submit --machine-type=n1-highcpu-32 --project=$(PROJECT) --tag=$(IMG):$(TAG) \
		--timeout=3600 ./.build
	@echo Built $(IMG):$(TAG)

# Build the image and push both the versioned and latest tags.
push: build
	docker push $(IMG):$(TAG)
	docker push $(IMG):latest
	@echo Pushed $(IMG):$(TAG)

xgboost_synthetic/README.md

@@ -0,0 +1,9 @@
# xgboost-synthetic
A demo of Kubeflow Fairing and Kubeflow Pipelines using synthetic data.
1. Launch a notebook
```
kubectl apply -f notebook.xgboost-synthetic.yaml
```
1. Attach an extra data volume named

File diff suppressed because it is too large.

xgboost_synthetic/notebook.xgboost-synthetic.yaml

@@ -0,0 +1,30 @@
apiVersion: kubeflow.org/v1alpha1
kind: Notebook
metadata:
  labels:
    app: notebook
  name: xgboost-synthetic
  namespace: kubeflow
spec:
  template:
    spec:
      containers:
      - env: []
        image: gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0
        name: tf-cpu
        resources:
          limits:
            cpu: 8
            memory: 16Gi
          requests:
            cpu: 1
            memory: 1Gi
        volumeMounts:
        - mountPath: /home/jovyan
          name: xgboost-synthetic
      serviceAccountName: jupyter-notebook
      ttlSecondsAfterFinished: 300
      volumes:
      - name: xgboost-synthetic
        persistentVolumeClaim:
          claimName: xgboost-synthetic

xgboost_synthetic/requirements.txt

@@ -0,0 +1,11 @@
fairing
fire
gitpython
google-cloud-storage
joblib
numpy
pandas
retrying
seldon-core
sklearn
xgboost

xgboost_synthetic/util.py

@@ -0,0 +1,45 @@
import json
import logging
import os
import shutil
import subprocess

import numpy as np
import requests
from retrying import retry

KFP_PACKAGE = 'https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz'


def notebook_setup():
    """Install the KFP SDK and configure gcloud/docker auth for the notebook."""
    # Install the Kubeflow Pipelines SDK
    subprocess.check_call(["pip3", "install", KFP_PACKAGE, "--upgrade"])

    logging.basicConfig(format='%(message)s')
    logging.getLogger().setLevel(logging.INFO)

    subprocess.check_call(["gcloud", "auth", "configure-docker", "--quiet"])
    subprocess.check_call(["gcloud", "auth", "activate-service-account",
                           "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS"),
                           "--quiet"])


def copy_data_to_nfs(nfs_path, model_dir):
    """Copy the ames_dataset directory onto the NFS volume and create the model dir."""
    if not os.path.exists(nfs_path):
        shutil.copytree("ames_dataset", nfs_path)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)


@retry(wait_exponential_multiplier=1000, wait_exponential_max=5000,
       stop_max_delay=2*60*1000)
def predict_nparray(url, data, feature_names=None):
    """Send a numpy array to a Seldon prediction endpoint and return the response."""
    pdata = {
        "data": {
            "names": feature_names,
            "tensor": {
                "shape": np.asarray(data.shape).tolist(),
                "values": data.flatten().tolist(),
            },
        }
    }
    serialized_data = json.dumps(pdata)
    r = requests.post(url, data={'json': serialized_data}, timeout=5)
    return r
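The helpers above are meant to be called from the demo notebook. Below is a minimal usage sketch; the Seldon endpoint URL and the input array are hypothetical placeholders, not values taken from this commit:

```python
import numpy as np

import util

# Install the KFP SDK and set up gcloud/docker credentials inside the notebook pod.
util.notebook_setup()

# Hypothetical Seldon prediction endpoint for the deployed model (replace with your own).
PREDICTION_URL = "http://xgboost-synthetic.kubeflow.svc.cluster.local:8000/api/v0.1/predictions"

# A single example with hypothetical feature values; shape is (1, n_features).
features = np.array([[1.0, 2.0, 3.0]])

# predict_nparray serializes the array into Seldon's tensor payload and POSTs it.
response = util.predict_nparray(PREDICTION_URL, features)
print(response.json())
```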

@@ -0,0 +1,111 @@
import logging
import re

import joblib
import pandas as pd
from google.cloud import storage
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


def read_input(file_name, test_size=0.25):
    """Read input data and split it into train and test."""
    if file_name.startswith("gs://"):
        gcs_path = file_name
        train_bucket_name, train_path = split_gcs_uri(gcs_path)
        storage_client = storage.Client()
        train_bucket = storage_client.get_bucket(train_bucket_name)
        train_blob = train_bucket.blob(train_path)
        file_name = "/tmp/data.csv"
        train_blob.download_to_filename(file_name)

    data = pd.read_csv(file_name)
    data.dropna(axis=0, subset=['SalePrice'], inplace=True)

    y = data.SalePrice
    X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

    train_X, test_X, train_y, test_y = train_test_split(X.values,
                                                        y.values,
                                                        test_size=test_size,
                                                        shuffle=False)

    imputer = SimpleImputer()
    train_X = imputer.fit_transform(train_X)
    test_X = imputer.transform(test_X)

    return (train_X, train_y), (test_X, test_y)


def load_model(model_path):
    """Load a model from a local path or a GCS URI."""
    local_model_path = model_path
    if model_path.startswith("gs://"):
        gcs_path = model_path
        train_bucket_name, train_path = split_gcs_uri(gcs_path)
        storage_client = storage.Client()
        train_bucket = storage_client.get_bucket(train_bucket_name)
        train_blob = train_bucket.blob(train_path)
        local_model_path = "/tmp/model.dat"
        logging.info("Downloading model to %s", local_model_path)
        train_blob.download_to_filename(local_model_path)

    model = joblib.load(local_model_path)
    return model


def train_model(train_X,
                train_y,
                test_X,
                test_y,
                n_estimators,
                learning_rate):
    """Train the model using XGBRegressor."""
    model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate)

    model.fit(train_X,
              train_y,
              early_stopping_rounds=40,
              eval_set=[(test_X, test_y)])

    logging.info("Best RMSE on eval: %.2f with %d rounds",
                 model.best_score,
                 model.best_iteration+1)
    return model


def eval_model(model, test_X, test_y):
    """Evaluate the model performance."""
    predictions = model.predict(test_X)
    logging.info("mean_absolute_error=%.2f", mean_absolute_error(predictions, test_y))


def save_model(model, model_file):
    """Save the XGBoost model for serving."""
    gcs_path = None
    if model_file.startswith("gs://"):
        gcs_path = model_file
        model_file = "/tmp/model.dat"

    joblib.dump(model, model_file)
    logging.info("Model export success: %s", model_file)

    if gcs_path:
        model_bucket_name, model_path = split_gcs_uri(gcs_path)
        storage_client = storage.Client()
        model_bucket = storage_client.get_bucket(model_bucket_name)
        model_blob = model_bucket.blob(model_path)
        logging.info("Uploading model to %s", gcs_path)
        model_blob.upload_from_filename(model_file)


def split_gcs_uri(gcs_uri):
    """Split a GCS URI into bucket and path."""
    GCS_REGEX = re.compile("gs://([^/]*)(/.*)?")
    m = GCS_REGEX.match(gcs_uri)
    bucket = m.group(1)
    path = ""
    if m.group(2):
        path = m.group(2).lstrip("/")
    return bucket, path
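For completeness, here is a minimal end-to-end sketch of how the training helpers above fit together. The module name `trainer`, the CSV path, the GCS output path, and the hyperparameter values are hypothetical placeholders; the file name of this module is not shown in this view.

```python
import logging

# Placeholder import for the module in the last diff above.
import trainer

logging.basicConfig(level=logging.INFO)

# Hypothetical local CSV with a SalePrice column (a gs:// URI also works,
# in which case read_input downloads it via split_gcs_uri first).
(train_X, train_y), (test_X, test_y) = trainer.read_input("train.csv")

# Hypothetical hyperparameters; early stopping uses the test split as eval_set.
model = trainer.train_model(train_X, train_y, test_X, test_y,
                            n_estimators=50, learning_rate=0.1)

# Log mean absolute error on the held-out split.
trainer.eval_model(model, test_X, test_y)

# Save locally or to GCS; the gs:// case uploads the joblib dump with the storage client.
trainer.save_model(model, "gs://my-bucket/xgboost-synthetic/model.dat")
```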