mirror of https://github.com/kubeflow/examples.git
Add kubecon demo to xgboost_ames_housing directory (#589)
* Add xgboost-ames-housing demo from Kubecon EU 2019.
* Fix links in the .ipynb in the xgboost-ames-housing demo.
* Update the xgboost demo example from Kubecon:
  - move the example to its own directory
  - remove unnecessary files
  - modify util and update the notebook
* Change the names related to Kubecon and update the README.
* Use fairing instead of our own fairing_util in the notebook.
* Remove fairing_util and move the remaining code into util instead.
* Update the synthetic data example per review comments:
  - generalize the YAML
  - remove the GitHub update procedures
  - update the README
  - rename files
* Fix pylint.
* Fix pylint.
parent 567998cb4e
commit ac9f2f1238
@@ -0,0 +1,7 @@
**/.build
**/.ipynb_checkpoints
**/.pipeline_build
**/__pycache__
*.zip
mlpipeline-metrics.json
mlpipeline-ui-metadata.json
@@ -0,0 +1,23 @@
# Create docker image for the demo
#
# This docker image is based on an existing notebook image.
# It also includes the dependencies required for training and deploying,
# so we can use it as the base image.
FROM gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0

USER root

COPY requirements.txt .
RUN pip3 --no-cache-dir install -r requirements.txt

RUN apt-get update -y
RUN apt-get install -y emacs

RUN pip3 install https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz

# Checkout kubeflow/testing because we use some of its utilities
RUN mkdir -p /src/kubeflow && \
    cd /src/kubeflow && \
    git clone https://github.com/kubeflow/testing.git testing

USER jovyan
@@ -0,0 +1,55 @@
# IMG is the base path for images.
# Individual images will be
# $(IMG)/$(NAME):$(TAG)

PROJECT ?= code-search-demo
IMG ?= gcr.io/${PROJECT}/xgboost-synthetic/notebook
GITOPS_IMAGE ?= gcr.io/${PROJECT}/xgboost-synthetic/gitops

# List any changed files. We only include files in the notebooks directory
# because that is the code in the docker image.
# In particular we exclude changes to the ksonnet configs.
CHANGED_FILES := $(shell git diff-files)

# Whether to use cached images with GCB
USE_IMAGE_CACHE ?= true

ifeq ($(strip $(CHANGED_FILES)),)
# Changed files is empty; not dirty
# Don't include --dirty because it could be dirty if files outside the ones we care
# about changed.
GIT_VERSION := $(shell git describe --always)
else
GIT_VERSION := $(shell git describe --always)-dirty-$(shell git diff | shasum -a256 | cut -c -6)
endif

TAG := $(shell date +v%Y%m%d)-$(GIT_VERSION)
all: build

# To build without the cache set the environment variable
# export DOCKER_BUILD_OPTS=--no-cache
.PHONY: build
build-dir: ./Dockerfile ./requirements.txt
	rm -rf ./.build
	mkdir -p ./.build
	cp ./requirements.txt ./.build/
	cp ./Dockerfile* ./.build/

build: build-dir
	cd .build && docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) -f Dockerfile . \
		--label=git-version=$(GIT_VERSION)
	docker tag $(IMG):$(TAG) $(IMG):latest
	@echo Built $(IMG):latest
	@echo Built $(IMG):$(TAG)

build-gcb: build-dir
	gcloud builds submit --machine-type=n1-highcpu-32 --project=$(PROJECT) --tag=$(IMG):$(TAG) \
		--timeout=3600 ./.build
	@echo Built $(IMG):$(TAG)

# Push the image with both the version tag and the latest tag. Building happens
# first, which allows manual testing/inspection of the image.
push: build
	docker push $(IMG):$(TAG)
	docker push $(IMG):latest
	@echo Pushed $(IMG):$(TAG)
@@ -0,0 +1,9 @@
# xgboost-synthetic
A Kubeflow Fairing and Kubeflow Pipelines demo using synthetic data.

1. Launch a notebook

   ```
   kubectl apply -f notebook.xgboost-synthetic.yaml
   ```
1. Attach an extra data volume named

File diff suppressed because it is too large
@@ -0,0 +1,30 @@
apiVersion: kubeflow.org/v1alpha1
kind: Notebook
metadata:
  labels:
    app: notebook
  name: xgboost-synthetic
  namespace: kubeflow
spec:
  template:
    spec:
      containers:
      - env: []
        image: gcr.io/kubeflow-images-public/tensorflow-1.12.0-notebook-cpu:v0.5.0
        name: tf-cpu
        resources:
          limits:
            cpu: 8
            memory: 16Gi
          requests:
            cpu: 1
            memory: 1Gi
        volumeMounts:
        - mountPath: /home/jovyan
          name: xgboost-synthetic
      serviceAccountName: jupyter-notebook
      ttlSecondsAfterFinished: 300
      volumes:
      - name: xgboost-synthetic
        persistentVolumeClaim:
          claimName: xgboost-synthetic
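The Notebook above mounts a PersistentVolumeClaim named `xgboost-synthetic`, which must exist in the `kubeflow` namespace before the notebook pod can start. The commit does not show a manifest for it here, so below is a minimal, hypothetical sketch of creating it with the Kubernetes Python client; the 10Gi size and reliance on the cluster's default StorageClass are assumptions, not part of the demo.

```python
# Hypothetical sketch: create the PVC referenced by claimName above.
# Assumes the default StorageClass and a 10Gi size; adjust as needed.
from kubernetes import client, config

config.load_kube_config()  # use config.load_incluster_config() inside the cluster

pvc = client.V1PersistentVolumeClaim(
    metadata=client.V1ObjectMeta(name="xgboost-synthetic", namespace="kubeflow"),
    spec=client.V1PersistentVolumeClaimSpec(
        access_modes=["ReadWriteOnce"],
        resources=client.V1ResourceRequirements(requests={"storage": "10Gi"}),
    ),
)

client.CoreV1Api().create_namespaced_persistent_volume_claim(
    namespace="kubeflow", body=pvc)
```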
@@ -0,0 +1,11 @@
fairing
fire
gitpython
google-cloud-storage
joblib
numpy
pandas
retrying
seldon-core
sklearn
xgboost
@@ -0,0 +1,45 @@
import logging
import os
import shutil
import subprocess
import json
import requests
from retrying import retry
import numpy as np

KFP_PACKAGE = 'https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz'

def notebook_setup():
    # Install the SDK
    subprocess.check_call(["pip3", "install", KFP_PACKAGE, "--upgrade"])

    logging.basicConfig(format='%(message)s')
    logging.getLogger().setLevel(logging.INFO)

    subprocess.check_call(["gcloud", "auth", "configure-docker", "--quiet"])
    subprocess.check_call(["gcloud", "auth", "activate-service-account",
                           "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS"),
                           "--quiet"])

def copy_data_to_nfs(nfs_path, model_dir):
    if not os.path.exists(nfs_path):
        shutil.copytree("ames_dataset", nfs_path)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

@retry(wait_exponential_multiplier=1000, wait_exponential_max=5000,
       stop_max_delay=2*60*1000)
def predict_nparray(url, data, feature_names=None):
    pdata = {
        "data": {
            "names": feature_names,
            "tensor": {
                "shape": np.asarray(data.shape).tolist(),
                "values": data.flatten().tolist(),
            },
        }
    }
    serialized_data = json.dumps(pdata)
    r = requests.post(url, data={'json': serialized_data}, timeout=5)
    return r
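For reference, a hedged usage sketch of these helpers. The module name `util`, the NFS mount paths, the Seldon/Ambassador prediction URL, and the feature array shape are all illustrative assumptions, not values defined by this commit.

```python
# Illustrative usage of the helpers above; module name, paths, endpoint URL,
# and feature shape are assumptions for the sketch.
import numpy as np
import util

util.notebook_setup()  # installs the KFP SDK and configures gcloud/docker auth
util.copy_data_to_nfs("/mnt/xgboost-synthetic/ames_dataset",
                      "/mnt/xgboost-synthetic/model")

features = np.random.rand(1, 10)  # one synthetic example with 10 features
response = util.predict_nparray(
    "http://ambassador:80/seldon/xgboost-synthetic/api/v0.1/predictions",
    features)
print(response.json())
```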
@@ -0,0 +1,111 @@
import logging
import re
from google.cloud import storage
import joblib
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import pandas as pd
from xgboost import XGBRegressor

def read_input(file_name, test_size=0.25):
    """Read input data and split it into train and test."""

    if file_name.startswith("gs://"):
        gcs_path = file_name
        train_bucket_name, train_path = split_gcs_uri(gcs_path)

        storage_client = storage.Client()
        train_bucket = storage_client.get_bucket(train_bucket_name)
        train_blob = train_bucket.blob(train_path)

        file_name = "/tmp/data.csv"
        train_blob.download_to_filename(file_name)

    data = pd.read_csv(file_name)
    data.dropna(axis=0, subset=['SalePrice'], inplace=True)

    y = data.SalePrice
    X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

    train_X, test_X, train_y, test_y = train_test_split(X.values,
                                                        y.values,
                                                        test_size=test_size,
                                                        shuffle=False)

    imputer = SimpleImputer()
    train_X = imputer.fit_transform(train_X)
    test_X = imputer.transform(test_X)

    return (train_X, train_y), (test_X, test_y)

def load_model(model_path):
    local_model_path = model_path

    if model_path.startswith("gs://"):
        gcs_path = model_path
        train_bucket_name, train_path = split_gcs_uri(gcs_path)

        storage_client = storage.Client()
        train_bucket = storage_client.get_bucket(train_bucket_name)
        train_blob = train_bucket.blob(train_path)

        local_model_path = "/tmp/model.dat"
        logging.info("Downloading model to %s", local_model_path)
        train_blob.download_to_filename(local_model_path)

    model = joblib.load(local_model_path)
    return model

def train_model(train_X,
                train_y,
                test_X,
                test_y,
                n_estimators,
                learning_rate):
    """Train the model using XGBRegressor."""
    model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate)

    model.fit(train_X,
              train_y,
              early_stopping_rounds=40,
              eval_set=[(test_X, test_y)])

    logging.info("Best RMSE on eval: %.2f with %d rounds",
                 model.best_score,
                 model.best_iteration+1)
    return model

def eval_model(model, test_X, test_y):
    """Evaluate the model performance."""
    predictions = model.predict(test_X)
    logging.info("mean_absolute_error=%.2f", mean_absolute_error(predictions, test_y))

def save_model(model, model_file):
    """Save XGBoost model for serving."""

    gcs_path = None
    if model_file.startswith("gs://"):
        gcs_path = model_file
        model_file = "/tmp/model.dat"
    joblib.dump(model, model_file)
    logging.info("Model export success: %s", model_file)

    if gcs_path:
        model_bucket_name, model_path = split_gcs_uri(gcs_path)
        storage_client = storage.Client()
        model_bucket = storage_client.get_bucket(model_bucket_name)
        model_blob = model_bucket.blob(model_path)

        logging.info("Uploading model to %s", gcs_path)
        model_blob.upload_from_filename(model_file)

def split_gcs_uri(gcs_uri):
    """Split a GCS URI into bucket and path."""
    GCS_REGEX = re.compile("gs://([^/]*)(/.*)?")
    m = GCS_REGEX.match(gcs_uri)
    bucket = m.group(1)
    path = ""
    if m.group(2):
        path = m.group(2).lstrip("/")
    return bucket, path
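To tie the training helpers together, here is a hedged end-to-end usage sketch. The import name `xgboost_util`, the CSV path, and the hyperparameter values are assumptions for illustration; only the function names and signatures come from the module above.

```python
# Illustrative train/eval/save flow using the module above; the module name,
# data path, and hyperparameters are assumptions.
import logging
import xgboost_util

logging.getLogger().setLevel(logging.INFO)

# Accepts a local path or a gs:// URI.
(train_X, train_y), (test_X, test_y) = xgboost_util.read_input(
    "ames_dataset/train.csv", test_size=0.25)

model = xgboost_util.train_model(train_X, train_y, test_X, test_y,
                                 n_estimators=50, learning_rate=0.1)
xgboost_util.eval_model(model, test_X, test_y)

# The model can be written locally or to GCS (gs://...) for serving.
xgboost_util.save_model(model, "model.dat")
```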