decision-forests/examples/hyperparameter_optimization.py

# Copyright 2021 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Example of automated hyper-parameter tuning with TensorFlow Decision Forests.

This example trains, displays, evaluates and export a Gradient Boosted Tree
model.

Usage example:

  pip3 install tensorflow_decision_forests -U
  python3 hyperparameter_optimization.py

Or

  bazel run -c opt \
  //tensorflow_decision_forests/examples:hyperparameter_optimization
  \
  -- --alsologtostderr
"""

from absl import app

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import tf_keras

def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  # Download the Adult dataset.
  dataset_path = tf_keras.utils.get_file(
      "adult.csv",
      "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/"
      "main/yggdrasil_decision_forests/test_data/dataset/adult.csv")

  # Load a dataset into a Pandas Dataframe.
  dataset_df = pd.read_csv(dataset_path)  # "df" for Pandas's DataFrame.

  print("First the first three examples:")
  print(dataset_df.head(3))

  # Notice that the dataset contains a mix of numerical and categorical
  # features. TensorFlow Decision Forests handles them automatically (e.g. no
  # need for one-hot encoding or normalization; except for the label).

  # Split the dataset into a training and a testing dataset.
  test_indices = np.random.rand(len(dataset_df)) < 0.30
  test_ds_pd = dataset_df[test_indices]
  train_ds_pd = dataset_df[~test_indices]
  print(f"{len(train_ds_pd)} examples in training"
        f", {len(test_ds_pd)} examples for testing.")

  # Converts datasets from Pandas dataframe to TensorFlow dataset format.
  train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label="income")
  test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label="income")

  # Tune the model.
  #
  # The hyper-parameters to optimize are automatically set with
  # "use_predefined_hps=True". See
  # https://www.tensorflow.org/decision_forests/tutorials/automatic_tuning_colab
  # for an example where the hyper-parameter space is configured manually.
  tuner = tfdf.tuner.RandomSearch(num_trials=30, use_predefined_hps=True)
  model = tfdf.keras.GradientBoostedTreesModel(verbose=2, tuner=tuner)
  model.fit(train_ds)

  # Some information about the model.
  print(model.summary())

  # Evaluates the model on the test dataset.
  model.compile(metrics=["accuracy"])
  evaluation = model.evaluate(test_ds)
  print(f"BinaryCrossentropyloss: {evaluation[0]}")
  print(f"Accuracy: {evaluation[1]}")

  # Exports the model to disk in the SavedModel format for later re-use. This
  # model can be used with TensorFlow Serving and Yggdrasil Decision Forests
  # (https://ydf.readthedocs.io/en/latest/serving_apis.html).
  model.save("/tmp/my_saved_model")


if __name__ == "__main__":
  app.run(main)