# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Conditional Gradient."""
import numpy as np
import pytest
import platform
import tensorflow as tf
from tensorflow_addons.utils import test_utils
from tensorflow_addons.optimizers import conditional_gradient as cg_lib
def _dtypes_to_test(use_gpu):
# Based on issue #347 in the following link,
# "https://github.com/tensorflow/addons/issues/347"
# tf.half is not registered for 'ResourceScatterUpdate' OpKernel
# for 'GPU' devices.
# So we have to remove tf.half when testing with gpu.
# The function "_DtypesToTest" is from
# "https://github.com/tensorflow/tensorflow/blob/5d4a6cee737a1dc6c20172a1dc1
# 5df10def2df72/tensorflow/python/kernel_tests/conv_ops_3d_test.py#L53-L62"
#
# Update cpu to use tf.half once issue in TF2.4 is fixed: https://github.com/tensorflow/tensorflow/issues/45136
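    # Note: until that fix lands, both branches below intentionally return
    # the same dtype list.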
if use_gpu:
return [tf.float32, tf.float64]
else:
return [tf.float32, tf.float64]
def _dtypes_with_checking_system(use_gpu, system):
# Based on issue #36764 in the following link,
# "https://github.com/tensorflow/tensorflow/issues/36764"
# tf.half is not registered for tf.linalg.svd function on Windows
# CPU version.
# So we have to remove tf.half when testing with Windows CPU version.
if system == "Windows":
return [tf.float32, tf.float64]
else:
return _dtypes_to_test(use_gpu)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_like_dist_belief_nuclear_cg01():
db_grad, db_out = _db_params_nuclear_cg01()
num_samples = len(db_grad)
var0 = tf.Variable([0.0] * num_samples)
grads0 = tf.constant([0.0] * num_samples)
ord = "nuclear"
cg_opt = cg_lib.ConditionalGradient(learning_rate=0.1, lambda_=0.1, ord=ord)
for i in range(num_samples):
grads0 = tf.constant(db_grad[i])
cg_opt.apply_gradients(zip([grads0], [var0]))
np.testing.assert_allclose(
np.array(db_out[i]), var0.numpy(), rtol=1e-6, atol=1e-6
)
@pytest.mark.with_device(["cpu", "gpu"])
@pytest.mark.parametrize("dtype", [tf.float16, tf.float32, tf.float64])
def test_minimize_sparse_resource_variable_frobenius(dtype, device):
if "gpu" in device and dtype == tf.float16:
pytest.xfail("See https://github.com/tensorflow/addons/issues/347")
var0 = tf.Variable([[1.0, 2.0]], dtype=dtype)
def loss():
x = tf.constant([[4.0], [5.0]], dtype=dtype)
pred = tf.matmul(tf.nn.embedding_lookup([var0], [0]), x)
return pred * pred
# the gradient based on the current loss function
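    # With var0 = [[1.0, 2.0]] and x = [[4.0], [5.0]], pred = 1*4 + 2*5 = 14,
    # and d(pred**2)/d(var0) = 2 * pred * x.T = [[112, 140]], which the
    # hand-expanded terms below reproduce (32*1 + 40*2 = 112, 40*1 + 50*2 = 140).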
grads0_0 = 32 * 1.0 + 40 * 2.0
grads0_1 = 40 * 1.0 + 50 * 2.0
grads0 = tf.constant([[grads0_0, grads0_1]], dtype=dtype)
norm0 = tf.math.reduce_sum(grads0**2) ** 0.5
learning_rate = 0.1
lambda_ = 0.1
ord = "fro"
opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = opt.minimize(loss, var_list=[var0])
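    # The expected values below apply the update encoded in this test,
    # w <- lr * w - (1 - lr) * lambda_ * g / ||g||_F, once to [1.0, 2.0].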
test_utils.assert_allclose_according_to_type(
[
[
1.0 * learning_rate - (1 - learning_rate) * lambda_ * grads0_0 / norm0,
2.0 * learning_rate - (1 - learning_rate) * lambda_ * grads0_1 / norm0,
]
],
var0.numpy(),
)
@pytest.mark.parametrize("dtype", [(tf.half, 0), (tf.float32, 1), (tf.float64, 2)])
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
@pytest.mark.parametrize("use_resource", [True, False])
def test_basic_frobenius(dtype, use_resource):
if use_resource:
var0 = tf.Variable([1.0, 2.0], dtype=dtype[0], name="var0_%d" % dtype[1])
        var1 = tf.Variable([3.0, 4.0], dtype=dtype[0], name="var1_%d" % dtype[1])
else:
var0 = tf.Variable([1.0, 2.0], dtype=dtype[0])
var1 = tf.Variable([3.0, 4.0], dtype=dtype[0])
grads0 = tf.constant([0.1, 0.1], dtype=dtype[0])
grads1 = tf.constant([0.01, 0.01], dtype=dtype[0])
norm0 = tf.math.reduce_sum(grads0**2) ** 0.5
norm1 = tf.math.reduce_sum(grads1**2) ** 0.5
def learning_rate():
return 0.5
def lambda_():
return 0.01
ord = "fro"
cg_opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check we have slots
assert ["conditional_gradient"] == cg_opt.get_slot_names()
slot0 = cg_opt.get_slot(var0, "conditional_gradient")
assert slot0.get_shape() == var0.get_shape()
slot1 = cg_opt.get_slot(var1, "conditional_gradient")
assert slot1.get_shape() == var1.get_shape()
test_utils.assert_allclose_according_to_type(
np.array(
[
1.0 * 0.5 - (1 - 0.5) * 0.01 * 0.1 / norm0,
2.0 * 0.5 - (1 - 0.5) * 0.01 * 0.1 / norm0,
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
3.0 * 0.5 - (1 - 0.5) * 0.01 * 0.01 / norm1,
4.0 * 0.5 - (1 - 0.5) * 0.01 * 0.01 / norm1,
]
),
var1.numpy(),
)
    # Step 2: the conditional_gradient update contains the previous update.
cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
test_utils.assert_allclose_according_to_type(
np.array(
[
(1.0 * 0.5 - (1 - 0.5) * 0.01 * 0.1 / norm0) * 0.5
- (1 - 0.5) * 0.01 * 0.1 / norm0,
(2.0 * 0.5 - (1 - 0.5) * 0.01 * 0.1 / norm0) * 0.5
- (1 - 0.5) * 0.01 * 0.1 / norm0,
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
(3.0 * 0.5 - (1 - 0.5) * 0.01 * 0.01 / norm1) * 0.5
- (1 - 0.5) * 0.01 * 0.01 / norm1,
(4.0 * 0.5 - (1 - 0.5) * 0.01 * 0.01 / norm1) * 0.5
- (1 - 0.5) * 0.01 * 0.01 / norm1,
]
),
var1.numpy(),
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
@pytest.mark.parametrize("use_resource", [True, False])
def test_basic_nuclear(use_resource):
    # TODO: Remove the dtype filtering workaround once issue #36764 is resolved.
for i, dtype in enumerate(
_dtypes_with_checking_system(
use_gpu=test_utils.is_gpu_available(), system=platform.system()
)
):
if use_resource:
var0 = tf.Variable([1.0, 2.0], dtype=dtype, name="var0_%d" % i)
var1 = tf.Variable([3.0, 4.0], dtype=dtype, name="var1_%d" % i)
else:
var0 = tf.Variable([1.0, 2.0], dtype=dtype)
var1 = tf.Variable([3.0, 4.0], dtype=dtype)
grads0 = tf.constant([0.1, 0.1], dtype=dtype)
grads1 = tf.constant([0.01, 0.01], dtype=dtype)
top_singular_vector0 = cg_lib.ConditionalGradient._top_singular_vector(grads0)
top_singular_vector1 = cg_lib.ConditionalGradient._top_singular_vector(grads1)
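        # For ord="nuclear" the expected values below replace g / ||g||_F with
        # the direction returned by _top_singular_vector(g) (built from the
        # gradient's top singular vectors): w <- lr * w - (1 - lr) * lambda_ * s.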
def learning_rate():
return 0.5
def lambda_():
return 0.01
ord = "nuclear"
cg_opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check we have slots
assert ["conditional_gradient"] == cg_opt.get_slot_names()
slot0 = cg_opt.get_slot(var0, "conditional_gradient")
assert slot0.get_shape() == var0.get_shape()
slot1 = cg_opt.get_slot(var1, "conditional_gradient")
assert slot1.get_shape() == var1.get_shape()
test_utils.assert_allclose_according_to_type(
np.array(
[
1.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector0[0],
2.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector0[1],
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
3.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector1[0],
4.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector1[1],
]
),
var1.numpy(),
)
        # Step 2: the conditional_gradient update contains the previous update.
cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
test_utils.assert_allclose_according_to_type(
np.array(
[
(1.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector0[0]) * 0.5
- (1 - 0.5) * 0.01 * top_singular_vector0[0],
(2.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector0[1]) * 0.5
- (1 - 0.5) * 0.01 * top_singular_vector0[1],
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
                    (3.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector1[0]) * 0.5
                    - (1 - 0.5) * 0.01 * top_singular_vector1[0],
                    (4.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector1[1]) * 0.5
                    - (1 - 0.5) * 0.01 * top_singular_vector1[1],
]
),
var1.numpy(),
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_minimize_sparse_resource_variable_nuclear():
    # TODO: Remove the dtype filtering workaround once issues #347 and #36764
    # are resolved.
for dtype in _dtypes_with_checking_system(
use_gpu=test_utils.is_gpu_available(), system=platform.system()
):
var0 = tf.Variable([[1.0, 2.0]], dtype=dtype)
def loss():
x = tf.constant([[4.0], [5.0]], dtype=dtype)
pred = tf.matmul(tf.nn.embedding_lookup([var0], [0]), x)
return pred * pred
# the gradient based on the current loss function
grads0_0 = 32 * 1.0 + 40 * 2.0
grads0_1 = 40 * 1.0 + 50 * 2.0
grads0 = tf.constant([[grads0_0, grads0_1]], dtype=dtype)
top_singular_vector0 = cg_lib.ConditionalGradient._top_singular_vector(grads0)
learning_rate = 0.1
lambda_ = 0.1
ord = "nuclear"
opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = opt.minimize(loss, var_list=[var0])
# Validate updated params
test_utils.assert_allclose_according_to_type(
[
[
1.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector0[0][0],
2.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector0[0][1],
]
],
var0.numpy(),
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_tensor_learning_rate_and_conditional_gradient_nuclear():
for dtype in _dtypes_with_checking_system(
use_gpu=test_utils.is_gpu_available(), system=platform.system()
):
        # TODO: Re-enable tf.half once issue #36764
        # ("https://github.com/tensorflow/tensorflow/issues/36764") is fixed;
        # tf.half is not registered for tf.linalg.svd on the Windows CPU
        # build, so it is excluded when testing on Windows CPU.
var0 = tf.Variable([1.0, 2.0], dtype=dtype)
var1 = tf.Variable([3.0, 4.0], dtype=dtype)
grads0 = tf.constant([0.1, 0.1], dtype=dtype)
grads1 = tf.constant([0.01, 0.01], dtype=dtype)
top_singular_vector0 = cg_lib.ConditionalGradient._top_singular_vector(grads0)
top_singular_vector1 = cg_lib.ConditionalGradient._top_singular_vector(grads1)
ord = "nuclear"
cg_opt = cg_lib.ConditionalGradient(
learning_rate=tf.constant(0.5), lambda_=tf.constant(0.01), ord=ord
)
_ = cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check we have slots
assert ["conditional_gradient"] == cg_opt.get_slot_names()
slot0 = cg_opt.get_slot(var0, "conditional_gradient")
assert slot0.get_shape() == var0.get_shape()
slot1 = cg_opt.get_slot(var1, "conditional_gradient")
assert slot1.get_shape() == var1.get_shape()
# Check that the parameters have been updated.
test_utils.assert_allclose_according_to_type(
np.array(
[
1.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector0[0],
2.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector0[1],
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
3.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector1[0],
4.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector1[1],
]
),
var1.numpy(),
)
        # Step 2: the conditional_gradient update contains the
        # previous update.
cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check that the parameters have been updated.
test_utils.assert_allclose_according_to_type(
np.array(
[
(1.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector0[0]) * 0.5
- (1 - 0.5) * 0.01 * top_singular_vector0[0],
(2.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector0[1]) * 0.5
- (1 - 0.5) * 0.01 * top_singular_vector0[1],
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
(3.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector1[0]) * 0.5
- (1 - 0.5) * 0.01 * top_singular_vector1[0],
(4.0 * 0.5 - (1 - 0.5) * 0.01 * top_singular_vector1[1]) * 0.5
- (1 - 0.5) * 0.01 * top_singular_vector1[1],
]
),
var1.numpy(),
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_variables_across_graphs_frobenius():
optimizer = cg_lib.ConditionalGradient(0.01, 0.5, ord="fro")
var0 = tf.Variable([1.0, 2.0], dtype=tf.float32, name="var0")
var1 = tf.Variable([3.0, 4.0], dtype=tf.float32, name="var1")
def loss():
return tf.math.reduce_sum(var0 + var1)
optimizer.minimize(loss, var_list=[var0, var1])
optimizer_variables = optimizer.variables()
    # There should be three items: the first is the iteration counter,
    # followed by one slot variable for each model variable.
assert optimizer_variables[1].name.startswith("ConditionalGradient/var0")
assert optimizer_variables[2].name.startswith("ConditionalGradient/var1")
assert 3 == len(optimizer_variables)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_variables_across_graphs_nuclear():
optimizer = cg_lib.ConditionalGradient(0.01, 0.5, ord="nuclear")
var0 = tf.Variable([1.0, 2.0], dtype=tf.float32, name="var0")
var1 = tf.Variable([3.0, 4.0], dtype=tf.float32, name="var1")
def loss():
return tf.math.reduce_sum(var0 + var1)
optimizer.minimize(loss, var_list=[var0, var1])
optimizer_variables = optimizer.variables()
    # There should be three items: the first is the iteration counter,
    # followed by one slot variable for each model variable.
assert optimizer_variables[1].name.startswith("ConditionalGradient/var0")
assert optimizer_variables[2].name.startswith("ConditionalGradient/var1")
assert 3 == len(optimizer_variables)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_minimize_with_2D_indices_for_embedding_lookup_frobenius():
# This test invokes the ResourceSparseApplyConditionalGradient
# operation.
var0 = tf.Variable(tf.ones([2, 2]))
def loss():
return tf.math.reduce_sum(tf.nn.embedding_lookup(var0, [[1]]))
# the gradient for this loss function:
grads0 = tf.constant([[0, 0], [1, 1]], dtype=tf.float32)
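    # reduce_sum over the row-1 embedding yields a gradient of 1 for each
    # element of row 1 and 0 for row 0, so only row 1 is expected to change.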
norm0 = tf.math.reduce_sum(grads0**2) ** 0.5
learning_rate = 0.1
lambda_ = 0.1
ord = "fro"
opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = opt.minimize(loss, var_list=[var0])
# Run 1 step of cg_op
test_utils.assert_allclose_according_to_type(
[
[1, 1],
[
learning_rate * 1 - (1 - learning_rate) * lambda_ * 1 / norm0,
learning_rate * 1 - (1 - learning_rate) * lambda_ * 1 / norm0,
],
],
var0.numpy(),
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_minimize_with_2D_indices_for_embedding_lookup_nuclear():
# This test invokes the ResourceSparseApplyConditionalGradient
# operation.
var0 = tf.Variable(tf.ones([2, 2]))
def loss():
return tf.math.reduce_sum(tf.nn.embedding_lookup(var0, [[1]]))
# the gradient for this loss function:
grads0 = tf.constant([[0, 0], [1, 1]], dtype=tf.float32)
top_singular_vector0 = cg_lib.ConditionalGradient._top_singular_vector(grads0)
learning_rate = 0.1
lambda_ = 0.1
ord = "nuclear"
opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = opt.minimize(loss, var_list=[var0])
# Run 1 step of cg_op
test_utils.assert_allclose_according_to_type(
[
learning_rate * 1
- (1 - learning_rate) * lambda_ * top_singular_vector0[1][0],
learning_rate * 1
- (1 - learning_rate) * lambda_ * top_singular_vector0[1][1],
],
var0[1],
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
@pytest.mark.parametrize("dtype", [tf.half, tf.float32, tf.float64])
def test_tensor_learning_rate_and_conditional_gradient_frobenius(dtype):
var0 = tf.Variable([1.0, 2.0], dtype=dtype)
var1 = tf.Variable([3.0, 4.0], dtype=dtype)
grads0 = tf.constant([0.1, 0.1], dtype=dtype)
grads1 = tf.constant([0.01, 0.01], dtype=dtype)
norm0 = tf.math.reduce_sum(grads0**2) ** 0.5
norm1 = tf.math.reduce_sum(grads1**2) ** 0.5
ord = "fro"
cg_opt = cg_lib.ConditionalGradient(
learning_rate=tf.constant(0.5), lambda_=tf.constant(0.01), ord=ord
)
_ = cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check we have slots
assert ["conditional_gradient"] == cg_opt.get_slot_names()
slot0 = cg_opt.get_slot(var0, "conditional_gradient")
assert slot0.get_shape() == var0.get_shape()
slot1 = cg_opt.get_slot(var1, "conditional_gradient")
assert slot1.get_shape() == var1.get_shape()
# Check that the parameters have been updated.
test_utils.assert_allclose_according_to_type(
np.array(
[
1.0 * 0.5 - (1 - 0.5) * 0.01 * 0.1 / norm0,
2.0 * 0.5 - (1 - 0.5) * 0.01 * 0.1 / norm0,
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
3.0 * 0.5 - (1 - 0.5) * 0.01 * 0.01 / norm1,
4.0 * 0.5 - (1 - 0.5) * 0.01 * 0.01 / norm1,
]
),
var1.numpy(),
)
    # Step 2: the conditional_gradient update contains the
    # previous update.
cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check that the parameters have been updated.
test_utils.assert_allclose_according_to_type(
np.array(
[
(1.0 * 0.5 - (1 - 0.5) * 0.01 * 0.1 / norm0) * 0.5
- (1 - 0.5) * 0.01 * 0.1 / norm0,
(2.0 * 0.5 - (1 - 0.5) * 0.01 * 0.1 / norm0) * 0.5
- (1 - 0.5) * 0.01 * 0.1 / norm0,
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
(3.0 * 0.5 - (1 - 0.5) * 0.01 * 0.01 / norm1) * 0.5
- (1 - 0.5) * 0.01 * 0.01 / norm1,
(4.0 * 0.5 - (1 - 0.5) * 0.01 * 0.01 / norm1) * 0.5
- (1 - 0.5) * 0.01 * 0.01 / norm1,
]
),
var1.numpy(),
)
def _db_params_frobenius_cg01():
"""Return dist-belief conditional_gradient values.
Return values been generated from the dist-belief
conditional_gradient unittest, running with a learning rate of 0.1
and a lambda_ of 0.1.
These values record how a parameter vector of size 10, initialized
with 0.0, gets updated with 10 consecutive conditional_gradient
steps.
It uses random gradients.
Returns:
db_grad: The gradients to apply
db_out: The parameters after the conditional_gradient update.
"""
db_grad = [[]] * 10
db_out = [[]] * 10
db_grad[0] = [
0.00096264342,
0.17914793,
0.93945462,
0.41396621,
0.53037018,
0.93197989,
0.78648776,
0.50036013,
0.55345792,
0.96722615,
]
db_out[0] = [
-4.1555551e-05,
-7.7334875e-03,
-4.0554531e-02,
-1.7870162e-02,
-2.2895107e-02,
-4.0231861e-02,
-3.3951234e-02,
-2.1599628e-02,
-2.3891762e-02,
-4.1753378e-02,
]
db_grad[1] = [
0.17075552,
0.88821375,
0.20873757,
0.25236958,
0.57578111,
0.15312378,
0.5513742,
0.94687688,
0.16012503,
0.22159521,
]
db_out[1] = [
-0.00961733,
-0.0507779,
-0.01580694,
-0.01599489,
-0.03470477,
-0.01264373,
-0.03443632,
-0.05546713,
-0.01140388,
-0.01665068,
]
db_grad[2] = [
0.35077485,
0.47304362,
0.44412705,
0.44368884,
0.078527533,
0.81223965,
0.31168157,
0.43203235,
0.16792089,
0.24644311,
]
db_out[2] = [
-0.02462724,
-0.03699233,
-0.03154434,
-0.03153357,
-0.00876844,
-0.05606323,
-0.02447166,
-0.03469437,
-0.0124694,
-0.01829169,
]
db_grad[3] = [
0.9694621,
0.75035888,
0.28171822,
0.83813518,
0.53807181,
0.3728098,
0.81454384,
0.03848977,
0.89759839,
0.93665648,
]
db_out[3] = [
-0.04124615,
-0.03371741,
-0.0144246,
-0.03668303,
-0.02240246,
-0.02052062,
-0.03503307,
-0.00500922,
-0.03715545,
-0.0393002,
]
db_grad[4] = [
0.38578293,
0.8536852,
0.88722926,
0.66276771,
0.13678469,
0.94036359,
0.69107032,
0.81897682,
0.5433259,
0.67860287,
]
db_out[4] = [
-0.01979208,
-0.0380417,
-0.03747472,
-0.0305847,
-0.00779536,
-0.04024222,
-0.03156913,
-0.0337613,
-0.02578116,
-0.03148952,
]
db_grad[5] = [
0.27885768,
0.76100707,
0.24625534,
0.81354135,
0.18959245,
0.48038563,
0.84163809,
0.41172323,
0.83259648,
0.44941229,
]
db_out[5] = [
-0.01555188,
-0.04084422,
-0.01573331,
-0.04265549,
-0.01000746,
-0.02740575,
-0.04412147,
-0.02341569,
-0.0431026,
-0.02502293,
]
db_grad[6] = [
0.27233034,
0.056316052,
0.5039115,
0.24105175,
0.35697976,
0.75913221,
0.73577434,
0.16014607,
0.57500273,
0.071136251,
]
db_out[6] = [
-0.01890448,
-0.00767214,
-0.03367592,
-0.01962219,
-0.02374279,
-0.05110247,
-0.05128598,
-0.01254396,
-0.04094185,
-0.00703416,
]
db_grad[7] = [
0.58697265,
0.2494842,
0.08106143,
0.39954534,
0.15892942,
0.12683646,
0.74053431,
0.16033,
0.66625422,
0.73515922,
]
db_out[7] = [
-0.03772914,
-0.01599993,
-0.00831695,
-0.02635719,
-0.01207801,
-0.01285448,
-0.05034328,
-0.01104364,
-0.04477356,
-0.04558991,
]
db_grad[8] = [
0.8215279,
0.41994119,
0.95172721,
0.68000203,
0.79439718,
0.43384039,
0.55561525,
0.22567581,
0.93331909,
0.29438227,
]
db_out[8] = [
-0.03919835,
-0.01970845,
-0.04187151,
-0.03195836,
-0.03546333,
-0.01999326,
-0.02899324,
-0.01083582,
-0.04472339,
-0.01725317,
]
db_grad[9] = [
0.68297005,
0.67758518,
0.1748755,
0.13266537,
0.70697063,
0.055731893,
0.68593478,
0.50580865,
0.12602448,
0.093537711,
]
db_out[9] = [
-0.04510314,
-0.04282944,
-0.0147322,
-0.0111956,
-0.04617687,
-0.00535998,
-0.0442614,
-0.03158399,
-0.01207165,
-0.00736567,
]
return db_grad, db_out
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_like_dist_belief_frobenius_cg01():
db_grad, db_out = _db_params_frobenius_cg01()
num_samples = len(db_grad)
var0 = tf.Variable([0.0] * num_samples)
grads0 = tf.constant([0.0] * num_samples)
ord = "fro"
cg_opt = cg_lib.ConditionalGradient(learning_rate=0.1, lambda_=0.1, ord=ord)
for i in range(num_samples):
grads0 = tf.constant(db_grad[i])
cg_opt.apply_gradients(zip([grads0], [var0]))
np.testing.assert_allclose(
np.array(db_out[i]), var0.numpy(), rtol=1e-06, atol=1e-06
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_sparse_frobenius():
    # TODO: Remove the dtype filtering workaround once issue #347 is resolved.
for dtype in _dtypes_to_test(use_gpu=test_utils.is_gpu_available()):
var0 = tf.Variable(tf.zeros([4, 2], dtype=dtype))
var1 = tf.Variable(tf.constant(1.0, dtype, [4, 2]))
grads0 = tf.IndexedSlices(
tf.constant([[0.1, 0.1]], dtype=dtype),
tf.constant([1]),
tf.constant([4, 2]),
)
grads1 = tf.IndexedSlices(
tf.constant([[0.01, 0.01], [0.01, 0.01]], dtype=dtype),
tf.constant([2, 3]),
tf.constant([4, 2]),
)
norm0 = tf.math.reduce_sum(tf.math.multiply(grads0, grads0)) ** 0.5
norm1 = tf.math.reduce_sum(tf.math.multiply(grads1, grads1)) ** 0.5
learning_rate = 0.1
lambda_ = 0.1
ord = "fro"
cg_opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check we have slots
assert ["conditional_gradient"] == cg_opt.get_slot_names()
slot0 = cg_opt.get_slot(var0, "conditional_gradient")
assert slot0.get_shape() == var0.get_shape()
slot1 = cg_opt.get_slot(var1, "conditional_gradient")
assert slot1.get_shape() == var1.get_shape()
# Check that the parameters have been updated.
test_utils.assert_allclose_according_to_type(
np.array(
[
0 - (1 - learning_rate) * lambda_ * 0 / norm0,
0 - (1 - learning_rate) * lambda_ * 0 / norm0,
]
),
var0[0].numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
0 - (1 - learning_rate) * lambda_ * 0.1 / norm0,
0 - (1 - learning_rate) * lambda_ * 0.1 / norm0,
]
),
var0[1].numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
1.0 * learning_rate - (1 - learning_rate) * lambda_ * 0.01 / norm1,
1.0 * learning_rate - (1 - learning_rate) * lambda_ * 0.01 / norm1,
]
),
var1[2].numpy(),
)
        # Step 2: the conditional_gradient update contains the
        # previous update.
cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check that the parameters have been updated.
np.testing.assert_allclose(np.array([0, 0]), var0[0].numpy())
test_utils.assert_allclose_according_to_type(
np.array(
[
(0 - (1 - learning_rate) * lambda_ * 0.1 / norm0) * learning_rate
- (1 - learning_rate) * lambda_ * 0.1 / norm0,
(0 - (1 - learning_rate) * lambda_ * 0.1 / norm0) * learning_rate
- (1 - learning_rate) * lambda_ * 0.1 / norm0,
]
),
var0[1].numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
(1.0 * learning_rate - (1 - learning_rate) * lambda_ * 0.01 / norm1)
* learning_rate
- (1 - learning_rate) * lambda_ * 0.01 / norm1,
(1.0 * learning_rate - (1 - learning_rate) * lambda_ * 0.01 / norm1)
* learning_rate
- (1 - learning_rate) * lambda_ * 0.01 / norm1,
]
),
var1[2].numpy(),
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
@pytest.mark.parametrize("dtype", [tf.half, tf.float32, tf.float64])
def test_sharing_frobenius(dtype):
var0 = tf.Variable([1.0, 2.0], dtype=dtype)
var1 = tf.Variable([3.0, 4.0], dtype=dtype)
grads0 = tf.constant([0.1, 0.1], dtype=dtype)
grads1 = tf.constant([0.01, 0.01], dtype=dtype)
norm0 = tf.math.reduce_sum(grads0**2) ** 0.5
norm1 = tf.math.reduce_sum(grads1**2) ** 0.5
learning_rate = 0.1
lambda_ = 0.1
ord = "fro"
cg_opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check we have slots
assert ["conditional_gradient"] == cg_opt.get_slot_names()
slot0 = cg_opt.get_slot(var0, "conditional_gradient")
assert slot0.get_shape() == var0.get_shape()
slot1 = cg_opt.get_slot(var1, "conditional_gradient")
assert slot1.get_shape() == var1.get_shape()
    # In eager mode the apply_gradients call above has already executed,
    # so the parameter values after the first step cannot be checked here.
    # Only the final values after the second step are verified.
    # Step 2: the second conditional_gradient update contains the
    # previous update.
    # Check that the parameters have been updated.
cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
test_utils.assert_allclose_according_to_type(
np.array(
[
(1.0 * learning_rate - (1 - learning_rate) * lambda_ * 0.1 / norm0)
* learning_rate
- (1 - learning_rate) * lambda_ * 0.1 / norm0,
(2.0 * learning_rate - (1 - learning_rate) * lambda_ * 0.1 / norm0)
* learning_rate
- (1 - learning_rate) * lambda_ * 0.1 / norm0,
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
(3.0 * learning_rate - (1 - learning_rate) * lambda_ * 0.01 / norm1)
* learning_rate
- (1 - learning_rate) * lambda_ * 0.01 / norm1,
(4.0 * learning_rate - (1 - learning_rate) * lambda_ * 0.01 / norm1)
* learning_rate
- (1 - learning_rate) * lambda_ * 0.01 / norm1,
]
),
var1.numpy(),
)
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_sharing_nuclear():
    # TODO: Remove the dtype filtering workaround once issue #36764 is resolved.
for dtype in _dtypes_with_checking_system(
use_gpu=test_utils.is_gpu_available(), system=platform.system()
):
var0 = tf.Variable([1.0, 2.0], dtype=dtype)
var1 = tf.Variable([3.0, 4.0], dtype=dtype)
grads0 = tf.constant([0.1, 0.1], dtype=dtype)
grads1 = tf.constant([0.01, 0.01], dtype=dtype)
top_singular_vector0 = cg_lib.ConditionalGradient._top_singular_vector(grads0)
top_singular_vector1 = cg_lib.ConditionalGradient._top_singular_vector(grads1)
learning_rate = 0.1
lambda_ = 0.1
ord = "nuclear"
cg_opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check we have slots
assert ["conditional_gradient"] == cg_opt.get_slot_names()
slot0 = cg_opt.get_slot(var0, "conditional_gradient")
assert slot0.get_shape() == var0.get_shape()
slot1 = cg_opt.get_slot(var1, "conditional_gradient")
assert slot1.get_shape() == var1.get_shape()
        # In eager mode the apply_gradients call above has already executed,
        # so the parameter values after the first step cannot be checked here.
        # Only the final values after the second step are verified.
        # Step 2: the second conditional_gradient update contains the
        # previous update.
        # Check that the parameters have been updated.
cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
test_utils.assert_allclose_according_to_type(
np.array(
[
(
1.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector0[0]
)
* learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector0[0],
(
2.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector0[1]
)
* learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector0[1],
]
),
var0.numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
(
3.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[0]
)
* learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[0],
(
4.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[1]
)
* learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[1],
]
),
var1.numpy(),
)
def _db_params_nuclear_cg01():
"""Return dist-belief conditional_gradient values.
Return values been generated from the dist-belief
conditional_gradient unittest, running with a learning rate of 0.1
and a lambda_ of 0.1.
These values record how a parameter vector of size 10, initialized
with 0.0, gets updated with 10 consecutive conditional_gradient
steps.
It uses random gradients.
Returns:
db_grad: The gradients to apply
db_out: The parameters after the conditional_gradient update.
"""
db_grad = [[]] * 10
db_out = [[]] * 10
db_grad[0] = [
0.00096264342,
0.17914793,
0.93945462,
0.41396621,
0.53037018,
0.93197989,
0.78648776,
0.50036013,
0.55345792,
0.96722615,
]
db_out[0] = [
-4.1552783e-05,
-7.7334875e-03,
-4.0554535e-02,
-1.7870164e-02,
-2.2895109e-02,
-4.0231861e-02,
-3.3951234e-02,
-2.1599628e-02,
-2.3891764e-02,
-4.1753381e-02,
]
db_grad[1] = [
0.17075552,
0.88821375,
0.20873757,
0.25236958,
0.57578111,
0.15312378,
0.5513742,
0.94687688,
0.16012503,
0.22159521,
]
db_out[1] = [
-0.00961733,
-0.0507779,
-0.01580694,
-0.01599489,
-0.03470477,
-0.01264373,
-0.03443632,
-0.05546713,
-0.01140388,
-0.01665068,
]
db_grad[2] = [
0.35077485,
0.47304362,
0.44412705,
0.44368884,
0.078527533,
0.81223965,
0.31168157,
0.43203235,
0.16792089,
0.24644311,
]
db_out[2] = [
-0.02462724,
-0.03699233,
-0.03154433,
-0.03153357,
-0.00876844,
-0.05606324,
-0.02447166,
-0.03469437,
-0.0124694,
-0.01829169,
]
db_grad[3] = [
0.9694621,
0.75035888,
0.28171822,
0.83813518,
0.53807181,
0.3728098,
0.81454384,
0.03848977,
0.89759839,
0.93665648,
]
db_out[3] = [
-0.04124615,
-0.03371741,
-0.0144246,
-0.03668303,
-0.02240246,
-0.02052062,
-0.03503307,
-0.00500922,
-0.03715545,
-0.0393002,
]
db_grad[4] = [
0.38578293,
0.8536852,
0.88722926,
0.66276771,
0.13678469,
0.94036359,
0.69107032,
0.81897682,
0.5433259,
0.67860287,
]
db_out[4] = [
-0.01979207,
-0.0380417,
-0.03747472,
-0.0305847,
-0.00779536,
-0.04024221,
-0.03156913,
-0.0337613,
-0.02578116,
-0.03148951,
]
db_grad[5] = [
0.27885768,
0.76100707,
0.24625534,
0.81354135,
0.18959245,
0.48038563,
0.84163809,
0.41172323,
0.83259648,
0.44941229,
]
db_out[5] = [
-0.01555188,
-0.04084422,
-0.01573331,
-0.04265549,
-0.01000746,
-0.02740575,
-0.04412147,
-0.02341569,
-0.0431026,
-0.02502293,
]
db_grad[6] = [
0.27233034,
0.056316052,
0.5039115,
0.24105175,
0.35697976,
0.75913221,
0.73577434,
0.16014607,
0.57500273,
0.071136251,
]
db_out[6] = [
-0.01890448,
-0.00767214,
-0.03367592,
-0.01962219,
-0.02374278,
-0.05110246,
-0.05128598,
-0.01254396,
-0.04094184,
-0.00703416,
]
db_grad[7] = [
0.58697265,
0.2494842,
0.08106143,
0.39954534,
0.15892942,
0.12683646,
0.74053431,
0.16033,
0.66625422,
0.73515922,
]
db_out[7] = [
-0.03772915,
-0.01599993,
-0.00831695,
-0.0263572,
-0.01207801,
-0.01285448,
-0.05034329,
-0.01104364,
-0.04477356,
-0.04558992,
]
db_grad[8] = [
0.8215279,
0.41994119,
0.95172721,
0.68000203,
0.79439718,
0.43384039,
0.55561525,
0.22567581,
0.93331909,
0.29438227,
]
db_out[8] = [
-0.03919835,
-0.01970845,
-0.04187151,
-0.03195836,
-0.03546333,
-0.01999326,
-0.02899324,
-0.01083582,
-0.04472339,
-0.01725317,
]
db_grad[9] = [
0.68297005,
0.67758518,
0.1748755,
0.13266537,
0.70697063,
0.055731893,
0.68593478,
0.50580865,
0.12602448,
0.093537711,
]
db_out[9] = [
-0.04510314,
-0.04282944,
-0.0147322,
-0.0111956,
-0.04617687,
-0.00535998,
-0.0442614,
-0.031584,
-0.01207165,
-0.00736567,
]
return db_grad, db_out
@pytest.mark.usefixtures("maybe_run_functions_eagerly")
def test_sparse_nuclear():
    # TODO: Remove the dtype filtering workaround once issues #347 and #36764
    # are resolved.
for dtype in _dtypes_with_checking_system(
use_gpu=test_utils.is_gpu_available(), system=platform.system()
):
var0 = tf.Variable(tf.zeros([4, 2], dtype=dtype))
var1 = tf.Variable(tf.constant(1.0, dtype, [4, 2]))
grads0 = tf.IndexedSlices(
tf.constant([[0.1, 0.1]], dtype=dtype),
tf.constant([1]),
tf.constant([4, 2]),
)
grads1 = tf.IndexedSlices(
tf.constant([[0.01, 0.01], [0.01, 0.01]], dtype=dtype),
tf.constant([2, 3]),
tf.constant([4, 2]),
)
top_singular_vector0 = tf.constant(
[[0.0, 0.0], [0.7071067, 0.7071067], [0.0, 0.0], [0.0, 0.0]], dtype=dtype
)
top_singular_vector1 = tf.constant(
[
[-4.2146844e-08, -4.2146844e-08],
[0.0000000e00, 0.0000000e00],
[4.9999994e-01, 4.9999994e-01],
[4.9999994e-01, 4.9999994e-01],
],
dtype=dtype,
)
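        # These hard-coded tensors are presumably the precomputed rank-1
        # singular directions (u1 @ v1^T) of the densified gradients above;
        # e.g. for grads0 the dense matrix is [[0, 0], [0.1, 0.1], [0, 0], [0, 0]],
        # whose top singular direction is [[0, 0], [0.7071, 0.7071], [0, 0], [0, 0]].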
learning_rate = 0.1
lambda_ = 0.1
ord = "nuclear"
cg_opt = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
_ = cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check we have slots
assert ["conditional_gradient"] == cg_opt.get_slot_names()
slot0 = cg_opt.get_slot(var0, "conditional_gradient")
assert slot0.get_shape() == var0.get_shape()
slot1 = cg_opt.get_slot(var1, "conditional_gradient")
assert slot1.get_shape() == var1.get_shape()
# Check that the parameters have been updated.
test_utils.assert_allclose_according_to_type(
np.array(
[
0 - (1 - learning_rate) * lambda_ * top_singular_vector0[0][0],
0 - (1 - learning_rate) * lambda_ * top_singular_vector0[0][1],
]
),
var0[0].numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
0 - (1 - learning_rate) * lambda_ * top_singular_vector0[1][0],
0 - (1 - learning_rate) * lambda_ * top_singular_vector0[1][1],
]
),
var0[1].numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
1.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[2][0],
1.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[2][1],
]
),
var1[2].numpy(),
)
        # Step 2: the conditional_gradient update contains the
        # previous update.
cg_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
# Check that the parameters have been updated.
np.testing.assert_allclose(np.array([0, 0]), var0[0].numpy())
test_utils.assert_allclose_according_to_type(
np.array(
[
(0 - (1 - learning_rate) * lambda_ * top_singular_vector0[1][0])
* learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector0[1][0],
(0 - (1 - learning_rate) * lambda_ * top_singular_vector0[1][1])
* learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector0[1][1],
]
),
var0[1].numpy(),
)
test_utils.assert_allclose_according_to_type(
np.array(
[
(
1.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[2][0]
)
* learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[2][0],
(
1.0 * learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[2][1]
)
* learning_rate
- (1 - learning_rate) * lambda_ * top_singular_vector1[2][1],
]
),
var1[2].numpy(),
)
def test_serialization():
learning_rate = 0.1
lambda_ = 0.1
ord = "nuclear"
optimizer = cg_lib.ConditionalGradient(
learning_rate=learning_rate, lambda_=lambda_, ord=ord
)
config = tf.keras.optimizers.serialize(optimizer)
new_optimizer = tf.keras.optimizers.deserialize(config)
assert optimizer.get_config() == new_optimizer.get_config()