Merge pull request #932 from mrlihanbo/cluster_status_condition

add conditions for cluster status

commit ce7d278977
@@ -11,6 +11,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/labels"
@@ -37,11 +38,12 @@ const (
 	// ControllerName is the controller name that will be used when reporting events.
 	ControllerName = "cluster-status-controller"
 	clusterReady = "ClusterReady"
-	clusterHealthy = "cluster is reachable and health endpoint responded with ok"
+	clusterHealthy = "cluster is healthy and ready to accept workloads"
 	clusterNotReady = "ClusterNotReady"
 	clusterUnhealthy = "cluster is reachable but health endpoint responded without ok"
 	clusterNotReachableReason = "ClusterNotReachable"
 	clusterNotReachableMsg = "cluster is not reachable"
+	statusCollectionFailed = "StatusCollectionFailed"
 	// clusterStatusRetryInterval specifies the interval between two retries.
 	clusterStatusRetryInterval = 500 * time.Millisecond
 	// clusterStatusRetryTimeout specifies the maximum time to wait for cluster status.
@@ -111,15 +113,15 @@ func (c *ClusterStatusController) SetupWithManager(mgr controllerruntime.Manager
 }
 
 func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Cluster) (controllerruntime.Result, error) {
+	var currentClusterStatus = clusterv1alpha1.ClusterStatus{}
+
 	// create a ClusterClient for the given member cluster
 	clusterClient, err := c.ClusterClientSetFunc(cluster.Name, c.Client, c.ClusterClientOption)
 	if err != nil {
 		klog.Errorf("Failed to create a ClusterClient for the given member cluster: %v, err is : %v", cluster.Name, err)
-		return controllerruntime.Result{Requeue: true}, err
+		return c.setStatusCollectionFailedCondition(cluster, currentClusterStatus, fmt.Sprintf("failed to create a ClusterClient: %v", err))
 	}
 
-	var currentClusterStatus = clusterv1alpha1.ClusterStatus{}
-
 	var online, healthy bool
 	// in case of cluster offline, retry a few times to avoid network unstable problems.
 	// Note: retry timeout should not be too long, otherwise will block other cluster reconcile.
@@ -134,9 +136,10 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 	// error indicates that retry timeout, update cluster status immediately and return.
 	if err != nil {
 		klog.V(2).Infof("Cluster(%s) still offline after retry, ensuring offline is set.", cluster.Name)
-		currentClusterStatus.Conditions = generateReadyCondition(false, false)
-		setTransitionTime(&cluster.Status, &currentClusterStatus)
 		c.InformerManager.Stop(cluster.Name)
+		readyCondition := generateReadyCondition(false, false)
+		setTransitionTime(cluster.Status.Conditions, &readyCondition)
+		meta.SetStatusCondition(&currentClusterStatus.Conditions, readyCondition)
 		return c.updateStatusIfNeeded(cluster, currentClusterStatus)
 	}
 
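Note on the offline branch above: the old code overwrote the whole Conditions slice, while the new code builds a single Ready condition and merges it with meta.SetStatusCondition, which updates an existing condition of the same Type in place and only refreshes LastTransitionTime when the status actually flips. A standalone sketch of those merge semantics (the "Ready" string is a stand-in for clusterv1alpha1.ClusterConditionReady):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	var conditions []metav1.Condition

	// First reconcile: the cluster is offline, so Ready=False is appended.
	meta.SetStatusCondition(&conditions, metav1.Condition{
		Type:    "Ready", // stand-in for clusterv1alpha1.ClusterConditionReady
		Status:  metav1.ConditionFalse,
		Reason:  "ClusterNotReachable",
		Message: "cluster is not reachable",
	})
	fmt.Println(len(conditions)) // 1

	// Later reconcile: the cluster came back. The same Type is updated in
	// place (still one entry) and LastTransitionTime is refreshed because
	// the status flipped from False to True.
	meta.SetStatusCondition(&conditions, metav1.Condition{
		Type:    "Ready",
		Status:  metav1.ConditionTrue,
		Reason:  "ClusterReady",
		Message: "cluster is healthy and ready to accept workloads",
	})
	fmt.Println(len(conditions), conditions[0].Status) // 1 True
}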
@@ -144,7 +147,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 	clusterInformerManager, err := c.buildInformerForCluster(cluster)
 	if err != nil {
 		klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
-		return controllerruntime.Result{Requeue: true}, err
+		return c.setStatusCollectionFailedCondition(cluster, currentClusterStatus, fmt.Sprintf("failed to get or create informer: %v", err))
 	}
 
 	// init the lease controller for every cluster
@@ -152,36 +155,41 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 
 	clusterVersion, err := getKubernetesVersion(clusterClient)
 	if err != nil {
-		klog.Errorf("Failed to get server version of the member cluster: %v, err is : %v", cluster.Name, err)
-		return controllerruntime.Result{Requeue: true}, err
+		return c.setStatusCollectionFailedCondition(cluster, currentClusterStatus, fmt.Sprintf("failed to get kubernetes version: %v", err))
 	}
 
 	// get the list of APIs installed in the member cluster
 	apiEnables, err := getAPIEnablements(clusterClient)
 	if err != nil {
-		klog.Errorf("Failed to get APIs installed in the member cluster: %v, err is : %v", cluster.Name, err)
-		return controllerruntime.Result{Requeue: true}, err
+		return c.setStatusCollectionFailedCondition(cluster, currentClusterStatus, fmt.Sprintf("failed to get the list of APIs installed in the member cluster: %v", err))
 	}
 
 	nodes, err := listNodes(clusterInformerManager)
 	if err != nil {
-		klog.Errorf("Failed to list nodes of cluster(%s), err: %v", cluster.Name, err)
-		return controllerruntime.Result{Requeue: true}, err
+		return c.setStatusCollectionFailedCondition(cluster, currentClusterStatus, fmt.Sprintf("failed to list nodes: %v", err))
 	}
 
 	pods, err := listPods(clusterInformerManager)
 	if err != nil {
-		klog.Errorf("Failed to list pods of cluster(%s), err: %v", cluster.Name, err)
-		return controllerruntime.Result{Requeue: true}, err
+		return c.setStatusCollectionFailedCondition(cluster, currentClusterStatus, fmt.Sprintf("failed to list pods: %v", err))
 	}
 
-	currentClusterStatus.Conditions = generateReadyCondition(online, healthy)
-	setTransitionTime(&cluster.Status, &currentClusterStatus)
 	currentClusterStatus.KubernetesVersion = clusterVersion
 	currentClusterStatus.APIEnablements = apiEnables
 	currentClusterStatus.NodeSummary = getNodeSummary(nodes)
 	currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
 
+	readyCondition := generateReadyCondition(online, healthy)
+	setTransitionTime(cluster.Status.Conditions, &readyCondition)
+	meta.SetStatusCondition(&currentClusterStatus.Conditions, readyCondition)
+
+	return c.updateStatusIfNeeded(cluster, currentClusterStatus)
+}
+
+func (c *ClusterStatusController) setStatusCollectionFailedCondition(cluster *clusterv1alpha1.Cluster, currentClusterStatus clusterv1alpha1.ClusterStatus, message string) (controllerruntime.Result, error) {
+	readyCondition := util.NewCondition(clusterv1alpha1.ClusterConditionReady, statusCollectionFailed, message, metav1.ConditionFalse)
+	setTransitionTime(cluster.Status.Conditions, &readyCondition)
+	meta.SetStatusCondition(&currentClusterStatus.Conditions, readyCondition)
 	return c.updateStatusIfNeeded(cluster, currentClusterStatus)
 }
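The new setStatusCollectionFailedCondition helper gives every collection error the same shape: Ready=False with the shared reason StatusCollectionFailed and the wrapped error as the message, so consumers can tell "unreachable" apart from "reachable but status collection failed". A minimal sketch of the condition a caller ends up with (newCondition is a local copy of the util.NewCondition helper added by this PR; the error value is made up):

package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// newCondition mirrors util.NewCondition from the new pkg/util file below.
func newCondition(conditionType, reason, message string, status metav1.ConditionStatus) metav1.Condition {
	return metav1.Condition{Type: conditionType, Reason: reason, Status: status, Message: message}
}

func main() {
	err := fmt.Errorf("connection refused")
	cond := newCondition("Ready", "StatusCollectionFailed",
		fmt.Sprintf("failed to get kubernetes version: %v", err), metav1.ConditionFalse)
	fmt.Printf("%s=%s (%s): %s\n", cond.Type, cond.Status, cond.Reason, cond.Message)
	// Ready=False (StatusCollectionFailed): failed to get kubernetes version: connection refused
}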
@@ -304,54 +312,22 @@ func healthEndpointCheck(client *clientset.Clientset, path string) (int, error)
 	return healthStatus, resp.Error()
 }
 
-func generateReadyCondition(online, healthy bool) []metav1.Condition {
-	var conditions []metav1.Condition
-	currentTime := metav1.Now()
-
-	newClusterOfflineCondition := metav1.Condition{
-		Type:               clusterv1alpha1.ClusterConditionReady,
-		Status:             metav1.ConditionFalse,
-		Reason:             clusterNotReachableReason,
-		Message:            clusterNotReachableMsg,
-		LastTransitionTime: currentTime,
-	}
-
-	newClusterReadyCondition := metav1.Condition{
-		Type:               clusterv1alpha1.ClusterConditionReady,
-		Status:             metav1.ConditionTrue,
-		Reason:             clusterReady,
-		Message:            clusterHealthy,
-		LastTransitionTime: currentTime,
-	}
-
-	newClusterNotReadyCondition := metav1.Condition{
-		Type:               clusterv1alpha1.ClusterConditionReady,
-		Status:             metav1.ConditionFalse,
-		Reason:             clusterNotReady,
-		Message:            clusterUnhealthy,
-		LastTransitionTime: currentTime,
-	}
-
+func generateReadyCondition(online, healthy bool) metav1.Condition {
 	if !online {
-		conditions = append(conditions, newClusterOfflineCondition)
-	} else {
-		if !healthy {
-			conditions = append(conditions, newClusterNotReadyCondition)
-		} else {
-			conditions = append(conditions, newClusterReadyCondition)
-		}
+		return util.NewCondition(clusterv1alpha1.ClusterConditionReady, clusterNotReachableReason, clusterNotReachableMsg, metav1.ConditionFalse)
+	}
+	if !healthy {
+		return util.NewCondition(clusterv1alpha1.ClusterConditionReady, clusterNotReady, clusterUnhealthy, metav1.ConditionFalse)
 	}
 
-	return conditions
+	return util.NewCondition(clusterv1alpha1.ClusterConditionReady, clusterReady, clusterHealthy, metav1.ConditionTrue)
 }
 
-func setTransitionTime(oldClusterStatus, newClusterStatus *clusterv1alpha1.ClusterStatus) {
-	// preserve the last transition time if the status of member cluster not changed
-	if util.IsClusterReady(oldClusterStatus) == util.IsClusterReady(newClusterStatus) {
-		if len(oldClusterStatus.Conditions) != 0 {
-			for i := 0; i < len(newClusterStatus.Conditions); i++ {
-				newClusterStatus.Conditions[i].LastTransitionTime = oldClusterStatus.Conditions[0].LastTransitionTime
-			}
-		}
-	}
-}
+func setTransitionTime(existingConditions []metav1.Condition, newCondition *metav1.Condition) {
+	// preserve the last transition time if the status of given condition not changed
+	if existingCondition := meta.FindStatusCondition(existingConditions, newCondition.Type); existingCondition != nil {
+		if existingCondition.Status == newCondition.Status {
+			newCondition.LastTransitionTime = existingCondition.LastTransitionTime
+		}
+	}
+}
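The rewritten setTransitionTime keys off a single condition looked up by Type, instead of comparing whole statuses through util.IsClusterReady and copying one timestamp across every condition. A runnable sketch of the preservation behaviour, using only apimachinery helpers (the "Ready" string and the timestamps are illustrative):

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// setTransitionTime is reproduced from the diff above so the sketch runs on
// its own; "Ready" stands in for clusterv1alpha1.ClusterConditionReady.
func setTransitionTime(existingConditions []metav1.Condition, newCondition *metav1.Condition) {
	if existingCondition := meta.FindStatusCondition(existingConditions, newCondition.Type); existingCondition != nil {
		if existingCondition.Status == newCondition.Status {
			newCondition.LastTransitionTime = existingCondition.LastTransitionTime
		}
	}
}

func main() {
	tenMinAgo := metav1.NewTime(time.Now().Add(-10 * time.Minute))
	existing := []metav1.Condition{{
		Type:               "Ready",
		Status:             metav1.ConditionFalse,
		Reason:             "ClusterNotReachable",
		Message:            "cluster is not reachable",
		LastTransitionTime: tenMinAgo,
	}}

	// Same status as last reconcile: the old transition time is kept.
	unchanged := metav1.Condition{Type: "Ready", Status: metav1.ConditionFalse, Reason: "ClusterNotReachable"}
	setTransitionTime(existing, &unchanged)
	fmt.Println(unchanged.LastTransitionTime.Equal(&tenMinAgo)) // true

	// Status flipped: the transition time stays zero here and is stamped
	// fresh later by meta.SetStatusCondition.
	flipped := metav1.Condition{Type: "Ready", Status: metav1.ConditionTrue, Reason: "ClusterReady"}
	setTransitionTime(existing, &flipped)
	fmt.Println(flipped.LastTransitionTime.IsZero()) // true
}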
@@ -0,0 +1,13 @@
+package util
+
+import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+// NewCondition returns a new condition object.
+func NewCondition(conditionType, reason, message string, status metav1.ConditionStatus) metav1.Condition {
+	return metav1.Condition{
+		Type:    conditionType,
+		Reason:  reason,
+		Status:  status,
+		Message: message,
+	}
+}
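Worth noting about this new helper: it leaves LastTransitionTime unset, which appears deliberate given the flow above. setTransitionTime backfills the old stamp when the status is unchanged, and meta.SetStatusCondition stamps the current time when it inserts a genuinely new condition. A small sketch of that second case (field values copied from the constants above; "Ready" again stands in for clusterv1alpha1.ClusterConditionReady):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Built the way util.NewCondition builds it: no LastTransitionTime.
	cond := metav1.Condition{
		Type:    "Ready",
		Reason:  "ClusterReady",
		Status:  metav1.ConditionTrue,
		Message: "cluster is healthy and ready to accept workloads",
	}
	fmt.Println(cond.LastTransitionTime.IsZero()) // true

	// On insert, meta.SetStatusCondition fills the zero timestamp with now.
	var conditions []metav1.Condition
	meta.SetStatusCondition(&conditions, cond)
	fmt.Println(conditions[0].LastTransitionTime.IsZero()) // false
}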