OCI provider based on instance-pools and instance-configurations.

This commit is contained in:
jesse.millan 2021-08-02 16:46:52 -07:00
parent d3784072e4
commit 65ebd8db83
No known key found for this signature in database
GPG Key ID: 9DD407A06EAC48A6
22 changed files with 3101 additions and 4 deletions

View File

@ -25,6 +25,7 @@ You should also take a look at the notes and "gotchas" for your specific cloud p
* [IonosCloud](./cloudprovider/ionoscloud/README.md)
* [OVHcloud](./cloudprovider/ovhcloud/README.md)
* [Linode](./cloudprovider/linode/README.md)
* [OracleCloud](./cloudprovider/oci/README.md)
* [ClusterAPI](./cloudprovider/clusterapi/README.md)
* [BizflyCloud](./cloudprovider/bizflycloud/README.md)
@ -165,5 +166,6 @@ Supported cloud providers:
* Equinix Metal https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/packet/README.md
* OVHcloud https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/ovhcloud/README.md
* Linode https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/linode/README.md
* OCI https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/oci/README.md
* Hetzner https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/hetzner/README.md
* Cluster API https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/clusterapi/README.md

View File

@ -1,5 +1,5 @@
//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet
// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet
//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet && !oci
// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet,!oci
/*
Copyright 2018 The Kubernetes Authors.
@ -37,6 +37,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/ionoscloud"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/linode"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/magnum"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/ovhcloud"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/packet"
"k8s.io/autoscaler/cluster-autoscaler/config"
@ -55,6 +56,7 @@ var AvailableCloudProviders = []string{
cloudprovider.ExoscaleProviderName,
cloudprovider.HuaweicloudProviderName,
cloudprovider.HetznerProviderName,
cloudprovider.OracleCloudProviderName,
cloudprovider.OVHcloudProviderName,
cloudprovider.ClusterAPIProviderName,
cloudprovider.IonoscloudProviderName,
@ -105,6 +107,8 @@ func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGro
return ionoscloud.BuildIonosCloud(opts, do, rl)
case cloudprovider.LinodeProviderName:
return linode.BuildLinode(opts, do, rl)
case cloudprovider.OracleCloudProviderName:
return oci.BuildOCI(opts, do, rl)
}
return nil
}

View File

@ -0,0 +1,43 @@
//go:build oci
// +build oci
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package builder
import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci"
"k8s.io/autoscaler/cluster-autoscaler/config"
)
// AvailableCloudProviders supported by the cloud provider builder.
var AvailableCloudProviders = []string{
cloudprovider.OracleCloudProviderName,
}
// DefaultCloudProvider for oci-only build is oci.
const DefaultCloudProvider = cloudprovider.OracleCloudProviderName
func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider {
switch opts.CloudProviderName {
case cloudprovider.OracleCloudProviderName:
return oci.BuildOCI(opts, do, rl)
}
return nil
}

View File

@ -60,6 +60,8 @@ const (
HuaweicloudProviderName = "huaweicloud"
// IonoscloudProviderName gets the provider name of ionoscloud
IonoscloudProviderName = "ionoscloud"
// OracleCloudProviderName gets the provider name of oci
OracleCloudProviderName = "oci"
// OVHcloudProviderName gets the provider name of ovhcloud
OVHcloudProviderName = "ovhcloud"
// LinodeProviderName gets the provider name of linode

View File

@ -0,0 +1,5 @@
approvers:
#- jlamillan
reviewers:
#- jlamillan
#- ericrrath

View File

@ -0,0 +1,220 @@
# Cluster Autoscaler for Oracle Cloud Infrastructure (OCI)
On OCI, the cluster-autoscaler utilizes [Instance Pools](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstancepool.htm)
combined with [Instance Configurations](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstanceconfig.htm) to
automatically resize a cluster's nodes based on application workload demands by:
- adding nodes to static instance-pool(s) when pods fail to schedule because the cluster has insufficient resources.
- removing nodes from instance-pool(s) when they have been underutilized for an extended time and their pods can be placed on other existing nodes.
The cluster-autoscaler works on a per-instance pool basis. You configure the cluster-autoscaler to tell it which instance pools to target
for expansion and contraction, the minimum and maximum sizes for each pool, and how you want the autoscaling to take place.
Instance pools not referenced in the configuration file are not managed by the cluster-autoscaler.
## Create Required OCI Resources
### IAM Policy (if using Instance Principals)
We recommend setting up and configuring the cluster-autoscaler to use
[Instance Principals](https://docs.oracle.com/en-us/iaas/Content/Identity/Tasks/callingservicesfrominstances.htm)
to authenticate to the OCI APIs.
The following policy provides the minimum privileges necessary for Cluster Autoscaler to run:
1: Create a compartment-level dynamic group containing the nodes (compute instances) in the cluster:
```
All {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf'}
```
2: Create a *tenancy-level* policy to allow nodes to manage instance-pools:
```
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-pools in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-configurations in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-family in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to use subnets in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to read virtual-network-family in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to use vnics in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to inspect compartments in compartment <compartment-name>
```
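If you prefer the OCI CLI to the Console for these IAM steps, a minimal sketch might look like the following; the dynamic-group and policy names are placeholders taken from the statements above, and `<tenancy-ocid>` / `<compartment-name>` must be replaced with your own values:
```bash
# Create the dynamic group matching the cluster's compute instances (rule from step 1).
oci iam dynamic-group create \
  --name acme-oci-cluster-autoscaler-dyn-grp \
  --description "cluster-autoscaler nodes" \
  --matching-rule "All {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf'}"

# Create the tenancy-level policy (extend the statements list with every statement from step 2).
oci iam policy create \
  --compartment-id <tenancy-ocid> \
  --name acme-oci-cluster-autoscaler-policy \
  --description "Allow cluster-autoscaler nodes to manage instance-pools" \
  --statements '["Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-pools in compartment <compartment-name>"]'
```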
### Instance Pool and Instance Configurations
Before you deploy the cluster-autoscaler on OCI, you need to create one or more static Instance Pools and an Instance
Configuration with `cloud-init` specified in the launch details so that new nodes automatically join the existing cluster on
startup.
Advanced Instance Pool and Instance Configuration setup is out of scope for this document. However, a
working [instance-details.json](./examples/instance-details.json) and [placement-config.json](./examples/placement-config.json)
(based on Rancher [RKE](https://rancher.com/products/rke/) and using [cloud-init](https://cloudinit.readthedocs.io/en/latest/)) are
included in the examples and can be applied using the [OCI CLI](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/cliinstall.htm).
Modify the `user_data` in the example [instance-details.json](./examples/instance-details.json) to suit your needs, re-encode it as base64, and apply:
```bash
# e.g. cloud-init. Modify, re-encode, and update user_data in instance-details.json to suit your needs:
$ echo IyEvYmluL2Jhc2gKdG91hci9saWIvYXB0L....1yZXRyeSAzIGhG91Y2ggL3RtcC9jbG91ZC1pbml0LWZpbmlzaGVkCg== | base64 -D
#!/bin/bash
groupadd docker
usermod -aG docker ubuntu
curl --retry 3 https://releases.rancher.com/install-docker/20.10.sh | sh
docker run -d --privileged --restart=unless-stopped --net=host -v /etc/kubernetes:/etc/kubernetes -v /var/run:/var/run rancher/rancher-agent:v2.5.5 --server https://my-rancher.com --token xxxxxx --worker
```
```bash
$ oci compute-management instance-configuration create --instance-details file://./cluster-autoscaler/cloudprovider/oci/examples/instance-details.json --compartment-id ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf --query 'data.id' --raw-output
ocid1.instanceconfiguration.oc1.phx.aaaaaaaa3neul67zb3goz43lybosc2o3fv67gj3zazexbb3vfcbypmpznhtq
$ oci compute-management instance-pool create --compartment-id ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf --instance-configuration-id ocid1.instanceconfiguration.oc1.phx.aaaaaaaa3neul67zb3goz43lybosc2o3fv67gj3zazexbb3vfcbypmpznhtq --placement-configurations file://./cluster-autoscaler/cloudprovider/oci/examples/placement-config.json --size 0 --wait-for-state RUNNING --query 'data.id' --raw-output
Action completed. Waiting until the resource has entered state: ('RUNNING',)
ocid1.instancepool.oc1.phx.aaaaaaaayd5bxwrzomzr2b2enchm4mof7uhw7do5hc2afkhks576syikk2ca
```
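Optionally, you can confirm the new (empty) instance pool is in the `RUNNING` state before handing it to the autoscaler; a sketch using the same OCI CLI setup and the pool OCID returned above:
```bash
oci compute-management instance-pool get \
  --instance-pool-id ocid1.instancepool.oc1.phx.aaaaaaaayd5bxwrzomzr2b2enchm4mof7uhw7do5hc2afkhks576syikk2ca \
  --query 'data."lifecycle-state"' --raw-output
```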
## Configure Autoscaler
Use the `--nodes=<min-nodes>:<max-nodes>:<instancepool-ocid>` parameter (shown in the example below the list) to specify which pre-existing instance
pools to target for automatic expansion and contraction, along with the minimum and maximum size of each pool, where:
- `<min-nodes>` is the minimum number of nodes allowed in the instance-pool.
- `<max-nodes>` is the maximum number of nodes allowed in the instance-pool. Make sure the maximum number of nodes you specify does not exceed the tenancy limits for the shape defined in the instance-pool.
- `<instancepool-ocid>` is the OCID of a pre-existing instance-pool.
Instance pools not referenced by a `--nodes` parameter are not managed by the autoscaler.
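For example, the following invocation registers two of the instance pools used elsewhere in this document, allowing the first to scale between 1 and 10 nodes and the second between 0 and 20 (substitute the OCIDs of your own pools):
```bash
./cluster-autoscaler \
  --cloud-provider=oci \
  --nodes=1:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q \
  --nodes=0:20:ocid1.instancepool.oc1.phx.aaaaaaaal3jhoc32ljsfaeif4x2ssfa2a63oehjgqryiueivieee6yaqbkia
```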
If you are authenticating via instance principals, be sure the `OCI_REGION` environment variable is set to the correct
value in the deployment e.g.:
```yaml
env:
- name: OCI_REGION
value: "us-phoenix-1"
```
### Optional cloud-config file
An _optional_ cloud-config file can be mounted at the path specified by `--cloud-config`.
Below is an example of passing optional configuration via a `cloud-config` file that configures the cluster-autoscaler to
authenticate via instance principals and to only manage the configured instance-pools in a single compartment:
```ini
[Global]
compartment-id = ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf
region = uk-london-1
use-instance-principals = true
```
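The file itself is passed to the autoscaler through its standard `--cloud-config` flag; a sketch, assuming the file is mounted at the hypothetical path `/etc/oci/cloud-config`:
```bash
./cluster-autoscaler \
  --cloud-provider=oci \
  --cloud-config=/etc/oci/cloud-config \
  --nodes=1:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
```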
### Environment variables
Configuration via environment-variables:
- `OCI_USE_INSTANCE_PRINCIPAL` - Whether to use Instance Principals for authentication rather than expecting an OCI config file to be mounted in the container. Defaults to false.
- `OCI_REGION` - **Required** when using Instance Principals. e.g. `OCI_REGION=us-phoenix-1`. See [region list](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/regions.htm) for identifiers.
- `OCI_COMPARTMENT_ID` - Restrict the cluster-autoscaler to instance-pools in a single compartment. When unset, the cluster-autoscaler manages each specified instance-pool regardless of the compartment it is in.
- `OCI_REFRESH_INTERVAL` - Optional interval at which the internal cache is synced with the OCI API. Defaults to `5m`.
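These variables are normally set in the deployment manifest (see the examples below), but as a quick sketch you could also patch an already-deployed autoscaler with `kubectl set env`, assuming the deployment name and namespace used in the example manifests:
```bash
kubectl -n kube-system set env deployment/cluster-autoscaler \
  OCI_USE_INSTANCE_PRINCIPAL=true \
  OCI_REGION=us-phoenix-1 \
  OCI_REFRESH_INTERVAL=5m
```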
## Deployment
### Create OCI config secret (only if _not_ using Instance Principals)
If you are opting for a file based OCI configuration (as opposed to instance principals), the OCI config file and private key need to be mounted into the container filesystem using a secret volume.
The following policy is required to run the cluster-autoscaler when the specified user is not an administrator:
```
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-pools in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-configurations in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-family in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to use subnets in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to read virtual-network-family in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to use vnics in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to inspect compartments in compartment <compartment-name>
```
Example OCI config file (note `key_file` is the expected path and filename of the OCI API private-key from the perspective of the container):
```bash
$ cat ~/.oci/config
[DEFAULT]
user=ocid1.user.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
fingerprint=xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx
key_file=/root/.oci/api_key.pem
tenancy=ocid1.tenancy.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
pass_phrase=
region=us-phoenix-1
```
Create the secret (`api_key.pem` key name is required):
```bash
kubectl create secret generic oci-config -n kube-system --from-file=/Users/me/.oci/config --from-file=api_key.pem=/Users/me/.oci/my_api_key.pem
```
### Example Deployment
Two example deployments of the cluster-autoscaler that manage instance-pools are located in the [examples](./examples/) directory.
[oci-ip-cluster-autoscaler-w-principals.yaml](./examples/oci-ip-cluster-autoscaler-w-principals.yaml) uses
instance principals, and [oci-ip-cluster-autoscaler-w-config.yaml](./examples/oci-ip-cluster-autoscaler-w-config.yaml) uses file-based
authentication.
Note that the three specified instance-pools are intended to correspond to different availability domains in the Phoenix (`us-phoenix-1`) region:
```yaml
...
containers:
- image: docker.io/jlamillan/autoscaler:oci-pr-rc6
name: cluster-autoscaler
command:
- ./cluster-autoscaler
- --cloud-provider=oci
- --nodes=1:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaazldzcu4mi5spz56upbtwnsynz2nk6jvmx7zi4hsta4uggxbulbua
- --nodes=0:20:ocid1.instancepool.oc1.phx.aaaaaaaal3jhoc32ljsfaeif4x2ssfa2a63oehjgqryiueivieee6yaqbkia
```
Instance principal based authentication deployment:
Substitute the OCIDs of _your_ instance pool(s) and set the `OCI_REGION` environment variable to the region where your
instance pool(s) reside before applying the deployment:
```
kubectl apply -f ./cloudprovider/oci/examples/oci-ip-cluster-autoscaler-w-principals.yaml
```
OCI config file based authentication deployment:
```
kubectl apply -f ./cloudprovider/oci/examples/oci-ip-cluster-autoscaler-w-config.yaml
```
## Common Notes and Gotchas:
- You must configure the Instance Configuration of new compute instances so that they join the existing cluster when they start. This can
be accomplished with `cloud-init` / `user-data` in the instance launch configuration [example](./examples/instance-details.json).
- If opting for a file based OCI configuration (as opposed to instance principals), ensure the OCI config and private-key
PEM files are mounted into the container filesystem at the [expected path](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/sdkconfig.htm). Note the `key_file` option in the example `~/.oci/config` above references a private-key file mounted into the container by the example [volumeMount](./examples/oci-ip-cluster-autoscaler-w-config.yaml#L165).
- Make sure the maximum number of nodes you specify does not exceed the limit for the instance-pool or the tenancy.
- We recommend creating multiple instance-pools with one availability domain specified so new nodes can be created to meet
affinity requirements across availability domains.
- If you are authenticating via instance principals, be sure the `OCI_REGION` environment variable is set to the correct
value in the deployment.
- The cluster-autoscaler will not automatically remove scaled down (terminated) `Node` objects from the Kubernetes API
without assistance from the [OCI Cloud Controller Manager](https://github.com/oracle/oci-cloud-controller-manager) (CCM).
If scaled down nodes are lingering in your cluster in the `NotReady` status, ensure the OCI CCM is installed and running
correctly (`oci-cloud-controller-manager`).
- Avoid manually changing node pools that are managed by the cluster-autoscaler. For example, do not add or remove nodes
using kubectl, or using the Console (or the Oracle Cloud Infrastructure CLI or API).
- `--node-group-auto-discovery` and `--node-autoprovisioning-enabled=true` are not supported.
- We set a `nvidia.com/gpu:NoSchedule` taint on nodes in a GPU-enabled instance-pool.
## Helpful links
- [Oracle Cloud Infrastructure home](https://cloud.oracle.com)
- [OCI instance configuration documentation](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstanceconfig.htm)
- [instance principals](https://docs.oracle.com/en-us/iaas/Content/Identity/Tasks/callingservicesfrominstances.htm)
- [OCI Cloud Controller Manager](https://github.com/oracle/oci-cloud-controller-manager)
- [OCI Container Storage Interface driver](https://github.com/oracle/oci-cloud-controller-manager/blob/master/container-storage-interface.md)

View File

@ -0,0 +1,19 @@
{
"instanceType": "compute",
"launchDetails": {
"compartmentId": "ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf",
"shape": "VM.Standard2.8",
"sourceDetails":
{
"imageId": "ocid1.image.oc1.phx.aaaaaaaa55tzajot4gbiw2p7gquwjnvfzrasosbrq4h6wywkff4zjosp2fia",
"sourceType": "image",
"bootVolumeSizeInGBs": 100
},
"metadata": {
"user_data": "IyEvYmluL2Jhc2gKdG91Y2ggL3RtcC9jbG91ZC1pbml0LXN0YXJ0ZWQKaXB0YWJsZXMgLUYKZ3JvdXBhZGQgZG9ja2VyCnVzZXJtb2QgLWFHIGRvY2tlciB1YnVudHUKcm0gL3Zhci9saWIvYXB0L2xpc3RzL2xvY2sKcGtpbGwgLTkgLWYgYXB0CmN1cmwgLS1yZXRyeSAzIGh0dHBzOi8vcmVsZWFzZXMucmFuY2hlci5jb20vaW5zdGFsbC1kb2NrZXIvMjAuMTAuc2ggfCBzaApkb2NrZXIgcnVuIC1kIC0tcHJpdmlsZWdlZCAtLXJlc3RhcnQ9dW5sZXNzLXN0b3BwZWQgLS1uZXQ9aG9zdCAtdiAvZXRjL2t1YmVybmV0ZXM6L2V0Yy9rdWJlcm5ldGVzIC12IC92YXIvcnVuOi92YXIvcnVuIHJhbmNoZXIvcmFuY2hlci1hZ2VudDp2Mi41LjUgLS1zZXJ2ZXIgaHR0cHM6Ly9teS1yYW5jaGVyLmNvbSAtLXRva2VuIHh4eHh4eCAgLS13b3JrZXIKdG91Y2ggL3RtcC9jbG91ZC1pbml0LWZpbmlzaGVkCg=="
},
"createVnicDetails": {
"assignPublicIp": true
}
}
}

View File

@ -0,0 +1,174 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cluster-autoscaler
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
rules:
- apiGroups: [""]
resources: ["events", "endpoints"]
verbs: ["create", "patch"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
- apiGroups: [""]
resources: ["pods/status"]
verbs: ["update"]
- apiGroups: [""]
resources: ["endpoints"]
resourceNames: ["cluster-autoscaler"]
verbs: ["get", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["watch", "list", "get", "update"]
- apiGroups: [""]
resources: ["namepaces"]
verbs: ["list"]
- apiGroups: [""]
resources:
- "pods"
- "services"
- "replicationcontrollers"
- "persistentvolumeclaims"
- "persistentvolumes"
verbs: ["watch", "list", "get"]
- apiGroups: ["extensions"]
resources: ["replicasets", "daemonsets"]
verbs: ["watch", "list", "get"]
- apiGroups: ["policy"]
resources: ["poddisruptionbudgets"]
verbs: ["watch", "list"]
- apiGroups: ["apps"]
resources: ["statefulsets", "replicasets", "daemonsets"]
verbs: ["watch", "list", "get"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses", "csinodes"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["watch", "list", "get"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create"]
- apiGroups: ["coordination.k8s.io"]
resourceNames: ["cluster-autoscaler"]
resources: ["leases"]
verbs: ["get", "update"]
- apiGroups: ["storage.k8s.io"]
resources: ["csidrivers", "csistoragecapacities"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: cluster-autoscaler
namespace: kube-system
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["create","list","watch"]
- apiGroups: [""]
resources: ["configmaps"]
resourceNames:
- "cluster-autoscaler-status"
- "cluster-autoscaler-priority-expander"
verbs: ["delete", "get", "update", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cluster-autoscaler
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-autoscaler
subjects:
- kind: ServiceAccount
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: cluster-autoscaler
namespace: kube-system
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: cluster-autoscaler
subjects:
- kind: ServiceAccount
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: cluster-autoscaler
namespace: kube-system
labels:
app: cluster-autoscaler
spec:
replicas: 1
selector:
matchLabels:
app: cluster-autoscaler
template:
metadata:
labels:
app: cluster-autoscaler
spec:
serviceAccountName: cluster-autoscaler
containers:
- image: docker.io/jlamillan/autoscaler:oci-pr-rc6
name: cluster-autoscaler
command:
- ./cluster-autoscaler
- --v=5
- --logtostderr=true
- --cloud-provider=oci
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaazldzcu4mi5spz56upbtwnsynz2nk6jvmx7zi4hsta4uggxbulbua
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaal3jhoc32ljsfaeif4x2ssfa2a63oehjgqryiueivieee6yaqbkia
- --scale-down-delay-after-add=1m
- --scale-down-unneeded-time=1m
- --namespace=kube-system
imagePullPolicy: "Always"
env:
- name: OCI_USE_INSTANCE_PRINCIPAL
value: "false"
volumeMounts:
- name: oci-config-vol
mountPath: "/root/.oci"
readOnly: true
volumes:
- name: oci-config-vol
secret:
secretName: oci-config
items:
- key: config
path: config
- key: api_key.pem
path: api_key.pem

View File

@ -0,0 +1,163 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cluster-autoscaler
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
rules:
- apiGroups: [""]
resources: ["events", "endpoints"]
verbs: ["create", "patch"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
- apiGroups: [""]
resources: ["pods/status"]
verbs: ["update"]
- apiGroups: [""]
resources: ["endpoints"]
resourceNames: ["cluster-autoscaler"]
verbs: ["get", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["watch", "list", "get", "update"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["list"]
- apiGroups: [""]
resources:
- "pods"
- "services"
- "replicationcontrollers"
- "persistentvolumeclaims"
- "persistentvolumes"
verbs: ["watch", "list", "get"]
- apiGroups: ["extensions"]
resources: ["replicasets", "daemonsets"]
verbs: ["watch", "list", "get"]
- apiGroups: ["policy"]
resources: ["poddisruptionbudgets"]
verbs: ["watch", "list"]
- apiGroups: ["apps"]
resources: ["statefulsets", "replicasets", "daemonsets"]
verbs: ["watch", "list", "get"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses", "csinodes"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["watch", "list", "get"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create"]
- apiGroups: ["coordination.k8s.io"]
resourceNames: ["cluster-autoscaler"]
resources: ["leases"]
verbs: ["get", "update"]
- apiGroups: ["storage.k8s.io"]
resources: ["csidrivers", "csistoragecapacities"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: cluster-autoscaler
namespace: kube-system
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["create","list","watch"]
- apiGroups: [""]
resources: ["configmaps"]
resourceNames:
- "cluster-autoscaler-status"
- "cluster-autoscaler-priority-expander"
verbs: ["delete", "get", "update", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cluster-autoscaler
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-autoscaler
subjects:
- kind: ServiceAccount
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: cluster-autoscaler
namespace: kube-system
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: cluster-autoscaler
subjects:
- kind: ServiceAccount
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: cluster-autoscaler
namespace: kube-system
labels:
app: cluster-autoscaler
spec:
replicas: 1
selector:
matchLabels:
app: cluster-autoscaler
template:
metadata:
labels:
app: cluster-autoscaler
spec:
serviceAccountName: cluster-autoscaler
containers:
- image: docker.io/jlamillan/autoscaler:oci-pr-rc6
name: cluster-autoscaler
command:
- ./cluster-autoscaler
- --v=5
- --logtostderr=true
- --cloud-provider=oci
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaazldzcu4mi5spz56upbtwnsynz2nk6jvmx7zi4hsta4uggxbulbua
- --nodes=1:10:ocid1.instancepool.oc1.phx.aaaaaaaal3jhoc32ljsfaeif4x2ssfa2a63oehjgqryiueivieee6yaqbkia
- --scale-down-delay-after-add=10m
- --scale-down-unneeded-time=10m
- --namespace=kube-system
imagePullPolicy: "Always"
env:
- name: OCI_USE_INSTANCE_PRINCIPAL
value: "true"
- name: OCI_REGION
value: "us-phoenix-1"

View File

@ -0,0 +1,6 @@
[
{
"availabilityDomain": "hXgQ:PHX-AD-2",
"primarySubnetId": "ocid1.subnet.oc1.phx.aaaaaaaaouihv645dp2xaee6w4uvx6emjwuscsrxcn3miwa6vmijtpdnqdeq"
}
]

View File

@ -0,0 +1,175 @@
/*
Copyright 2021 Oracle and/or its affiliates.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package oci
import (
"github.com/pkg/errors"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
caerrors "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
"time"
)
const (
ociUseInstancePrincipalEnvVar = "OCI_USE_INSTANCE_PRINCIPAL"
ociCompartmentEnvVar = "OCI_COMPARTMENT_ID"
ociRegionEnvVar = "OCI_REGION"
ociRefreshInterval = "OCI_REFRESH_INTERVAL"
ociAnnotationCompartmentID = "oci.oraclecloud.com/compartment-id"
// ResourceGPU is the GPU resource type
ResourceGPU apiv1.ResourceName = "nvidia.com/gpu"
defaultRefreshInterval = 5 * time.Minute
)
// OciCloudProvider implements the CloudProvider interface for OCI. It contains an
// instance pool manager to interact with OCI instance pools.
type OciCloudProvider struct {
rl *cloudprovider.ResourceLimiter
poolManager InstancePoolManager
}
// CloudConfig holds the cloud config for OCI provider.
type CloudConfig struct {
Global struct {
RefreshInterval time.Duration `gcfg:"refresh-interval"`
CompartmentID string `gcfg:"compartment-id"`
Region string `gcfg:"region"`
UseInstancePrinciples bool `gcfg:"use-instance-principals"`
}
}
// Name returns name of the cloud provider.
func (ocp *OciCloudProvider) Name() string {
return cloudprovider.OracleCloudProviderName
}
// NodeGroups returns all node groups configured for this cloud provider.
func (ocp *OciCloudProvider) NodeGroups() []cloudprovider.NodeGroup {
nodePools := ocp.poolManager.GetInstancePools()
result := make([]cloudprovider.NodeGroup, 0, len(nodePools))
for _, nodePool := range nodePools {
result = append(result, nodePool)
}
return result
}
// NodeGroupForNode returns the node group for the given node, nil if the node
// should not be processed by cluster autoscaler, or non-nil error if such
// occurred. Must be implemented.
func (ocp *OciCloudProvider) NodeGroupForNode(n *apiv1.Node) (cloudprovider.NodeGroup, error) {
ociRef, err := nodeToOciRef(n)
if err != nil {
return nil, err
}
ng, err := ocp.poolManager.GetInstancePoolForInstance(ociRef)
// this instance may not be a part of an instance pool, or it may be part of an instance pool that the autoscaler does not manage
if errors.Cause(err) == errInstanceInstancePoolNotFound {
// should not be processed by cluster autoscaler
return nil, nil
}
return ng, err
}
// Pricing returns pricing model for this cloud provider or error if not available.
// Implementation optional.
func (ocp *OciCloudProvider) Pricing() (cloudprovider.PricingModel, caerrors.AutoscalerError) {
klog.Info("Pricing called")
return nil, cloudprovider.ErrNotImplemented
}
// GetAvailableMachineTypes getInstancePool all machine types that can be requested from the cloud provider.
// Implementation optional.
func (ocp *OciCloudProvider) GetAvailableMachineTypes() ([]string, error) {
klog.Info("GetAvailableMachineTypes called")
return nil, cloudprovider.ErrNotImplemented
}
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
// Implementation optional.
func (ocp *OciCloudProvider) NewNodeGroup(machineType string,
labels map[string]string,
systemLabels map[string]string,
taints []apiv1.Taint,
extraResources map[string]resource.Quantity,
) (cloudprovider.NodeGroup, error) {
return nil, cloudprovider.ErrNotImplemented
}
// GetResourceLimiter returns struct containing limits (max, min) for resources (cores, memory etc.).
func (ocp *OciCloudProvider) GetResourceLimiter() (*cloudprovider.ResourceLimiter, error) {
return ocp.rl, nil
}
// GPULabel returns the label added to nodes with GPU resource.
func (ocp *OciCloudProvider) GPULabel() string {
// No labels, only taint: nvidia.com/gpu:NoSchedule
return ""
}
// GetAvailableGPUTypes return all available GPU types cloud provider supports.
func (ocp *OciCloudProvider) GetAvailableGPUTypes() map[string]struct{} {
return map[string]struct{}{}
}
// Cleanup cleans up open resources before the cloud provider is destroyed, i.e. go routines etc.
func (ocp *OciCloudProvider) Cleanup() error {
return ocp.poolManager.Cleanup()
}
// Refresh is called before every main loop and can be used to dynamically update cloud provider state.
// In particular the list of node groups returned by NodeGroups can change as a result of CloudProvider.Refresh().
func (ocp *OciCloudProvider) Refresh() error {
return ocp.poolManager.Refresh()
}
// BuildOCI constructs the OciCloudProvider object that implements the cloud provider interface (InstancePoolManager).
func BuildOCI(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) *OciCloudProvider {
ipManager, err := CreateInstancePoolManager(opts.CloudConfig, do, createKubeClient(opts))
if err != nil {
klog.Fatalf("Could not create OCI cloud provider: %v", err)
}
return &OciCloudProvider{
poolManager: ipManager,
rl: rl,
}
}
func getKubeConfig(opts config.AutoscalingOptions) *rest.Config {
klog.V(1).Infof("Using kubeconfig file: %s", opts.KubeConfigPath)
kubeConfig, err := clientcmd.BuildConfigFromFlags("", opts.KubeConfigPath)
if err != nil {
klog.Fatalf("Failed to build kubeConfig: %v", err)
}
return kubeConfig
}
func createKubeClient(opts config.AutoscalingOptions) kubernetes.Interface {
return kubernetes.NewForConfigOrDie(getKubeConfig(opts))
}

View File

@ -0,0 +1,234 @@
/*
Copyright 2021 Oracle and/or its affiliates.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package oci
import (
"fmt"
"github.com/pkg/errors"
apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
const (
ociInstanceIDAnnotation = "oci.oraclecloud.com/instance-id"
ociInstancePoolIDAnnotation = "oci.oraclecloud.com/instancepool-id"
ociInstancePoolResourceIdent = "instancepool"
)
// InstancePoolNodeGroup implements the NodeGroup interface using OCI instance pools.
type InstancePoolNodeGroup struct {
manager InstancePoolManager
kubeClient kubernetes.Interface
id string
minSize int
maxSize int
}
// MaxSize returns maximum size of the instance-pool based node group.
func (ip *InstancePoolNodeGroup) MaxSize() int {
return ip.maxSize
}
// MinSize returns minimum size of the instance-pool based node group.
func (ip *InstancePoolNodeGroup) MinSize() int {
return ip.minSize
}
// TargetSize returns the current target size of the instance-pool based node group. It is possible that the
// number of nodes in Kubernetes is different at the moment but should be equal
// to Size() once everything stabilizes (new nodes finish startup and registration or
// removed nodes are deleted completely). Implementation required.
func (ip *InstancePoolNodeGroup) TargetSize() (int, error) {
return ip.manager.GetInstancePoolSize(*ip)
}
// IncreaseSize increases the size of the instance-pool based node group. To delete a node you need
// to explicitly name it and use DeleteNode. This function should wait until
// instance-pool size is updated. Implementation required.
func (ip *InstancePoolNodeGroup) IncreaseSize(delta int) error {
if delta <= 0 {
return fmt.Errorf("size increase must be positive")
}
size, err := ip.manager.GetInstancePoolSize(*ip)
if err != nil {
return err
}
if size+delta > ip.MaxSize() {
return fmt.Errorf("size increase too large - desired:%d max:%d", size+delta, ip.MaxSize())
}
return ip.manager.SetInstancePoolSize(*ip, size+delta)
}
// DeleteNodes deletes nodes from this instance-pool. Error is returned either on
// failure or if the given node doesn't belong to this instance-pool. This function
// should wait until instance-pool size is updated. Implementation required.
func (ip *InstancePoolNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
// FYI, unregistered nodes come in as the provider id as node name.
klog.Infof("DeleteNodes called with %d node(s)", len(nodes))
size, err := ip.manager.GetInstancePoolSize(*ip)
if err != nil {
return err
}
if size <= ip.MinSize() {
return fmt.Errorf("min size reached, nodes will not be deleted")
}
refs := make([]OciRef, 0, len(nodes))
for _, node := range nodes {
belongs, err := ip.Belongs(node)
if err != nil {
return err
}
if !belongs {
return fmt.Errorf("%s belong to a different instance-pool than %s", node.Name, ip.Id())
}
ociRef, err := nodeToOciRef(node)
if err != nil {
return err
}
refs = append(refs, ociRef)
}
return ip.manager.DeleteInstances(*ip, refs)
}
// DecreaseTargetSize decreases the target size of the instance-pool based node group. This function
// doesn't permit to delete any existing node and can be used only to reduce the
// request for new nodes that have not been yet fulfilled. Delta should be negative.
// It is assumed that cloud provider will not delete the existing nodes when there
// is an option to just decrease the target. Implementation required.
func (ip *InstancePoolNodeGroup) DecreaseTargetSize(delta int) error {
if delta >= 0 {
return fmt.Errorf("size decrease must be negative")
}
size, err := ip.manager.GetInstancePoolSize(*ip)
if err != nil {
return err
}
nodes, err := ip.manager.GetInstancePoolNodes(*ip)
if err != nil {
return err
}
if size+delta < len(nodes) {
return fmt.Errorf("attempt to delete existing nodes targetSize:%d delta:%d existingNodes: %d",
size, delta, len(nodes))
}
return ip.manager.SetInstancePoolSize(*ip, size+delta)
}
// Belongs returns true if the given node belongs to the InstancePoolNodeGroup.
func (ip *InstancePoolNodeGroup) Belongs(node *apiv1.Node) (bool, error) {
ref, err := nodeToOciRef(node)
if err != nil {
return false, err
}
targetInstancePool, err := ip.manager.GetInstancePoolForInstance(ref)
if err != nil {
return false, err
}
if targetInstancePool == nil {
return false, fmt.Errorf("%s doesn't belong to a known instance-pool", node.Name)
}
return targetInstancePool.Id() == ip.Id(), nil
}
// Id returns an unique identifier of the instance-pool based node group.
func (ip *InstancePoolNodeGroup) Id() string {
return ip.id
}
// Debug returns a string containing all information regarding this instance-pool.
func (ip *InstancePoolNodeGroup) Debug() string {
return fmt.Sprintf("%s (%d:%d)", ip.Id(), ip.MinSize(), ip.MaxSize())
}
// Nodes returns a list of all nodes that belong to this instance-pool.
// It is required that Instance objects returned by this method have Id field set.
// Other fields are optional.
// This list should include also instances that might have not become a kubernetes node yet.
func (ip *InstancePoolNodeGroup) Nodes() ([]cloudprovider.Instance, error) {
return ip.manager.GetInstancePoolNodes(*ip)
}
// TemplateNodeInfo returns a schedulerframework.NodeInfo structure of an empty
// (as if just started) node. This will be used in scale-up simulations to
// predict what a new node would look like if an instance-pool was expanded. The returned
// NodeInfo is expected to have a fully populated Node object, with all of the labels,
// capacity and allocatable information as well as all pods that are started on
// the node by default, using manifest (most likely only kube-proxy). Implementation optional.
func (ip *InstancePoolNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
node, err := ip.manager.GetInstancePoolTemplateNode(*ip)
if err != nil {
return nil, errors.Wrap(err, "unable to build node info template")
}
nodeInfo := schedulerframework.NewNodeInfo(
cloudprovider.BuildKubeProxy(ip.id),
buildCSINodePod(),
)
nodeInfo.SetNode(node)
return nodeInfo, nil
}
// Exist checks if the instance-pool based node group really exists on the cloud provider side. Allows to tell the
// theoretical instance-pool from the real one. Implementation required.
func (ip *InstancePoolNodeGroup) Exist() bool {
return true
}
// Create creates the instance-pool based node group on the cloud provider side. Implementation optional.
func (ip *InstancePoolNodeGroup) Create() (cloudprovider.NodeGroup, error) {
return nil, cloudprovider.ErrNotImplemented
}
// Delete deletes the instance-pool based node group on the cloud provider side.
// This will be executed only for autoprovisioned instance-pools, once their size drops to 0.
// Implementation optional.
func (ip *InstancePoolNodeGroup) Delete() error {
return cloudprovider.ErrNotImplemented
}
// GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular
// InstancePoolNodeGroup. Returning a nil will result in using default options.
func (ip *InstancePoolNodeGroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
return nil, cloudprovider.ErrNotImplemented
}
// Autoprovisioned returns true if the instance-pool based node group is autoprovisioned. An autoprovisioned group
// was created by CA and can be deleted when scaled to 0.
func (ip *InstancePoolNodeGroup) Autoprovisioned() bool {
return false
}

View File

@ -0,0 +1,363 @@
/*
Copyright 2021 Oracle and/or its affiliates.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package oci
import (
"context"
"github.com/pkg/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
"k8s.io/klog/v2"
"strings"
"sync"
"time"
)
// ComputeMgmtClient wraps core.ComputeManagementClient exposing the functions we actually require.
type ComputeMgmtClient interface {
GetInstancePool(context.Context, core.GetInstancePoolRequest) (core.GetInstancePoolResponse, error)
UpdateInstancePool(context.Context, core.UpdateInstancePoolRequest) (core.UpdateInstancePoolResponse, error)
GetInstancePoolInstance(context.Context, core.GetInstancePoolInstanceRequest) (core.GetInstancePoolInstanceResponse, error)
ListInstancePoolInstances(context.Context, core.ListInstancePoolInstancesRequest) (core.ListInstancePoolInstancesResponse, error)
DetachInstancePoolInstance(context.Context, core.DetachInstancePoolInstanceRequest) (core.DetachInstancePoolInstanceResponse, error)
}
// ComputeClient wraps core.ComputeClient exposing the functions we actually require.
type ComputeClient interface {
ListVnicAttachments(ctx context.Context, request core.ListVnicAttachmentsRequest) (core.ListVnicAttachmentsResponse, error)
}
// VirtualNetworkClient wraps core.VirtualNetworkClient exposing the functions we actually require.
type VirtualNetworkClient interface {
GetVnic(context.Context, core.GetVnicRequest) (core.GetVnicResponse, error)
}
type instancePoolCache struct {
mu sync.Mutex
poolCache map[string]*core.InstancePool
instanceSummaryCache map[string]*[]core.InstanceSummary
targetSize map[string]int
unownedInstances map[OciRef]bool
computeManagementClient ComputeMgmtClient
computeClient ComputeClient
virtualNetworkClient VirtualNetworkClient
}
func newInstancePoolCache(computeManagementClient ComputeMgmtClient, computeClient ComputeClient, virtualNetworkClient VirtualNetworkClient) *instancePoolCache {
return &instancePoolCache{
poolCache: map[string]*core.InstancePool{},
instanceSummaryCache: map[string]*[]core.InstanceSummary{},
targetSize: map[string]int{},
unownedInstances: map[OciRef]bool{},
computeManagementClient: computeManagementClient,
computeClient: computeClient,
virtualNetworkClient: virtualNetworkClient,
}
}
func (c *instancePoolCache) InstancePools() map[string]*core.InstancePool {
result := map[string]*core.InstancePool{}
for k, v := range c.poolCache {
result[k] = v
}
return result
}
func (c *instancePoolCache) rebuild(staticInstancePools map[string]*InstancePoolNodeGroup, cfg CloudConfig) error {
// Since we only support static instance-pools we don't need to worry about pruning.
for id := range staticInstancePools {
resp, err := c.computeManagementClient.GetInstancePool(context.Background(), core.GetInstancePoolRequest{
InstancePoolId: common.String(id),
})
if err != nil {
klog.Errorf("get instance pool %s failed: %v", id, err)
return err
}
klog.V(5).Infof("GetInstancePool() response %v", resp.InstancePool)
c.setInstancePool(&resp.InstancePool)
// OCI instance-pools do not contain individual instance objects so they must be fetched separately.
listInstancesResponse, err := c.computeManagementClient.ListInstancePoolInstances(context.Background(), core.ListInstancePoolInstancesRequest{
InstancePoolId: common.String(id),
CompartmentId: common.String(cfg.Global.CompartmentID),
})
if err != nil {
return err
}
klog.V(5).Infof("ListInstancePoolInstances() response %v", listInstancesResponse.Items)
c.setInstanceSummaries(*resp.InstancePool.Id, &listInstancesResponse.Items)
}
// Reset unowned instances cache.
c.unownedInstances = make(map[OciRef]bool)
return nil
}
// removeInstance tries to remove the instance from the specified instance pool. If the instance isn't in the array,
// then it won't do anything. removeInstance returns true if it actually removed the instance and reduced the size of
// the instance pool.
func (c *instancePoolCache) removeInstance(instancePool InstancePoolNodeGroup, instanceID string) bool {
c.mu.Lock()
defer c.mu.Unlock()
if instanceID == "" {
klog.Warning("instanceID is not set - skipping removal.")
return false
}
// This instance pool must be in state RUNNING in order to detach a particular instance.
err := c.waitForInstancePoolState(context.Background(), instancePool.Id(), core.InstancePoolLifecycleStateRunning)
if err != nil {
return false
}
_, err = c.computeManagementClient.DetachInstancePoolInstance(context.Background(), core.DetachInstancePoolInstanceRequest{
InstancePoolId: common.String(instancePool.Id()),
DetachInstancePoolInstanceDetails: core.DetachInstancePoolInstanceDetails{
InstanceId: common.String(instanceID),
IsDecrementSize: common.Bool(true),
IsAutoTerminate: common.Bool(true),
},
})
if err == nil {
// Decrease pool size in cache since IsDecrementSize was true
c.targetSize[instancePool.Id()] -= 1
return true
}
return false
}
// findInstanceByDetails attempts to find the given instance by details by searching
// through the configured instance-pools (ListInstancePoolInstances) for a match.
func (c *instancePoolCache) findInstanceByDetails(ociInstance OciRef) (*OciRef, error) {
c.mu.Lock()
defer c.mu.Unlock()
// Minimum amount of information we need to make a positive match
if ociInstance.InstanceID == "" && ociInstance.PrivateIPAddress == "" && ociInstance.PublicIPAddress == "" {
return nil, errors.New("instance id or an IP address is required to resolve details")
}
if c.unownedInstances[ociInstance] {
// We already know this instance is not part of a configured pool. Return early and avoid additional API calls.
klog.V(4).Infof("Node " + ociInstance.Name + " is known to not be a member of any of the specified instance pool(s)")
return nil, errInstanceInstancePoolNotFound
}
// Look for the instance in each of the specified pool(s)
for _, nextInstancePool := range c.poolCache {
// Skip searching the instance pool if its instance count is 0.
if *nextInstancePool.Size == 0 {
klog.V(4).Infof("skipping over instance pool %s since it is empty", *nextInstancePool.Id)
continue
}
// List instances in the next pool
listInstancePoolInstances, err := c.computeManagementClient.ListInstancePoolInstances(context.Background(), core.ListInstancePoolInstancesRequest{
CompartmentId: common.String(ociInstance.CompartmentID),
InstancePoolId: nextInstancePool.Id,
})
if err != nil {
return nil, err
}
for _, poolMember := range listInstancePoolInstances.Items {
// Skip comparing this instance if it is not in the Running state
if strings.ToLower(*poolMember.State) != strings.ToLower(string(core.InstanceLifecycleStateRunning)) {
klog.V(4).Infof("skipping over instance %s: since it is not in the running state: %s", *poolMember.Id, *poolMember.State)
continue
}
listVnicAttachments, err := c.computeClient.ListVnicAttachments(context.Background(), core.ListVnicAttachmentsRequest{
CompartmentId: common.String(*poolMember.CompartmentId),
InstanceId: poolMember.Id,
})
if err != nil {
klog.Errorf("list vNIC attachments for %s failed: %v", *poolMember.Id, err)
return nil, err
}
klog.V(5).Infof("ListVnicAttachments() response for %s: %v", *poolMember.Id, listVnicAttachments.Items)
for _, vnicAttachment := range listVnicAttachments.Items {
// Skip this attachment if the vNIC is not live
if core.VnicAttachmentLifecycleStateAttached != vnicAttachment.LifecycleState {
klog.V(4).Infof("skipping vNIC on instance %s: since it is not active", *poolMember.Id)
continue
}
getVnicResp, err := c.virtualNetworkClient.GetVnic(context.Background(), core.GetVnicRequest{
VnicId: vnicAttachment.VnicId,
})
if err != nil {
klog.Errorf("get vNIC for %s failed: %v", *poolMember.Id, err)
return nil, err
}
klog.V(5).Infof("GetVnic() response for vNIC %s: %v", *vnicAttachment.Id, getVnicResp.Vnic)
// Preferably we match by instanceID, but we can match by private or public IP
if *poolMember.Id == ociInstance.InstanceID ||
(getVnicResp.Vnic.PrivateIp != nil && *getVnicResp.Vnic.PrivateIp == ociInstance.PrivateIPAddress) ||
(getVnicResp.Vnic.PublicIp != nil && *getVnicResp.Vnic.PublicIp == ociInstance.PublicIPAddress) {
klog.V(4).Info(*poolMember.DisplayName, " is a member of "+*nextInstancePool.Id)
// Return a complete instance details.
if ociInstance.Name == "" {
ociInstance.Name = *poolMember.DisplayName
}
ociInstance.InstanceID = *poolMember.Id
ociInstance.PoolID = *nextInstancePool.Id
ociInstance.CompartmentID = *poolMember.CompartmentId
ociInstance.AvailabilityDomain = strings.Split(*poolMember.AvailabilityDomain, ":")[1]
ociInstance.Shape = *poolMember.Shape
ociInstance.PrivateIPAddress = *getVnicResp.Vnic.PrivateIp
// Public IP is optional
if getVnicResp.Vnic.PublicIp != nil {
ociInstance.PublicIPAddress = *getVnicResp.Vnic.PublicIp
}
return &ociInstance, nil
}
}
}
}
c.unownedInstances[ociInstance] = true
klog.V(4).Infof(ociInstance.Name + " is not a member of any of the specified instance pool(s)")
return nil, errInstanceInstancePoolNotFound
}
func (c *instancePoolCache) getInstancePool(id string) (*core.InstancePool, error) {
c.mu.Lock()
defer c.mu.Unlock()
return c.getInstancePoolWithoutLock(id)
}
func (c *instancePoolCache) getInstancePoolWithoutLock(id string) (*core.InstancePool, error) {
instancePool := c.poolCache[id]
if instancePool == nil {
return nil, errors.New("instance pool was not found in the cache")
}
return instancePool, nil
}
func (c *instancePoolCache) setInstancePool(np *core.InstancePool) {
c.mu.Lock()
defer c.mu.Unlock()
c.poolCache[*np.Id] = np
c.targetSize[*np.Id] = *np.Size
}
func (c *instancePoolCache) getInstanceSummaries(poolID string) (*[]core.InstanceSummary, error) {
c.mu.Lock()
defer c.mu.Unlock()
return c.getInstanceSummariesWithoutLock(poolID)
}
func (c *instancePoolCache) getInstanceSummariesWithoutLock(poolID string) (*[]core.InstanceSummary, error) {
instanceSummaries := c.instanceSummaryCache[poolID]
if instanceSummaries == nil {
return nil, errors.New("instance summaries for instance pool id " + poolID + " were not found in cache")
}
return instanceSummaries, nil
}
func (c *instancePoolCache) setInstanceSummaries(instancePoolID string, is *[]core.InstanceSummary) {
c.mu.Lock()
defer c.mu.Unlock()
c.instanceSummaryCache[instancePoolID] = is
}
func (c *instancePoolCache) setSize(instancePoolID string, size int) error {
if instancePoolID == "" {
return errors.New("instance-pool is required")
}
getInstancePoolResp, err := c.computeManagementClient.GetInstancePool(context.Background(), core.GetInstancePoolRequest{
InstancePoolId: common.String(instancePoolID),
})
if err != nil {
return err
}
updateDetails := core.UpdateInstancePoolDetails{
Size: common.Int(size),
InstanceConfigurationId: getInstancePoolResp.InstanceConfigurationId,
}
_, err = c.computeManagementClient.UpdateInstancePool(context.Background(), core.UpdateInstancePoolRequest{
InstancePoolId: common.String(instancePoolID),
UpdateInstancePoolDetails: updateDetails,
})
if err != nil {
return err
}
c.mu.Lock()
defer c.mu.Unlock()
c.targetSize[instancePoolID] = size
return c.waitForInstancePoolState(context.Background(), instancePoolID, core.InstancePoolLifecycleStateRunning)
}
func (c *instancePoolCache) waitForInstancePoolState(ctx context.Context, instancePoolID string, desiredState core.InstancePoolLifecycleStateEnum) error {
timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
defer cancel()
err := wait.PollImmediateUntil(
// TODO we need a better implementation of this function
internalPollInterval,
func() (bool, error) {
getInstancePoolResp, err := c.computeManagementClient.GetInstancePool(context.Background(), core.GetInstancePoolRequest{
InstancePoolId: common.String(instancePoolID),
})
if err != nil {
klog.Errorf("getInstancePool failed. Retrying: %+v", err)
return false, err
} else if getInstancePoolResp.LifecycleState != desiredState {
klog.V(4).Infof("waiting for instance-pool %s to enter state: %s (current state: %s)", instancePoolID,
desiredState, getInstancePoolResp.LifecycleState)
return false, nil
}
klog.V(3).Infof("instance pool %s is in desired state: %s", instancePoolID, desiredState)
return true, nil
}, timeoutCtx.Done())
if err != nil {
// may be wait.ErrWaitTimeout
return err
}
return nil
}
func (c *instancePoolCache) getSize(id string) (int, error) {
c.mu.Lock()
defer c.mu.Unlock()
size, ok := c.targetSize[id]
if !ok {
return -1, errors.New("target size not found")
}
return size, nil
}

View File

@ -0,0 +1,513 @@
/*
Copyright 2021 Oracle and/or its affiliates.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package oci
import (
"fmt"
"gopkg.in/gcfg.v1"
"os"
"strconv"
"strings"
"time"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
kubeletapis "k8s.io/kubelet/pkg/apis"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"github.com/pkg/errors"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common/auth"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
)
var (
internalPollInterval = 1 * time.Minute
errInstanceInstancePoolNotFound = errors.New("instance-pool not found for instance")
)
// InstancePoolManager defines the operations required for an *instance-pool based* autoscaler.
type InstancePoolManager interface {
// Refresh triggers refresh of cached resources.
Refresh() error
// Cleanup cleans up open resources before the cloud provider is destroyed, i.e. go routines etc.
Cleanup() error
// GetInstancePools returns list of registered InstancePools.
GetInstancePools() []*InstancePoolNodeGroup
// GetInstancePoolNodes returns InstancePool nodes.
GetInstancePoolNodes(ip InstancePoolNodeGroup) ([]cloudprovider.Instance, error)
// GetInstancePoolForInstance returns InstancePool to which the given instance belongs.
GetInstancePoolForInstance(instance OciRef) (*InstancePoolNodeGroup, error)
// GetInstancePoolTemplateNode returns a template node for InstancePool.
GetInstancePoolTemplateNode(ip InstancePoolNodeGroup) (*apiv1.Node, error)
// GetInstancePoolSize gets the InstancePool size.
GetInstancePoolSize(ip InstancePoolNodeGroup) (int, error)
// SetInstancePoolSize sets the InstancePool size.
SetInstancePoolSize(ip InstancePoolNodeGroup, size int) error
// DeleteInstances deletes the given instances. All instances must be controlled by the same InstancePool.
DeleteInstances(ip InstancePoolNodeGroup, instances []OciRef) error
}
// InstancePoolManagerImpl is the implementation of an instance-pool based autoscaler on OCI.
type InstancePoolManagerImpl struct {
cfg *CloudConfig
shapeGetter ShapeGetter
staticInstancePools map[string]*InstancePoolNodeGroup
lastRefresh time.Time
// caches the instance pool and instance summary objects received from OCI.
// All interactions with OCI's API should go through the poolCache.
instancePoolCache *instancePoolCache
}
// CreateInstancePoolManager constructs the InstancePoolManager object.
func CreateInstancePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, kubeClient kubernetes.Interface) (InstancePoolManager, error) {
var err error
var configProvider common.ConfigurationProvider
var cloudConfig = &CloudConfig{}
// cloudConfigPath is the optional file of variables passed in with the --cloud-config flag, which takes precedence over environment variables
if cloudConfigPath != "" {
config, fileErr := os.Open(cloudConfigPath)
if fileErr != nil {
klog.Fatalf("could not open cloud provider configuration %s: %#v", cloudConfigPath, fileErr)
}
defer config.Close()
if config != nil {
if err := gcfg.ReadInto(cloudConfig, config); err != nil {
klog.Errorf("could not read config: %v", err)
return nil, err
}
}
}
// Fall back to environment variables
if cloudConfig.Global.CompartmentID == "" {
cloudConfig.Global.CompartmentID = os.Getenv(ociCompartmentEnvVar)
} else if !cloudConfig.Global.UseInstancePrinciples {
if os.Getenv(ociUseInstancePrincipalEnvVar) == "true" {
cloudConfig.Global.UseInstancePrinciples = true
}
if os.Getenv(ociRegionEnvVar) != "" {
cloudConfig.Global.Region = os.Getenv(ociRegionEnvVar)
}
}
if cloudConfig.Global.RefreshInterval == 0 {
if os.Getenv(ociRefreshInterval) != "" {
klog.V(4).Info("using a custom cache refresh interval %v...", os.Getenv(ociRefreshInterval))
cloudConfig.Global.RefreshInterval, _ = time.ParseDuration(os.Getenv(ociRefreshInterval))
} else {
cloudConfig.Global.RefreshInterval = defaultRefreshInterval
}
}
clientConfig := common.CustomClientConfiguration{
RetryPolicy: newRetryPolicy(),
}
if os.Getenv(ociUseInstancePrincipalEnvVar) == "true" {
klog.V(4).Info("using instance principals...")
region := os.Getenv(ociRegionEnvVar)
if region == "" {
klog.Fatalf("OCI_REGION is required when OCI_USE_INSTANCE_PRINCIPAL is set to true")
}
configProvider, err = auth.InstancePrincipalConfigurationProviderForRegion(common.StringToRegion(region))
if err != nil {
return nil, err
}
} else {
klog.Info("using default configuration provider")
configProvider = common.DefaultConfigProvider()
}
providerRegion, _ := configProvider.Region()
klog.Infof("OCI provider region: %s ", providerRegion)
computeMgmtClient, err := core.NewComputeManagementClientWithConfigurationProvider(configProvider)
if err != nil {
return nil, errors.Wrap(err, "unable to create compute management client")
}
computeMgmtClient.SetCustomClientConfiguration(clientConfig)
computeClient, err := core.NewComputeClientWithConfigurationProvider(configProvider)
if err != nil {
return nil, errors.Wrap(err, "unable to create compute client")
}
computeClient.SetCustomClientConfiguration(clientConfig)
networkClient, err := core.NewVirtualNetworkClientWithConfigurationProvider(configProvider)
if err != nil {
return nil, errors.Wrap(err, "unable to create virtual network client")
}
networkClient.SetCustomClientConfiguration(clientConfig)
cloudConfig.Global.CompartmentID = os.Getenv(ociCompartmentEnvVar)
// Not passed by --cloud-config or environment variable, attempt to use the tenancy ID as the compartment ID
if cloudConfig.Global.CompartmentID == "" {
tenancyID, err := configProvider.TenancyOCID()
if err != nil {
return nil, errors.Wrap(err, "unable to retrieve tenancy ID")
}
cloudConfig.Global.CompartmentID = tenancyID
}
ipManager := &InstancePoolManagerImpl{
cfg: cloudConfig,
staticInstancePools: map[string]*InstancePoolNodeGroup{},
shapeGetter: createShapeGetter(ShapeClientImpl{computeMgmtClient: computeMgmtClient, computeClient: computeClient}),
instancePoolCache: newInstancePoolCache(&computeMgmtClient, &computeClient, &networkClient),
}
// Contains all the specs from the args that give us the pools.
for _, arg := range discoveryOpts.NodeGroupSpecs {
ip, err := instancePoolFromArg(arg)
if err != nil {
return nil, fmt.Errorf("unable to construct instance pool from argument: %v", err)
}
ip.manager = ipManager
ip.kubeClient = kubeClient
ipManager.staticInstancePools[ip.Id()] = ip
}
// wait until we have an initial full poolCache.
err = wait.PollImmediateInfinite(
10*time.Second,
func() (bool, error) {
err := ipManager.Refresh()
if err != nil {
klog.Errorf("unable to fill cache on startup. Retrying: %+v", err)
return false, nil
}
return true, nil
})
if err != nil {
return nil, err
}
return ipManager, nil
}
// instancePoolFromArg parses an instance-pool spec represented in the form of `<minSize>:<maxSize>:<ocid>` and produces an instance-pool wrapper spec object.
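// For example, the (hypothetical) argument "1:10:ocid1.instancepool.oc1.phx.aaaaaaaa1" yields a
// wrapper with minSize 1, maxSize 10, and id "ocid1.instancepool.oc1.phx.aaaaaaaa1".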
func instancePoolFromArg(value string) (*InstancePoolNodeGroup, error) {
if !strings.Contains(value, ociInstancePoolResourceIdent) {
return nil, fmt.Errorf("instance pool manager does not work with resources of type: %s", value)
}
tokens := strings.SplitN(value, ":", 3)
if len(tokens) != 3 || !strings.HasPrefix(tokens[2], "ocid") {
return nil, fmt.Errorf("incorrect instance configuration: %s", value)
}
spec := &InstancePoolNodeGroup{}
if size, err := strconv.Atoi(tokens[0]); err == nil {
spec.minSize = size
} else {
return nil, fmt.Errorf("failed to set pool min size: %s %v", tokens[0], err)
}
if size, err := strconv.Atoi(tokens[1]); err == nil {
spec.maxSize = size
} else {
return nil, fmt.Errorf("failed to set pool max size: %s %v", tokens[1], err)
}
spec.id = tokens[2]
klog.Infof("static instance pool wrapper spec constructed: %+v", spec)
return spec, nil
}
// Refresh triggers refresh of cached resources.
func (m *InstancePoolManagerImpl) Refresh() error {
if m.lastRefresh.Add(m.cfg.Global.RefreshInterval).After(time.Now()) {
return nil
}
return m.forceRefresh()
}
func (m *InstancePoolManagerImpl) forceRefresh() error {
if m.cfg == nil {
return errors.New("instance pool manager does have a required config")
}
err := m.instancePoolCache.rebuild(m.staticInstancePools, *m.cfg)
if err != nil {
return err
}
m.lastRefresh = time.Now()
klog.Infof("Refreshed instance-pool list, next refresh after %v", m.lastRefresh.Add(m.cfg.Global.RefreshInterval))
return nil
}
// Cleanup cleans up open resources before the cloud provider is destroyed, i.e. go routines etc.
func (m *InstancePoolManagerImpl) Cleanup() error {
return nil
}
// GetInstancePools returns list of registered InstancePools.
func (m *InstancePoolManagerImpl) GetInstancePools() []*InstancePoolNodeGroup {
var instancePools []*InstancePoolNodeGroup
for _, np := range m.staticInstancePools {
instancePools = append(instancePools, np)
}
return instancePools
}
// GetInstancePoolNodes returns InstancePool nodes that are not in a terminal state.
func (m *InstancePoolManagerImpl) GetInstancePoolNodes(ip InstancePoolNodeGroup) ([]cloudprovider.Instance, error) {
klog.V(4).Infof("getting instances for node pool: %q", ip.Id())
instanceSummaries, err := m.instancePoolCache.getInstanceSummaries(ip.Id())
if err != nil {
return nil, err
}
var providerInstances []cloudprovider.Instance
for _, instance := range *instanceSummaries {
status := &cloudprovider.InstanceStatus{}
switch *instance.State {
case string(core.InstanceLifecycleStateRunning):
status.State = cloudprovider.InstanceRunning
case string(core.InstanceLifecycleStateCreatingImage):
status.State = cloudprovider.InstanceCreating
case string(core.InstanceLifecycleStateStarting):
status.State = cloudprovider.InstanceCreating
case string(core.InstanceLifecycleStateMoving):
status.State = cloudprovider.InstanceCreating
case string(core.InstanceLifecycleStateProvisioning):
status.State = cloudprovider.InstanceCreating
case string(core.InstanceLifecycleStateTerminating):
status.State = cloudprovider.InstanceDeleting
case string(core.InstanceLifecycleStateStopping):
status.State = cloudprovider.InstanceDeleting
}
// Instance not in a terminal or unknown state, ok to add.
if status.State != 0 {
providerInstances = append(providerInstances, cloudprovider.Instance{
Id: *instance.Id,
Status: status,
})
}
}
return providerInstances, nil
}
// GetInstancePoolForInstance returns InstancePool to which the given instance belongs. If
// PoolID is not set on the specified OciRef, we will look for a match.
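// Resolution falls back to matching the instance's OCID or private IP address (when present on
// the OciRef) against the cached instance-pool membership.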
func (m *InstancePoolManagerImpl) GetInstancePoolForInstance(instanceDetails OciRef) (*InstancePoolNodeGroup, error) {
if instanceDetails.PoolID != "" {
// It's possible that this instance belongs to an instance pool that was not specified via --nodes argument.
return m.staticInstancePools[instanceDetails.PoolID], nil
}
if instanceDetails.CompartmentID == "" {
// cfg.Global.CompartmentID would be set to tenancy OCID at runtime if compartment was not set.
instanceDetails.CompartmentID = m.cfg.Global.CompartmentID
}
// Details are missing from this instance - including the pool ID.
// Try to resolve them, though it may not be a member of an instance-pool we manage.
resolvedInstanceDetails, err := m.instancePoolCache.findInstanceByDetails(instanceDetails)
if err != nil {
return nil, err
} else if resolvedInstanceDetails == nil {
return nil, nil
}
kubeClient := m.staticInstancePools[resolvedInstanceDetails.PoolID].kubeClient
// Optionally annotate & label the node so that it does not need to be searched for in subsequent iterations.
_ = annotateNode(kubeClient, resolvedInstanceDetails.Name, ociInstanceIDAnnotation, resolvedInstanceDetails.InstanceID)
_ = annotateNode(kubeClient, resolvedInstanceDetails.Name, ociInstancePoolIDAnnotation, resolvedInstanceDetails.PoolID)
_ = annotateNode(kubeClient, resolvedInstanceDetails.Name, ociAnnotationCompartmentID, resolvedInstanceDetails.CompartmentID)
_ = labelNode(kubeClient, resolvedInstanceDetails.Name, apiv1.LabelTopologyZone, resolvedInstanceDetails.AvailabilityDomain)
_ = labelNode(kubeClient, resolvedInstanceDetails.Name, apiv1.LabelFailureDomainBetaZone, resolvedInstanceDetails.AvailabilityDomain)
_ = labelNode(kubeClient, resolvedInstanceDetails.Name, apiv1.LabelInstanceType, resolvedInstanceDetails.Shape)
_ = labelNode(kubeClient, resolvedInstanceDetails.Name, apiv1.LabelInstanceTypeStable, resolvedInstanceDetails.Shape)
_ = setNodeProviderID(kubeClient, resolvedInstanceDetails.Name, resolvedInstanceDetails.InstanceID)
return m.staticInstancePools[resolvedInstanceDetails.PoolID], nil
}
// GetInstancePoolTemplateNode returns a template node for the InstancePool.
func (m *InstancePoolManagerImpl) GetInstancePoolTemplateNode(ip InstancePoolNodeGroup) (*apiv1.Node, error) {
instancePool, err := m.instancePoolCache.getInstancePool(ip.Id())
if err != nil {
return nil, err
}
node, err := m.buildNodeFromTemplate(instancePool)
if err != nil {
return nil, err
}
return node, nil
}
// GetInstancePoolSize gets the instance-pool size.
func (m *InstancePoolManagerImpl) GetInstancePoolSize(ip InstancePoolNodeGroup) (int, error) {
return m.instancePoolCache.getSize(ip.Id())
}
// SetInstancePoolSize sets instance-pool size.
func (m *InstancePoolManagerImpl) SetInstancePoolSize(np InstancePoolNodeGroup, size int) error {
err := m.instancePoolCache.setSize(np.Id(), size)
if err != nil {
return err
}
// Interface says this function should wait until node group size is updated.
// We do not wait for the work request to finish or nodes become active on purpose. This allows
// the autoscaler to make decisions quicker especially since the autoscaler is aware of
// unregistered nodes in addition to registered nodes.
return nil
}
// DeleteInstances deletes the given instances. All instances must be controlled by the same instance-pool.
func (m *InstancePoolManagerImpl) DeleteInstances(instancePool InstancePoolNodeGroup, instances []OciRef) error {
klog.Infof("DeleteInstances called on instance pool %s", instancePool.Id())
for _, instance := range instances {
// removeInstance auto decrements instance pool size.
detached := m.instancePoolCache.removeInstance(instancePool, instance.InstanceID)
if !detached {
return fmt.Errorf("could not delete instance %s from instance pool %s", instance.InstanceID, instancePool.Id())
}
}
return nil
}
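// buildNodeFromTemplate constructs a template node for the instance-pool from its shape
// resources (CPU, memory, GPU), availability domain, and generic labels.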
func (m *InstancePoolManagerImpl) buildNodeFromTemplate(instancePool *core.InstancePool) (*apiv1.Node, error) {
node := apiv1.Node{}
nodeName := fmt.Sprintf("%s-%d", "inst", 555555)
ocidParts := strings.Split(*instancePool.Id, ".")
instanceIDPlaceholder := ocidParts[0] + "." + "instance" + "." + ocidParts[2] + "." + ocidParts[3] + ".tbd"
annotations := make(map[string]string)
annotations[ociAnnotationCompartmentID] = *instancePool.CompartmentId
annotations[ociInstancePoolIDAnnotation] = *instancePool.Id
annotations[ociInstanceIDAnnotation] = instanceIDPlaceholder
node.ObjectMeta = metav1.ObjectMeta{
Name: nodeName,
Labels: map[string]string{},
Annotations: annotations,
}
node.Status = apiv1.NodeStatus{
Capacity: apiv1.ResourceList{},
}
shape, err := m.shapeGetter.GetInstancePoolShape(instancePool)
if err != nil {
return nil, err
}
if shape.GPU > 0 {
node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{
Key: "nvidia.com/gpu",
Value: "",
Effect: "NoSchedule",
})
}
node.Status.Capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(int64(shape.CPU), resource.DecimalSI)
node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(int64(shape.MemoryInBytes), resource.DecimalSI)
node.Status.Capacity[ResourceGPU] = *resource.NewQuantity(int64(shape.GPU), resource.DecimalSI)
node.Status.Allocatable = node.Status.Capacity
availabilityDomain, err := getInstancePoolAvailabilityDomain(instancePool)
if err != nil {
return nil, err
}
node.Labels = cloudprovider.JoinStringMaps(node.Labels, buildGenericLabelsForInstancePool(instancePool, nodeName, shape.Name, availabilityDomain))
node.Status.Conditions = cloudprovider.BuildReadyConditions()
return &node, nil
}
// getInstancePoolAvailabilityDomain determines the availability domain of the instance pool.
// This breaks down if the customer specifies more than one placement configuration,
// so the best practice is one node pool per AD if customers care about it during scheduling.
// If more than one AD is defined, we always return the first one.
func getInstancePoolAvailabilityDomain(ip *core.InstancePool) (string, error) {
if len(ip.PlacementConfigurations) == 0 {
// At least one placement configuration is required for an instance pool, so we should not get here.
return "", fmt.Errorf("instance-pool %q does not have the required placement configurations", *ip.Id)
}
if len(ip.PlacementConfigurations) > 1 {
klog.Warningf("instance-pool %q has more than 1 placement config so picking first availability domain", *ip.Id)
}
// Get the availability domain which is by default in the format of `Uocm:PHX-AD-1`
// and remove the hash prefix.
availabilityDomain := strings.Split(*ip.PlacementConfigurations[0].AvailabilityDomain, ":")[1]
return availabilityDomain, nil
}
func buildGenericLabelsForInstancePool(instancePool *core.InstancePool, nodeName, shape, availabilityDomain string) map[string]string {
result := make(map[string]string)
result[kubeletapis.LabelArch] = cloudprovider.DefaultArch
result[apiv1.LabelArchStable] = cloudprovider.DefaultArch
result[kubeletapis.LabelOS] = cloudprovider.DefaultOS
result[apiv1.LabelOSStable] = cloudprovider.DefaultOS
parts := strings.Split(*instancePool.Id, ".")
if len(parts) == 5 {
// backward compatibility with older pod labels
result[apiv1.LabelZoneRegion] = parts[3]
result[apiv1.LabelZoneRegionStable] = parts[3]
}
result[apiv1.LabelInstanceType] = shape
result[apiv1.LabelInstanceTypeStable] = shape
result[apiv1.LabelZoneFailureDomain] = availabilityDomain
// backward compatibility with older pod labels
result[apiv1.LabelZoneFailureDomainStable] = availabilityDomain
result[apiv1.LabelHostname] = nodeName
return result
}


@ -0,0 +1,511 @@
package oci
import (
"context"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
"reflect"
"testing"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
)
type mockComputeManagementClient struct {
err error
getInstancePoolResponse core.GetInstancePoolResponse
getInstancePoolInstanceResponse core.GetInstancePoolInstanceResponse
listInstancePoolInstancesResponse core.ListInstancePoolInstancesResponse
updateInstancePoolResponse core.UpdateInstancePoolResponse
detachInstancePoolInstanceResponse core.DetachInstancePoolInstanceResponse
}
type mockVirtualNetworkClient struct {
err error
getVnicResponse core.GetVnicResponse
}
type mockComputeClient struct {
err error
listVnicAttachmentsResponse core.ListVnicAttachmentsResponse
}
func (m *mockComputeClient) ListVnicAttachments(ctx context.Context, request core.ListVnicAttachmentsRequest) (core.ListVnicAttachmentsResponse, error) {
return m.listVnicAttachmentsResponse, m.err
}
func (m *mockVirtualNetworkClient) GetVnic(context.Context, core.GetVnicRequest) (core.GetVnicResponse, error) {
return m.getVnicResponse, m.err
}
func (m *mockComputeManagementClient) ListInstancePoolInstances(_ context.Context, _ core.ListInstancePoolInstancesRequest) (core.ListInstancePoolInstancesResponse, error) {
return m.listInstancePoolInstancesResponse, m.err
}
func (m *mockComputeManagementClient) GetInstancePool(context.Context, core.GetInstancePoolRequest) (core.GetInstancePoolResponse, error) {
return m.getInstancePoolResponse, m.err
}
func (m *mockComputeManagementClient) UpdateInstancePool(context.Context, core.UpdateInstancePoolRequest) (core.UpdateInstancePoolResponse, error) {
return m.updateInstancePoolResponse, m.err
}
func (m *mockComputeManagementClient) GetInstancePoolInstance(context.Context, core.GetInstancePoolInstanceRequest) (core.GetInstancePoolInstanceResponse, error) {
return m.getInstancePoolInstanceResponse, m.err
}
func (m *mockComputeManagementClient) DetachInstancePoolInstance(context.Context, core.DetachInstancePoolInstanceRequest) (core.DetachInstancePoolInstanceResponse, error) {
return m.detachInstancePoolInstanceResponse, m.err
}
var computeClient = &mockComputeClient{
err: nil,
listVnicAttachmentsResponse: core.ListVnicAttachmentsResponse{
RawResponse: nil,
Items: []core.VnicAttachment{{
Id: common.String("ocid1.vnic.oc1.phx.abc"),
LifecycleState: core.VnicAttachmentLifecycleStateAttached,
}},
},
}
var computeManagementClient = &mockComputeManagementClient{
err: nil,
getInstancePoolResponse: core.GetInstancePoolResponse{
InstancePool: core.InstancePool{
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
LifecycleState: core.InstancePoolLifecycleStateRunning,
Size: common.Int(2),
},
},
listInstancePoolInstancesResponse: core.ListInstancePoolInstancesResponse{
RawResponse: nil,
Items: []core.InstanceSummary{{
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
AvailabilityDomain: common.String("Uocm:PHX-AD-2"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
DisplayName: common.String("inst-1ncvn-ociinstancepool"),
Shape: common.String("VM.Standard2.8"),
State: common.String(string(core.InstanceLifecycleStateRunning)),
}, {
Id: common.String("ocid1.instance.oc1.phx.aaacachemiss"),
AvailabilityDomain: common.String("Uocm:PHX-AD-2"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
DisplayName: common.String("inst-2ncvn-ociinstancepool"),
Shape: common.String("VM.Standard2.8"),
State: common.String(string(core.InstanceLifecycleStateRunning)),
}},
},
}
var virtualNetworkClient = &mockVirtualNetworkClient{
err: nil,
getVnicResponse: core.GetVnicResponse{
RawResponse: nil,
Vnic: core.Vnic{
Id: common.String("ocid1.vnic.oc1.phx.abyhqljsxigued23s7ywgcqlbpqfiysgnhxj672awzjluhoopzf7l7wvm6rq"),
PrivateIp: common.String("10.0.20.59"),
PublicIp: common.String("129.146.58.250"),
},
},
}
func TestInstancePoolFromArgs(t *testing.T) {
value := `1:5:ocid1.instancepool.oc1.phx.aaaaaaaah`
instanceNodePool, err := instancePoolFromArg(value)
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if instanceNodePool.minSize != 1 {
t.Errorf("got minSize %d ; wanted minSize 1", instanceNodePool.minSize)
}
if instanceNodePool.maxSize != 5 {
t.Errorf("got maxSize %d ; wanted maxSize 1", instanceNodePool.maxSize)
}
if instanceNodePool.id != "ocid1.instancepool.oc1.phx.aaaaaaaah" {
t.Errorf("got ocid %q ; wanted id \"ocid1.instancepool.oc1.phx.aaaaaaaah\"", instanceNodePool.id)
}
value = `1:5:ocid1.nodepool.oc1.phx.aaaaaaaah`
_, err = instancePoolFromArg(value)
if err == nil {
t.Fatal("expected error processing an oke based node-pool")
}
value = `1:5:incorrect:ocid1.instancepool.oc1.phx.aaaaaaaah`
_, err = instancePoolFromArg(value)
if err == nil {
t.Fatal("expected error of an invalid instance pool")
}
}
func TestGetSetInstancePoolSize(t *testing.T) {
nodePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
nodePoolCache.targetSize["ocid1.instancepool.oc1.phx.aaaaaaaai"] = 5
manager := &InstancePoolManagerImpl{instancePoolCache: nodePoolCache}
size, err := manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaai"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if size != 5 {
t.Errorf("got size %d ; wanted size 5", size)
}
err = manager.SetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaai"}, 6)
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
size, err = manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaai"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if size != 6 {
t.Errorf("got size %d ; wanted size 6", size)
}
}
func TestGetInstancePoolForInstance(t *testing.T) {
nodePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
nodePoolCache.poolCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &core.InstancePool{
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
Size: common.Int(1),
}
var cloudConfig = CloudConfig{}
cloudConfig.Global.CompartmentID = "compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf"
manager := &InstancePoolManagerImpl{
cfg: &cloudConfig,
staticInstancePools: map[string]*InstancePoolNodeGroup{
"ocid1.instancepool.oc1.phx.aaaaaaaa1": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"},
"ocid1.instancepool.oc1.phx.aaaaaaaa2": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa2"},
},
instancePoolCache: nodePoolCache,
}
// first verify instance pool can be found when only the instance id is specified.
np, err := manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaa1"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if np.Id() != "ocid1.instancepool.oc1.phx.aaaaaaaa1" {
t.Fatalf("got unexpected ocid %q ; wanted \"ocid1.instancepool.oc1.phx.aaaaaaaa1\"", np.Id())
}
// next, verify a valid instance can be found if it is currently missing from the cache.
np, err = manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaacachemiss"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if np.Id() != "ocid1.instancepool.oc1.phx.aaaaaaaa1" {
t.Fatalf("got unexpected ocid %q ; wanted \"ocid1.instancepool.oc1.phx.aaaaaaaa1s\"", np.Id())
}
// next, verify an invalid instance cant be found if it is missing from the cache and pool.
_, err = manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaadne"})
if err != errInstanceInstancePoolNotFound {
t.Fatalf("epected error looking for an invalid instance")
}
// verify an invalid instance pool produces an error.
ip, err := manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaadne", PoolID: "ocid1.instancepool.oc1.phx.aaaaaaaadne"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if ip != nil {
t.Fatalf("expected nil looking for an instance with invalid instance & pool ids")
}
// next verify instance pool can be found when the instance pool id is specified directly.
_, err = manager.GetInstancePoolForInstance(OciRef{PoolID: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
// next verify instance pool can be found when only the private IP is specified.
np, err = manager.GetInstancePoolForInstance(OciRef{
PrivateIPAddress: "10.0.20.59"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if np.Id() != "ocid1.instancepool.oc1.phx.aaaaaaaa1" {
t.Fatalf("got unexpected ocid %q ; wanted \"ocid1.instancepool.oc1.phx.aaaaaaaa1\"", np.Id())
}
// now verify the node pool can be found via lookup by instance id in the poolCache
np, err = manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaa1"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if np.Id() != "ocid1.instancepool.oc1.phx.aaaaaaaa1" {
t.Fatalf("got unexpected ocid %q ; wanted \"ocid1.instancepool.oc1.phx.aaaaaaaa1\"", np.Id())
}
}
func TestGetInstancePoolNodes(t *testing.T) {
nodePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
nodePoolCache.poolCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &core.InstancePool{
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaa1"),
LifecycleState: core.InstancePoolLifecycleStateRunning,
}
nodePoolCache.instanceSummaryCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &[]core.InstanceSummary{{
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
AvailabilityDomain: common.String("PHX-AD-2"),
State: common.String(string(core.InstanceLifecycleStateRunning)),
}, {
Id: common.String("ocid1.instance.oc1.phx.aaa2"),
AvailabilityDomain: common.String("PHX-AD-1"),
State: common.String(string(core.InstanceLifecycleStateTerminating)),
},
}
expected := []cloudprovider.Instance{
{
Id: "ocid1.instance.oc1.phx.aaa1",
Status: &cloudprovider.InstanceStatus{
State: cloudprovider.InstanceRunning,
},
},
{
Id: "ocid1.instance.oc1.phx.aaa2",
Status: &cloudprovider.InstanceStatus{
State: cloudprovider.InstanceDeleting,
},
},
}
manager := &InstancePoolManagerImpl{instancePoolCache: nodePoolCache, cfg: &CloudConfig{}}
instances, err := manager.GetInstancePoolNodes(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
if err != nil {
t.Fatalf("received unexpected error; %+v", err)
}
if !reflect.DeepEqual(instances, expected) {
t.Errorf("got %+v\nwanted %+v", instances, expected)
}
err = manager.forceRefresh()
if err != nil {
t.Fatalf("received unexpected error refreshing cache; %+v", err)
}
}
func TestGetInstancePoolAvailabilityDomain(t *testing.T) {
testCases := map[string]struct {
np *core.InstancePool
result string
expectedErr bool
}{
"single ad": {
np: &core.InstancePool{
Id: common.String("id"),
LifecycleState: "",
PlacementConfigurations: []core.InstancePoolPlacementConfiguration{{
AvailabilityDomain: common.String("hash:US-ASHBURN-1"),
PrimarySubnetId: common.String("ocid1.subnet.oc1.phx.aaaaaaaa1"),
}},
Size: common.Int(2),
},
result: "US-ASHBURN-1",
},
"multi-ad": {
np: &core.InstancePool{
Id: common.String("id"),
LifecycleState: "",
PlacementConfigurations: []core.InstancePoolPlacementConfiguration{{
AvailabilityDomain: common.String("hash:US-ASHBURN-1"),
PrimarySubnetId: common.String("ocid1.subnet.oc1.phx.aaaaaaaa1"),
}, {
AvailabilityDomain: common.String("hash:US-ASHBURN-2"),
PrimarySubnetId: common.String("ocid1.subnet.oc1.phx.aaaaaaaa2"),
}},
Size: common.Int(2),
},
result: "US-ASHBURN-1",
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
ad, err := getInstancePoolAvailabilityDomain(tc.np)
if tc.expectedErr {
if err == nil {
t.Fatalf("expected err but not nil")
}
return
}
if ad != tc.result {
t.Errorf("got %q ; wanted %q", ad, tc.result)
}
})
}
}
func TestGetInstancePoolsAndInstances(t *testing.T) {
var computeManagementClient = &mockComputeManagementClient{
getInstancePoolResponse: core.GetInstancePoolResponse{
InstancePool: core.InstancePool{
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
PlacementConfigurations: nil,
Size: common.Int(2),
},
},
listInstancePoolInstancesResponse: core.ListInstancePoolInstancesResponse{
Items: []core.InstanceSummary{{
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
AvailabilityDomain: common.String("Uocm:PHX-AD-2"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
DisplayName: common.String("inst-1ncvn-ociinstancepool"),
Shape: common.String("VM.Standard2.8"),
State: common.String(string(core.InstanceLifecycleStateRunning)),
}, {
Id: common.String("ocid1.instance.oc1.phx.aaaterminal"),
AvailabilityDomain: common.String("Uocm:PHX-AD-2"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
DisplayName: common.String("inst-2ncvn-ociinstancepool"),
Shape: common.String("VM.Standard2.8"),
State: common.String(string(core.InstanceLifecycleStateTerminated)),
}},
},
}
cloudConfig := &CloudConfig{}
cloudConfig.Global.CompartmentID = "ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
manager := &InstancePoolManagerImpl{
cfg: cloudConfig,
staticInstancePools: map[string]*InstancePoolNodeGroup{
"ocid1.instancepool.oc1.phx.aaaaaaaa1": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"},
},
instancePoolCache: newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient),
}
// Populate cache(s) (twice to increase code coverage).
_ = manager.Refresh()
err := manager.Refresh()
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
err = manager.forceRefresh()
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
instancePoolNodeGroups := manager.GetInstancePools()
if got := len(instancePoolNodeGroups); got != 1 {
t.Fatalf("expected 1 (static) instance pool, got %d", got)
}
instances, err := manager.GetInstancePoolNodes(*instancePoolNodeGroups[0])
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
// Should not pick up terminated instance.
if got := len(instances); got != 1 {
t.Fatalf("expected 1 instance, got %d", got)
}
instancePoolNodeGroup, err := manager.GetInstancePoolForInstance(OciRef{InstanceID: instances[0].Id})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if !reflect.DeepEqual(instancePoolNodeGroup, instancePoolNodeGroups[0]) {
t.Errorf("got %+v\nwanted %+v", instancePoolNodeGroup, instancePoolNodeGroups[0])
}
}
func TestDeleteInstances(t *testing.T) {
var computeManagementClient = &mockComputeManagementClient{
getInstancePoolResponse: core.GetInstancePoolResponse{
InstancePool: core.InstancePool{
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
LifecycleState: core.InstancePoolLifecycleStateRunning,
Size: common.Int(2),
},
},
listInstancePoolInstancesResponse: core.ListInstancePoolInstancesResponse{
Items: []core.InstanceSummary{{
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
AvailabilityDomain: common.String("Uocm:PHX-AD-1"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
DisplayName: common.String("inst-1ncvn-ociinstancepool"),
Shape: common.String("VM.Standard2.16"),
State: common.String(string(core.InstanceLifecycleStateRunning)),
}, {
Id: common.String("ocid1.instance.oc1.phx.aaa2"),
AvailabilityDomain: common.String("Uocm:PHX-AD-1"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
DisplayName: common.String("inst-2ncvn-ociinstancepool"),
Shape: common.String("VM.Standard2.16"),
State: common.String(string(core.InstanceLifecycleStateRunning)),
}},
},
}
cloudConfig := &CloudConfig{}
cloudConfig.Global.CompartmentID = "ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
manager := &InstancePoolManagerImpl{
cfg: cloudConfig,
staticInstancePools: map[string]*InstancePoolNodeGroup{
"ocid1.instancepool.oc1.phx.aaaaaaaa1": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"},
},
instancePoolCache: newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient),
}
// Populate cache(s).
manager.Refresh()
instances, err := manager.GetInstancePoolNodes(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
// Both running instances should be returned.
if got := len(instances); got != 2 {
t.Fatalf("expected 2 instances, got %d", got)
}
// Check size before and after delete
size, err := manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if size != 2 {
t.Errorf("got size %d ; wanted size 2 before delete", size)
}
instanceToDelete := OciRef{
AvailabilityDomain: "PHX-AD-1",
Name: "inst-2ncvn-ociinstancepool",
InstanceID: "ocid1.instance.oc1.phx.aaa2",
PoolID: "ocid1.instancepool.oc1.phx.aaaaaaaa1",
}
err = manager.DeleteInstances(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"}, []OciRef{instanceToDelete})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
size, err = manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
if size != 1 {
t.Errorf("got size %d ; wanted size 1 *after* delete", size)
}
}


@ -0,0 +1,66 @@
/*
Copyright 2021 Oracle and/or its affiliates.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package oci
import (
apiv1 "k8s.io/api/core/v1"
)
// OciRef contains a reference to some entity in the OCI world.
type OciRef struct {
AvailabilityDomain string
Name string
CompartmentID string
InstanceID string
PoolID string
PrivateIPAddress string
PublicIPAddress string
Shape string
}
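// nodeToOciRef builds an OciRef from the given node's name, labels, annotations, and addresses.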
func nodeToOciRef(n *apiv1.Node) (OciRef, error) {
return OciRef{
Name: n.ObjectMeta.Name,
AvailabilityDomain: n.Labels[apiv1.LabelZoneFailureDomain],
CompartmentID: n.Annotations[ociAnnotationCompartmentID],
InstanceID: n.Annotations[ociInstanceIDAnnotation],
PoolID: n.Annotations[ociInstancePoolIDAnnotation],
PrivateIPAddress: getNodeInternalAddress(n),
PublicIPAddress: getNodeExternalAddress(n),
Shape: n.Labels[apiv1.LabelInstanceType],
}, nil
}
// getNodeInternalAddress returns the first private address of the node, or an empty string if none is found.
func getNodeInternalAddress(node *apiv1.Node) string {
for _, address := range node.Status.Addresses {
if address.Type == apiv1.NodeInternalIP {
return address.Address
}
}
return ""
}
// getNodeExternalAddress returns the first public address of the node, or an empty string if none is found.
func getNodeExternalAddress(node *apiv1.Node) string {
for _, address := range node.Status.Addresses {
if address.Type == apiv1.NodeExternalIP {
return address.Address
}
}
return ""
}


@ -0,0 +1,136 @@
/*
Copyright 2021 Oracle and/or its affiliates.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package oci
import (
"context"
"fmt"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
"k8s.io/klog/v2"
)
// ShapeGetter returns the oci shape attributes for the pool.
type ShapeGetter interface {
GetInstancePoolShape(pool *core.InstancePool) (*Shape, error)
}
// ShapeClient is an interface around the GetInstanceConfiguration and ListShapes calls.
type ShapeClient interface {
GetInstanceConfiguration(context.Context, core.GetInstanceConfigurationRequest) (core.GetInstanceConfigurationResponse, error)
ListShapes(context.Context, core.ListShapesRequest) (core.ListShapesResponse, error)
}
// ShapeClientImpl is the implementation for fetching shape information.
type ShapeClientImpl struct {
// Can fetch instance configs (flexible shapes)
computeMgmtClient core.ComputeManagementClient
// Can fetch shapes directly
computeClient core.ComputeClient
}
// GetInstanceConfiguration gets the instance configuration.
func (cc ShapeClientImpl) GetInstanceConfiguration(ctx context.Context, req core.GetInstanceConfigurationRequest) (core.GetInstanceConfigurationResponse, error) {
return cc.computeMgmtClient.GetInstanceConfiguration(ctx, req)
}
// ListShapes lists the shapes.
func (cc ShapeClientImpl) ListShapes(ctx context.Context, req core.ListShapesRequest) (core.ListShapesResponse, error) {
return cc.computeClient.ListShapes(ctx, req)
}
// Shape includes the resource attributes of a given shape which should be used
// for constructing node templates.
type Shape struct {
Name string
CPU float32
GPU int
MemoryInBytes float32
}
// createShapeGetter creates a new oci shape getter.
func createShapeGetter(shapeClient ShapeClient) ShapeGetter {
return &shapeGetterImpl{
shapeClient: shapeClient,
cache: map[string]*Shape{},
}
}
type shapeGetterImpl struct {
shapeClient ShapeClient
cache map[string]*Shape
}
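// GetInstancePoolShape resolves the shape resources for the pool's instance configuration:
// flexible shapes are read from the launch details' shape config, while fixed shapes are
// looked up via ListShapes.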
func (osf *shapeGetterImpl) GetInstancePoolShape(ip *core.InstancePool) (*Shape, error) {
shape := &Shape{}
klog.V(5).Info("fetching shape configuration details for instance-pool " + *ip.Id)
instanceConfig, err := osf.shapeClient.GetInstanceConfiguration(context.Background(), core.GetInstanceConfigurationRequest{
InstanceConfigurationId: ip.InstanceConfigurationId,
})
if err != nil {
return nil, err
}
if instanceConfig.InstanceDetails == nil {
return nil, fmt.Errorf("instance configuration details for instance %s has not been set", *ip.Id)
}
if instanceDetails, ok := instanceConfig.InstanceDetails.(core.ComputeInstanceDetails); ok {
// For flexible shapes, use the shape config details; otherwise, look up the static shape details below.
if instanceDetails.LaunchDetails != nil && instanceDetails.LaunchDetails.ShapeConfig != nil {
if instanceDetails.LaunchDetails.Shape != nil {
shape.Name = *instanceDetails.LaunchDetails.Shape
}
if instanceDetails.LaunchDetails.ShapeConfig.Ocpus != nil {
shape.CPU = *instanceDetails.LaunchDetails.ShapeConfig.Ocpus
// Default to 1 GB of memory per OCPU unless MemoryInGBs is explicitly set below.
shape.MemoryInBytes = *instanceDetails.LaunchDetails.ShapeConfig.Ocpus * 1024 * 1024 * 1024
}
if instanceDetails.LaunchDetails.ShapeConfig.MemoryInGBs != nil {
shape.MemoryInBytes = *instanceDetails.LaunchDetails.ShapeConfig.MemoryInGBs * 1024 * 1024 * 1024
}
} else {
allShapes, _ := osf.shapeClient.ListShapes(context.Background(), core.ListShapesRequest{
CompartmentId: instanceConfig.CompartmentId,
})
for _, nextShape := range allShapes.Items {
if *nextShape.Shape == *instanceDetails.LaunchDetails.Shape {
shape.Name = *nextShape.Shape
if nextShape.Ocpus != nil {
shape.CPU = *nextShape.Ocpus
}
if nextShape.MemoryInGBs != nil {
shape.MemoryInBytes = *nextShape.MemoryInGBs * 1024 * 1024 * 1024
}
if nextShape.Gpus != nil {
shape.GPU = *nextShape.Gpus
}
}
}
}
} else {
return nil, fmt.Errorf("(compute) instance configuration for instance-pool %s not found", *ip.Id)
}
// Didn't find a match
if shape.Name == "" {
return nil, fmt.Errorf("shape information for instance-pool %s not found", *ip.Id)
}
return shape, nil
}


@ -0,0 +1,237 @@
package oci
import (
"context"
apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
kubeletapis "k8s.io/kubelet/pkg/apis"
"reflect"
"strings"
"testing"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
)
type mockShapeClient struct {
err error
listShapeResp core.ListShapesResponse
getInstanceConfigResp core.GetInstanceConfigurationResponse
}
func (m *mockShapeClient) ListShapes(_ context.Context, _ core.ListShapesRequest) (core.ListShapesResponse, error) {
return m.listShapeResp, m.err
}
func (m *mockShapeClient) GetInstanceConfiguration(context.Context, core.GetInstanceConfigurationRequest) (core.GetInstanceConfigurationResponse, error) {
return m.getInstanceConfigResp, m.err
}
var launchDetails = core.InstanceConfigurationLaunchInstanceDetails{
CompartmentId: nil,
DisplayName: nil,
CreateVnicDetails: nil,
Shape: common.String("VM.Standard.E3.Flex"),
ShapeConfig: &core.InstanceConfigurationLaunchInstanceShapeConfigDetails{
Ocpus: common.Float32(8),
MemoryInGBs: common.Float32(128),
},
SourceDetails: nil,
}
var instanceDetails = core.ComputeInstanceDetails{
LaunchDetails: &launchDetails,
}
var shapeClient = &mockShapeClient{
err: nil,
listShapeResp: core.ListShapesResponse{
Items: []core.Shape{
{
Shape: common.String("VM.Standard2.8"),
Ocpus: common.Float32(8),
MemoryInGBs: common.Float32(120),
},
},
},
getInstanceConfigResp: core.GetInstanceConfigurationResponse{
RawResponse: nil,
InstanceConfiguration: core.InstanceConfiguration{
CompartmentId: nil,
Id: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
TimeCreated: nil,
DefinedTags: nil,
DisplayName: nil,
FreeformTags: nil,
InstanceDetails: instanceDetails,
DeferredFields: nil,
},
Etag: nil,
OpcRequestId: nil,
},
}
func TestGetShape(t *testing.T) {
testCases := map[string]struct {
shape string
expected *Shape
}{
"flex shape": {
shape: "VM.Standard.E3.Flex",
expected: &Shape{
Name: "VM.Standard.E3.Flex",
CPU: 8,
MemoryInBytes: float32(128) * 1024 * 1024 * 1024,
GPU: 0,
},
},
}
for name, tc := range testCases {
shapeGetter := createShapeGetter(shapeClient)
t.Run(name, func(t *testing.T) {
shape, err := shapeGetter.GetInstancePoolShape(&core.InstancePool{Id: &tc.shape, InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1")})
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(shape, tc.expected) {
t.Errorf("wanted %+v ; got %+v", tc.expected, shape)
}
if !strings.Contains(tc.shape, "Flex") {
// we can't cache flex shapes, so only check the cache for non-flex shapes
cacheShape, ok := shapeGetter.(*shapeGetterImpl).cache[tc.shape]
if !ok {
t.Error("shape not found in cache")
}
if !reflect.DeepEqual(cacheShape, tc.expected) {
t.Errorf("wanted %+v ; got %+v", tc.expected, cacheShape)
}
}
})
}
}
func TestGetInstancePoolTemplateNode(t *testing.T) {
instancePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
instancePoolCache.poolCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &core.InstancePool{
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaa1"),
LifecycleState: core.InstancePoolLifecycleStateRunning,
PlacementConfigurations: []core.InstancePoolPlacementConfiguration{{
AvailabilityDomain: common.String("hash:US-ASHBURN-1"),
PrimarySubnetId: common.String("ocid1.subnet.oc1.phx.aaaaaaaa1"),
}},
}
instancePoolCache.instanceSummaryCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &[]core.InstanceSummary{{
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
AvailabilityDomain: common.String("PHX-AD-2"),
State: common.String(string(core.InstanceLifecycleStateRunning)),
},
}
cloudConfig := &CloudConfig{}
cloudConfig.Global.CompartmentID = "ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
var manager = &InstancePoolManagerImpl{
cfg: cloudConfig,
shapeGetter: createShapeGetter(shapeClient),
staticInstancePools: map[string]*InstancePoolNodeGroup{
"ocid1.instancepool.oc1.phx.aaaaaaaa1": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"},
},
instancePoolCache: instancePoolCache,
}
instancePoolNodeGroups := manager.GetInstancePools()
if got := len(instancePoolNodeGroups); got != 1 {
t.Fatalf("expected 1 (static) instance pool, got %d", got)
}
nodeTemplate, err := manager.GetInstancePoolTemplateNode(*instancePoolNodeGroups[0])
if err != nil {
t.Fatalf("received unexpected error refreshing cache; %+v", err)
}
labels := nodeTemplate.GetLabels()
if labels == nil {
t.Fatalf("expected labels on node object")
}
// Double check the shape label.
if got := labels[apiv1.LabelInstanceTypeStable]; got != "VM.Standard.E3.Flex" {
t.Fatalf("expected shape label %s to be set to VM.Standard.E3.Flex: %v", apiv1.LabelInstanceTypeStable, nodeTemplate.Labels)
}
// Also check the AD label for good measure.
if got := labels[apiv1.LabelTopologyZone]; got != "US-ASHBURN-1" {
t.Fatalf("expected AD zone label %s to be set to US-ASHBURN-1: %v", apiv1.LabelTopologyZone, nodeTemplate.Labels)
}
}
func TestBuildGenericLabels(t *testing.T) {
shapeName := "VM.Standard2.8"
np := &core.InstancePool{
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaah"),
Size: common.Int(2),
}
nodeName := "node1"
availabilityDomain := "US-ASHBURN-1"
expected := map[string]string{
kubeletapis.LabelArch: cloudprovider.DefaultArch,
apiv1.LabelArchStable: cloudprovider.DefaultArch,
kubeletapis.LabelOS: cloudprovider.DefaultOS,
apiv1.LabelOSStable: cloudprovider.DefaultOS,
apiv1.LabelZoneRegion: "phx",
apiv1.LabelZoneRegionStable: "phx",
apiv1.LabelInstanceType: shapeName,
apiv1.LabelInstanceTypeStable: shapeName,
apiv1.LabelZoneFailureDomain: availabilityDomain,
apiv1.LabelZoneFailureDomainStable: availabilityDomain,
apiv1.LabelHostname: nodeName,
}
launchDetails := core.InstanceConfigurationLaunchInstanceDetails{
Shape: common.String("VM.Standard2.8"),
}
instanceDetails := core.ComputeInstanceDetails{
LaunchDetails: &launchDetails,
}
// For list shapes
mockShapeClient := &mockShapeClient{
err: nil,
listShapeResp: core.ListShapesResponse{
Items: []core.Shape{
{Shape: common.String("VM.Standard2.4"), Ocpus: common.Float32(4), MemoryInGBs: common.Float32(60)},
{Shape: common.String("VM.Standard2.8"), Ocpus: common.Float32(8), MemoryInGBs: common.Float32(120)}},
},
getInstanceConfigResp: core.GetInstanceConfigurationResponse{
InstanceConfiguration: core.InstanceConfiguration{
Id: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
InstanceDetails: instanceDetails,
},
},
}
shapeGetter := createShapeGetter(mockShapeClient)
manager := InstancePoolManagerImpl{
shapeGetter: shapeGetter,
}
shape, err := manager.shapeGetter.GetInstancePoolShape(np)
if err != nil {
t.Fatalf("unexpected error: %+v", err)
}
output := buildGenericLabelsForInstancePool(np, nodeName, shape.Name, availabilityDomain)
if !reflect.DeepEqual(output, expected) {
t.Fatalf("got %+v\nwanted %+v", output, expected)
}
}


@ -0,0 +1,211 @@
/*
Copyright 2021 Oracle and/or its affiliates.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package oci
import (
"context"
"fmt"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"math"
"math/rand"
"net"
"net/http"
"time"
"github.com/pkg/errors"
apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
"k8s.io/kubernetes/pkg/apis/scheduling"
)
// IsRetryable returns true if the given error is retryable.
func IsRetryable(err error) bool {
if err == nil {
return false
}
err = errors.Cause(err)
// Retry on network timeout errors
if err, ok := err.(net.Error); ok && err.Timeout() {
return true
}
// handle oci retryable errors.
serviceErr, ok := common.IsServiceError(err)
if !ok {
return false
}
switch serviceErr.GetHTTPStatusCode() {
case http.StatusTooManyRequests, http.StatusGatewayTimeout,
http.StatusInternalServerError, http.StatusBadGateway:
return true
default:
return false
}
}
func newRetryPolicy() *common.RetryPolicy {
return NewRetryPolicyWithMaxAttempts(uint(8))
}
// NewRetryPolicyWithMaxAttempts returns a RetryPolicy with the specified max retryAttempts
func NewRetryPolicyWithMaxAttempts(retryAttempts uint) *common.RetryPolicy {
isRetryableOperation := func(r common.OCIOperationResponse) bool {
return IsRetryable(r.Error)
}
nextDuration := func(r common.OCIOperationResponse) time.Duration {
// you might want to wait longer for the next retry when the previous one failed
// this function will return the duration as:
// 1s, 2s, 4s, 8s, 16s, 32s, 64s etc...
return time.Duration(math.Pow(float64(2), float64(r.AttemptNumber-1))) * time.Second
}
policy := common.NewRetryPolicy(
retryAttempts, isRetryableOperation, nextDuration,
)
return &policy
}
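// For example (a sketch; "client" stands in for any OCI SDK client that accepts a custom
// configuration, as done in CreateInstancePoolManager):
//
//	policy := NewRetryPolicyWithMaxAttempts(8)
//	client.SetCustomClientConfiguration(common.CustomClientConfiguration{RetryPolicy: policy})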
// Missing resource requests on kube-proxy
// Flannel missing priority
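// buildCSINodePod constructs a synthetic, already-Running csi-oci-node pod in kube-system with
// system-critical priority (presumably a stand-in for the CSI daemonset pod when estimating
// node capacity).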
func buildCSINodePod() *apiv1.Pod {
priority := scheduling.SystemCriticalPriority
return &apiv1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("csi-oci-node-%d", rand.Int63()),
Namespace: "kube-system",
Labels: map[string]string{
"app": "csi-oci-node",
},
},
Spec: apiv1.PodSpec{
Containers: []apiv1.Container{
{
Image: "iad.ocir.io/oracle/cloud-provider-oci:latest",
},
},
Priority: &priority,
},
Status: apiv1.PodStatus{
Phase: apiv1.PodRunning,
Conditions: []apiv1.PodCondition{
{
Type: apiv1.PodReady,
Status: apiv1.ConditionTrue,
},
},
},
}
}
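// annotateNode sets the given annotation on the node if it is not already set to that value.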
func annotateNode(kubeClient kubernetes.Interface, nodeName string, key string, value string) error {
if nodeName == "" {
return errors.New("node name is required")
}
if kubeClient == nil {
return errors.New("kubeconfig is required")
}
node, err := kubeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
if err != nil {
klog.Errorf("failed to get node %s %+v", nodeName, err)
return err
}
annotations := node.GetAnnotations()
if annotations == nil {
annotations = map[string]string{}
}
if v := annotations[key]; v != value {
node.Annotations[key] = value
_, err := kubeClient.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("failed to annotate node %s %+v", nodeName, err)
return err
}
klog.V(3).Infof("updated annotation %s=%s on node: %s", key, value, nodeName)
}
return nil
}
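// labelNode sets the given label on the node if it is not already set to that value.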
func labelNode(kubeClient kubernetes.Interface, nodeName string, key string, value string) error {
if nodeName == "" {
return errors.New("node name is required")
}
if kubeClient == nil {
return errors.New("kubeconfig is required")
}
node, err := kubeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
if err != nil {
klog.Errorf("failed to get node %s %+v", nodeName, err)
return err
}
labels := node.GetLabels()
if labels == nil {
labels = map[string]string{}
}
if v := labels[key]; v != value {
node.Labels[key] = value
_, err := kubeClient.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("failed to label node %s %+v", nodeName, err)
return err
}
klog.V(3).Infof("updated label %s=%s on node: %s", key, value, nodeName)
}
return nil
}
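// setNodeProviderID sets the node's spec.providerID if it differs from the given value.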
func setNodeProviderID(kubeClient kubernetes.Interface, nodeName string, value string) error {
if nodeName == "" {
return errors.New("node name is required")
}
if kubeClient == nil {
return errors.New("kubeconfig is required")
}
node, err := kubeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
if err != nil {
klog.Errorf("failed to get node %s %+v", nodeName, err)
return err
}
if node.Spec.ProviderID != value {
node.Spec.ProviderID = value
_, err := kubeClient.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("failed to update node's provider ID %s %+v", nodeName, err)
return err
}
klog.V(3).Infof("updated provider ID on node: %s", nodeName)
}
return nil
}


@ -0,0 +1,12 @@
package oci
import (
"testing"
)
func TestSetProviderID(t *testing.T) {
err := setNodeProviderID(nil, "", "")
if err == nil {
t.Fatal("expected error")
}
}


@ -157,7 +157,8 @@ skipped_dirs = ['Godeps', 'third_party', '_gopath', '_output', '.git', 'cluster/
"cluster-autoscaler/cloudprovider/digitalocean/godo",
"cluster-autoscaler/cloudprovider/magnum/gophercloud",
"cluster-autoscaler/cloudprovider/ionoscloud/ionos-cloud-sdk-go",
"cluster-autoscaler/cloudprovider/hetzner/hcloud-go"]
"cluster-autoscaler/cloudprovider/hetzner/hcloud-go",
"cluster-autoscaler/cloudprovider/oci"]
# list all the files contain 'DO NOT EDIT', but are not generated
skipped_ungenerated_files = ['hack/build-ui.sh', 'hack/lib/swagger.sh',


@ -23,4 +23,4 @@ DIR=$(dirname $0)
go install ${DIR}/../../../github.com/client9/misspell/cmd/misspell
# Spell checking
git ls-files --full-name | grep -v -e vendor | grep -v cluster-autoscaler/cloudprovider/magnum/gophercloud| grep -v cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud-sdk-go-v3 | grep -v cluster-autoscaler/cloudprovider/digitalocean/godo | grep -v cluster-autoscaler/cloudprovider/hetzner/hcloud-go | grep -v cluster-autoscaler/cloudprovider/bizflycloud/gobizfly | grep -E -v 'cluster-autoscaler/cloudprovider/brightbox/(go-cache|gobrightbox|k8ssdk|linkheader)' | xargs misspell -error -o stderr
git ls-files --full-name | grep -v -e vendor | grep -v cluster-autoscaler/cloudprovider/magnum/gophercloud| grep -v cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud-sdk-go-v3 | grep -v cluster-autoscaler/cloudprovider/digitalocean/godo | grep -v cluster-autoscaler/cloudprovider/hetzner/hcloud-go | grep -v cluster-autoscaler/cloudprovider/bizflycloud/gobizfly | grep -v cluster-autoscaler/cloudprovider/oci/oci-go-sdk | grep -E -v 'cluster-autoscaler/cloudprovider/brightbox/(go-cache|gobrightbox|k8ssdk|linkheader)' | xargs misspell -error -o stderr