OCI provider based on instance-pools and instance-configurations.
parent d3784072e4
commit 65ebd8db83
@@ -25,6 +25,7 @@ You should also take a look at the notes and "gotchas" for your specific cloud p
|
|||
* [IonosCloud](./cloudprovider/ionoscloud/README.md)
|
||||
* [OVHcloud](./cloudprovider/ovhcloud/README.md)
|
||||
* [Linode](./cloudprovider/linode/README.md)
|
||||
* [OracleCloud](./cloudprovider/oci/README.md)
|
||||
* [ClusterAPI](./cloudprovider/clusterapi/README.md)
|
||||
* [BizflyCloud](./cloudprovider/bizflycloud/README.md)
|
||||
|
||||
|
|
@@ -165,5 +166,6 @@ Supported cloud providers:
|
|||
* Equinix Metal https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/packet/README.md
|
||||
* OVHcloud https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/ovhcloud/README.md
|
||||
* Linode https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/linode/README.md
|
||||
* OCI https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/oci/README.md
|
||||
* Hetzner https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/hetzner/README.md
|
||||
* Cluster API https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/clusterapi/README.md
|
||||
|
|
|
|||
|
|
@@ -1,5 +1,5 @@
|
|||
//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet
|
||||
// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet
|
||||
//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet && !oci
|
||||
// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet,!oci
|
||||
|
||||
/*
|
||||
Copyright 2018 The Kubernetes Authors.
|
||||
|
|
@@ -37,6 +37,7 @@ import (
|
|||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/ionoscloud"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/linode"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/magnum"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/ovhcloud"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/packet"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/config"
|
||||
|
|
@@ -55,6 +56,7 @@ var AvailableCloudProviders = []string{
|
|||
cloudprovider.ExoscaleProviderName,
|
||||
cloudprovider.HuaweicloudProviderName,
|
||||
cloudprovider.HetznerProviderName,
|
||||
cloudprovider.OracleCloudProviderName,
|
||||
cloudprovider.OVHcloudProviderName,
|
||||
cloudprovider.ClusterAPIProviderName,
|
||||
cloudprovider.IonoscloudProviderName,
|
||||
|
|
@@ -105,6 +107,8 @@ func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGro
|
|||
return ionoscloud.BuildIonosCloud(opts, do, rl)
|
||||
case cloudprovider.LinodeProviderName:
|
||||
return linode.BuildLinode(opts, do, rl)
|
||||
case cloudprovider.OracleCloudProviderName:
|
||||
return oci.BuildOCI(opts, do, rl)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,43 @@
|
|||
//go:build oci
|
||||
// +build oci
|
||||
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package builder
|
||||
|
||||
import (
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/config"
|
||||
)
|
||||
|
||||
// AvailableCloudProviders supported by the cloud provider builder.
|
||||
var AvailableCloudProviders = []string{
|
||||
cloudprovider.OracleCloudProviderName,
|
||||
}
|
||||
|
||||
// DefaultCloudProvider for oci-only build is oci.
|
||||
const DefaultCloudProvider = cloudprovider.OracleCloudProviderName
|
||||
|
||||
func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider {
|
||||
switch opts.CloudProviderName {
|
||||
case cloudprovider.OracleCloudProviderName:
|
||||
return oci.BuildOCI(opts, do, rl)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
@@ -60,6 +60,8 @@ const (
|
|||
HuaweicloudProviderName = "huaweicloud"
|
||||
// IonoscloudProviderName gets the provider name of ionoscloud
|
||||
IonoscloudProviderName = "ionoscloud"
|
||||
// OracleCloudProviderName gets the provider name of oci
|
||||
OracleCloudProviderName = "oci"
|
||||
// OVHcloudProviderName gets the provider name of ovhcloud
|
||||
OVHcloudProviderName = "ovhcloud"
|
||||
// LinodeProviderName gets the provider name of linode
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,5 @@
|
|||
approvers:
|
||||
#- jlamillan
|
||||
reviewers:
|
||||
#- jlamillan
|
||||
#- ericrrath
|
||||
|
|
@@ -0,0 +1,220 @@
|
|||
# Cluster Autoscaler for Oracle Cloud Infrastructure (OCI)
|
||||
|
||||
On OCI, the cluster-autoscaler utilizes [Instance Pools](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstancepool.htm)
|
||||
combined with [Instance Configurations](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstanceconfig.htm) to
|
||||
automatically resize a cluster's nodes based on application workload demands by:
|
||||
|
||||
- adding nodes to static instance-pool(s) when a pod cannot be scheduled in the cluster because of insufficient resources.
- removing nodes from instance-pool(s) when the nodes have been underutilized for an extended time, and when pods can be placed on other existing nodes.
|
||||
|
||||
The cluster-autoscaler works on a per-instance pool basis. You configure the cluster-autoscaler to tell it which instance pools to target
|
||||
for expansion and contraction, the minimum and maximum sizes for each pool, and how you want the autoscaling to take place.
|
||||
Instance pools not referenced in the configuration file are not managed by the cluster-autoscaler.
|
||||
|
||||
## Create Required OCI Resources
|
||||
|
||||
### IAM Policy (if using Instance Principals)
|
||||
|
||||
We recommend setting up and configuring the cluster-autoscaler to use
|
||||
[Instance Principals](https://docs.oracle.com/en-us/iaas/Content/Identity/Tasks/callingservicesfrominstances.htm)
|
||||
to authenticate to the OCI APIs.
|
||||
|
||||
The following policy provides the minimum privileges necessary for Cluster Autoscaler to run:
|
||||
|
||||
1. Create a compartment-level dynamic group containing the nodes (compute instances) in the cluster:
|
||||
|
||||
```
|
||||
All {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf'}
|
||||
```
|
||||
|
||||
2. Create a *tenancy-level* policy to allow nodes to manage instance-pools:
|
||||
|
||||
```
|
||||
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-pools in compartment <compartment-name>
|
||||
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-configurations in compartment <compartment-name>
|
||||
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-family in compartment <compartment-name>
|
||||
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to use subnets in compartment <compartment-name>
|
||||
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to read virtual-network-family in compartment <compartment-name>
|
||||
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to use vnics in compartment <compartment-name>
|
||||
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to inspect compartments in compartment <compartment-name>
|
||||
```
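If you prefer to create these IAM resources from the command line, the sketch below shows one possible approach using the [OCI CLI](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/cliinstall.htm). The group and policy names, descriptions, and the `<tenancy-ocid>` placeholder are examples only, and only the first two policy statements are repeated here; add the remaining statements from the list above.

```bash
# Sketch only: substitute your own names, compartment OCID, and matching rule.
oci iam dynamic-group create \
  --name acme-oci-cluster-autoscaler-dyn-grp \
  --description "Nodes (compute instances) that run the cluster-autoscaler" \
  --matching-rule "All {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf'}"

# Tenancy-level policy, so --compartment-id is the tenancy OCID.
oci iam policy create \
  --compartment-id <tenancy-ocid> \
  --name acme-oci-cluster-autoscaler-policy \
  --description "Allow cluster-autoscaler nodes to manage instance-pools" \
  --statements '["Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-pools in compartment <compartment-name>",
                 "Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-configurations in compartment <compartment-name>"]'
```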
|
||||
|
||||
### Instance Pool and Instance Configurations
|
||||
|
||||
Before you deploy the cluster-autoscaler on OCI, you need to create one or more static Instance Pools and Instance
Configurations with `cloud-init` specified in the launch details so new nodes automatically join the existing cluster on
start up.
|
||||
|
||||
Advanced Instance Pool and Instance Configuration setup is out of scope for this document. However, a working
[instance-details.json](./examples/instance-details.json) and [placement-config.json](./examples/placement-config.json)
(based on Rancher [RKE](https://rancher.com/products/rke/)) using [cloud-init](https://cloudinit.readthedocs.io/en/latest/) are
included in the examples, and can be applied using the [OCI CLI](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/cliinstall.htm).
|
||||
|
||||
Modify the `user_data` in the example [instance-details.json](./examples/instance-details.json) to suit your needs, re-encode it in base64, and apply it:
|
||||
|
||||
```bash
|
||||
# e.g. cloud-init. Modify, re-encode, and update user_data in instance-details.json to suit your needs:
|
||||
|
||||
$ echo IyEvYmluL2Jhc2gKdG91hci9saWIvYXB0L....1yZXRyeSAzIGhG91Y2ggL3RtcC9jbG91ZC1pbml0LWZpbmlzaGVkCg== | base64 -D
|
||||
|
||||
#!/bin/bash
|
||||
groupadd docker
|
||||
usermod -aG docker ubuntu
|
||||
curl --retry 3 https://releases.rancher.com/install-docker/20.10.sh | sh
|
||||
docker run -d --privileged --restart=unless-stopped --net=host -v /etc/kubernetes:/etc/kubernetes -v /var/run:/var/run rancher/rancher-agent:v2.5.5 --server https://my-rancher.com --token xxxxxx --worker
|
||||
```
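One possible way to splice the edited script back into `instance-details.json` is sketched below; it assumes `jq` is available, and the file names are placeholders.

```bash
# Re-encode the edited cloud-init script and update launchDetails.metadata.user_data.
# GNU coreutils shown; on macOS use `base64 -i my-cloud-init.sh` instead of `base64 -w0`.
b64=$(base64 -w0 my-cloud-init.sh)
jq --arg user_data "$b64" '.launchDetails.metadata.user_data = $user_data' \
  instance-details.json > instance-details.updated.json
```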
|
||||
|
||||
```bash
|
||||
$ oci compute-management instance-configuration create --instance-details file://./cluster-autoscaler/cloudprovider/oci/examples/instance-details.json --compartment-id ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf --query 'data.id' --raw-output
|
||||
|
||||
ocid1.instanceconfiguration.oc1.phx.aaaaaaaa3neul67zb3goz43lybosc2o3fv67gj3zazexbb3vfcbypmpznhtq
|
||||
|
||||
$ oci compute-management instance-pool create --compartment-id ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf --instance-configuration-id ocid1.instanceconfiguration.oc1.phx.aaaaaaaa3neul67zb3goz43lybosc2o3fv67gj3zazexbb3vfcbypmpznhtq --placement-configurations file://./cluster-autoscaler/cloudprovider/oci/examples/placement-config.json --size 0 --wait-for-state RUNNING --query 'data.id' --raw-output
|
||||
|
||||
Action completed. Waiting until the resource has entered state: ('RUNNING',)
|
||||
ocid1.instancepool.oc1.phx.aaaaaaaayd5bxwrzomzr2b2enchm4mof7uhw7do5hc2afkhks576syikk2ca
|
||||
```
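You can optionally confirm the pool reached the `RUNNING` state with the same CLI (query path assumed; substitute the instance-pool OCID returned above):

```bash
oci compute-management instance-pool get \
  --instance-pool-id ocid1.instancepool.oc1.phx.aaaaaaaayd5bxwrzomzr2b2enchm4mof7uhw7do5hc2afkhks576syikk2ca \
  --query 'data."lifecycle-state"' --raw-output
```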
|
||||
|
||||
## Configure Autoscaler
|
||||
|
||||
Use the `--nodes=<min-nodes>:<max-nodes>:<instancepool-ocid>` parameter to specify which pre-existing instance
pools to target for automatic expansion and contraction, the minimum and maximum sizes for each pool, and how you
want the autoscaling to take place. Instance pools not referenced by a `--nodes` parameter are not managed by the
autoscaler. In the parameter:

- `<min-nodes>` is the minimum number of nodes allowed in the instance-pool.
- `<max-nodes>` is the maximum number of nodes allowed in the instance-pool. Make sure the maximum number of nodes you specify does not exceed the tenancy limits for the node shape defined for the instance-pool.
- `<instancepool-ocid>` is the OCID of a pre-existing instance-pool.
|
||||
|
||||
If you are authenticating via instance principals, be sure the `OCI_REGION` environment variable is set to the correct
|
||||
value in the deployment, e.g.:
|
||||
|
||||
```yaml
|
||||
env:
|
||||
- name: OCI_REGION
|
||||
value: "us-phoenix-1"
|
||||
```
|
||||
|
||||
### Optional cloud-config file
|
||||
|
||||
An _optional_ cloud-config file can be mounted at the path specified by `--cloud-config`.
|
||||
|
||||
An example of passing optional configuration via a `cloud-config` file that configures the cluster-autoscaler to
authenticate via instance principals and only manage the configured instance-pools in a single compartment:
|
||||
|
||||
```ini
|
||||
[Global]
|
||||
compartment-id = ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf
|
||||
region = uk-london-1
|
||||
use-instance-principals = true
|
||||
```
|
||||
|
||||
### Environment variables
|
||||
|
||||
Configuration via environment-variables:
|
||||
|
||||
- `OCI_USE_INSTANCE_PRINCIPAL` - Whether to use Instance Principals for authentication rather than expecting an OCI config file to be mounted in the container. Defaults to false.
|
||||
- `OCI_REGION` - **Required** when using Instance Principals. e.g. `OCI_REGION=us-phoenix-1`. See [region list](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/regions.htm) for identifiers.
|
||||
- `OCI_COMPARTMENT_ID` - Restrict the cluster-autoscaler to instance-pools in a single compartment. When unset, the cluster-autoscaler will manage each specified instance-pool no matter which compartment it is in.
- `OCI_REFRESH_INTERVAL` - Optional refresh interval to sync the internal cache with the OCI API. Defaults to `2m`.
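For illustration, a sketch of running the binary outside of a Kubernetes deployment with these variables is shown below; the flags mirror the example deployments, the instance-pool OCID is a placeholder, and a kubeconfig with access to the cluster is assumed.

```bash
# Sketch: file-based OCI auth (config readable at ~/.oci/config), instance principals disabled.
export OCI_USE_INSTANCE_PRINCIPAL=false
export OCI_COMPARTMENT_ID=ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf
export OCI_REFRESH_INTERVAL=2m
./cluster-autoscaler \
  --kubeconfig=$HOME/.kube/config \
  --cloud-provider=oci \
  --nodes=1:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
```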
|
||||
|
||||
## Deployment
|
||||
|
||||
### Create OCI config secret (only if _not_ using Instance Principals)
|
||||
|
||||
If you are opting for a file based OCI configuration (as opposed to instance principals), the OCI config file and private key need to be mounted into the container filesystem using a secret volume.
|
||||
|
||||
The following policy is required when the user specified in the OCI config is not an administrator:
|
||||
|
||||
```
|
||||
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-pools in compartment <compartment-name>
|
||||
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-configurations in compartment <compartment-name>
|
||||
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-family in compartment <compartment-name>
|
||||
Allow group acme-oci-cluster-autoscaler-user-grp to use subnets in compartment <compartment-name>
|
||||
Allow group acme-oci-cluster-autoscaler-user-grp to read virtual-network-family in compartment <compartment-name>
|
||||
Allow group acme-oci-cluster-autoscaler-user-grp to use vnics in compartment <compartment-name>
|
||||
Allow group acme-oci-cluster-autoscaler-user-grp to inspect compartments in compartment <compartment-name>
|
||||
```
|
||||
|
||||
Example OCI config file (note `key_file` is the expected path and filename of the OCI API private-key from the perspective of the container):
|
||||
|
||||
```bash
|
||||
$ cat ~/.oci/config
|
||||
|
||||
[DEFAULT]
|
||||
user=ocid1.user.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
fingerprint=xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx
|
||||
key_file=/root/.oci/api_key.pem
|
||||
tenancy=ocid1.tenancy.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
pass_phrase=
|
||||
region=us-phoenix-1
|
||||
```
|
||||
|
||||
Create the secret (`api_key.pem` key name is required):
|
||||
|
||||
```bash
|
||||
kubectl create secret generic oci-config -n kube-system --from-file=/Users/me/.oci/config --from-file=api_key.pem=/Users/me/.oci/my_api_key.pem
|
||||
```
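You can verify that both keys landed in the secret (the deployment expects `config` and `api_key.pem`):

```bash
kubectl -n kube-system describe secret oci-config
```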
|
||||
|
||||
### Example Deployment
|
||||
|
||||
Two example deployments of the cluster-autoscaler that manage instance-pools are located in the [examples](./examples/) directory.
[oci-ip-cluster-autoscaler-w-principals.yaml](./examples/oci-ip-cluster-autoscaler-w-principals.yaml) uses
instance principals, and [oci-ip-cluster-autoscaler-w-config.yaml](./examples/oci-ip-cluster-autoscaler-w-config.yaml) uses file-based
authentication.

Note the 3 specified instance-pools are intended to correspond to different availability domains in the Phoenix (us-phoenix-1) region:
|
||||
|
||||
```yaml
|
||||
...
|
||||
containers:
|
||||
- image: docker.io/jlamillan/autoscaler:oci-pr-rc6
|
||||
name: cluster-autoscaler
|
||||
command:
|
||||
- ./cluster-autoscaler
|
||||
- --cloud-provider=oci
|
||||
- --nodes=1:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
|
||||
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaazldzcu4mi5spz56upbtwnsynz2nk6jvmx7zi4hsta4uggxbulbua
|
||||
- --nodes=0:20:ocid1.instancepool.oc1.phx.aaaaaaaal3jhoc32ljsfaeif4x2ssfa2a63oehjgqryiueivieee6yaqbkia
|
||||
```
|
||||
|
||||
Instance principal based authentication deployment:
|
||||
|
||||
Substitute the OCIDs of _your_ instance pool(s) and set the `OCI_REGION` environment variable to the region where your
|
||||
instance pool(s) reside before applying the deployment:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./cloudprovider/oci/examples/oci-ip-cluster-autoscaler-w-principals.yaml
|
||||
```
|
||||
|
||||
OCI config file based authentication deployment:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./cloudprovider/oci/examples/oci-ip-cluster-autoscaler-w-config.yaml
|
||||
```
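After applying either deployment, one way to confirm the autoscaler started and picked up the configured instance-pools is to follow its logs (the label comes from the example manifests):

```bash
kubectl -n kube-system logs -l app=cluster-autoscaler --tail=50 -f
```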
|
||||
|
||||
## Common Notes and Gotchas
- You must configure the Instance Configuration so that new compute instances join the existing cluster when they start. This can
be accomplished with `cloud-init` / `user-data` in the instance launch configuration [example](./examples/instance-details.json).
- If opting for a file-based OCI configuration (as opposed to instance principals), ensure the OCI config and private-key
PEM files are mounted into the container filesystem at the [expected path](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/sdkconfig.htm). Note the `key_file` option in the example `~/.oci/config` above references a private-key file mounted into the container by the example [volumeMount](./examples/oci-ip-cluster-autoscaler-w-config.yaml#L165).
|
||||
- Make sure the maximum number of nodes you specify does not exceed the limit for the instance-pool or the tenancy.
|
||||
- We recommend creating multiple instance-pools with one availability domain specified so new nodes can be created to meet
|
||||
affinity requirements across availability domains.
|
||||
- If you are authenticating via instance principals, be sure the `OCI_REGION` environment variable is set to the correct
|
||||
value in the deployment.
|
||||
- The cluster-autoscaler will not automatically remove scaled down (terminated) `Node` objects from the Kubernetes API
|
||||
without assistance from the [OCI Cloud Controller Manager](https://github.com/oracle/oci-cloud-controller-manager) (CCM).
|
||||
If scaled down nodes are lingering in your cluster in the `NotReady` status, ensure the OCI CCM is installed and running
|
||||
correctly (`oci-cloud-controller-manager`).
|
||||
- Avoid manually changing node pools that are managed by the cluster-autoscaler. For example, do not add or remove nodes
|
||||
using kubectl, or using the Console (or the Oracle Cloud Infrastructure CLI or API).
|
||||
- `--node-group-auto-discovery` and `--node-autoprovisioning-enabled=true` are not supported.
|
||||
- We set a `nvidia.com/gpu:NoSchedule` taint on nodes in a GPU-enabled instance-pool.
|
||||
|
||||
## Helpful links
|
||||
- [Oracle Cloud Infrastructure home](https://cloud.oracle.com)
|
||||
- [OCI instance configuration documentation](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstanceconfig.htm)
|
||||
- [instance principals](https://docs.oracle.com/en-us/iaas/Content/Identity/Tasks/callingservicesfrominstances.htm)
|
||||
- [OCI Cloud Controller Manager](https://github.com/oracle/oci-cloud-controller-manager)
|
||||
- [OCI Container Storage Interface driver](https://github.com/oracle/oci-cloud-controller-manager/blob/master/container-storage-interface.md)
|
||||
|
|
@@ -0,0 +1,19 @@
|
|||
{
|
||||
"instanceType": "compute",
|
||||
"launchDetails": {
|
||||
"compartmentId": "ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf",
|
||||
"shape": "VM.Standard2.8",
|
||||
"sourceDetails":
|
||||
{
|
||||
"imageId": "ocid1.image.oc1.phx.aaaaaaaa55tzajot4gbiw2p7gquwjnvfzrasosbrq4h6wywkff4zjosp2fia",
|
||||
"sourceType": "image",
|
||||
"bootVolumeSizeInGBs": 100
|
||||
},
|
||||
"metadata": {
|
||||
"user_data": "IyEvYmluL2Jhc2gKdG91Y2ggL3RtcC9jbG91ZC1pbml0LXN0YXJ0ZWQKaXB0YWJsZXMgLUYKZ3JvdXBhZGQgZG9ja2VyCnVzZXJtb2QgLWFHIGRvY2tlciB1YnVudHUKcm0gL3Zhci9saWIvYXB0L2xpc3RzL2xvY2sKcGtpbGwgLTkgLWYgYXB0CmN1cmwgLS1yZXRyeSAzIGh0dHBzOi8vcmVsZWFzZXMucmFuY2hlci5jb20vaW5zdGFsbC1kb2NrZXIvMjAuMTAuc2ggfCBzaApkb2NrZXIgcnVuIC1kIC0tcHJpdmlsZWdlZCAtLXJlc3RhcnQ9dW5sZXNzLXN0b3BwZWQgLS1uZXQ9aG9zdCAtdiAvZXRjL2t1YmVybmV0ZXM6L2V0Yy9rdWJlcm5ldGVzIC12IC92YXIvcnVuOi92YXIvcnVuIHJhbmNoZXIvcmFuY2hlci1hZ2VudDp2Mi41LjUgLS1zZXJ2ZXIgaHR0cHM6Ly9teS1yYW5jaGVyLmNvbSAtLXRva2VuIHh4eHh4eCAgLS13b3JrZXIKdG91Y2ggL3RtcC9jbG91ZC1pbml0LWZpbmlzaGVkCg=="
|
||||
},
|
||||
"createVnicDetails": {
|
||||
"assignPublicIp": true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -0,0 +1,174 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["events", "endpoints"]
|
||||
verbs: ["create", "patch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods/eviction"]
|
||||
verbs: ["create"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods/status"]
|
||||
verbs: ["update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["endpoints"]
|
||||
resourceNames: ["cluster-autoscaler"]
|
||||
verbs: ["get", "update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes"]
|
||||
verbs: ["watch", "list", "get", "update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["namepaces"]
|
||||
verbs: ["list"]
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- "pods"
|
||||
- "services"
|
||||
- "replicationcontrollers"
|
||||
- "persistentvolumeclaims"
|
||||
- "persistentvolumes"
|
||||
verbs: ["watch", "list", "get"]
|
||||
- apiGroups: ["extensions"]
|
||||
resources: ["replicasets", "daemonsets"]
|
||||
verbs: ["watch", "list", "get"]
|
||||
- apiGroups: ["policy"]
|
||||
resources: ["poddisruptionbudgets"]
|
||||
verbs: ["watch", "list"]
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["statefulsets", "replicasets", "daemonsets"]
|
||||
verbs: ["watch", "list", "get"]
|
||||
- apiGroups: ["storage.k8s.io"]
|
||||
resources: ["storageclasses", "csinodes"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["jobs", "cronjobs"]
|
||||
verbs: ["watch", "list", "get"]
|
||||
- apiGroups: ["coordination.k8s.io"]
|
||||
resources: ["leases"]
|
||||
verbs: ["create"]
|
||||
- apiGroups: ["coordination.k8s.io"]
|
||||
resourceNames: ["cluster-autoscaler"]
|
||||
resources: ["leases"]
|
||||
verbs: ["get", "update"]
|
||||
- apiGroups: ["storage.k8s.io"]
|
||||
resources: ["csidrivers", "csistoragecapacities"]
|
||||
verbs: ["get", "list"]
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["configmaps"]
|
||||
verbs: ["create","list","watch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["configmaps"]
|
||||
resourceNames:
|
||||
- "cluster-autoscaler-status"
|
||||
- "cluster-autoscaler-priority-expander"
|
||||
verbs: ["delete", "get", "update", "watch"]
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: cluster-autoscaler
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: cluster-autoscaler
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app: cluster-autoscaler
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: cluster-autoscaler
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: cluster-autoscaler
|
||||
spec:
|
||||
serviceAccountName: cluster-autoscaler
|
||||
containers:
|
||||
- image: docker.io/jlamillan/autoscaler:oci-pr-rc6
|
||||
name: cluster-autoscaler
|
||||
command:
|
||||
- ./cluster-autoscaler
|
||||
- --v=5
|
||||
- --logtostderr=true
|
||||
- --cloud-provider=oci
|
||||
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
|
||||
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaazldzcu4mi5spz56upbtwnsynz2nk6jvmx7zi4hsta4uggxbulbua
|
||||
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaal3jhoc32ljsfaeif4x2ssfa2a63oehjgqryiueivieee6yaqbkia
|
||||
- --scale-down-delay-after-add=1m
|
||||
- --scale-down-unneeded-time=1m
|
||||
- --namespace=kube-system
|
||||
imagePullPolicy: "Always"
|
||||
env:
|
||||
- name: OCI_USE_INSTANCE_PRINCIPAL
|
||||
value: "false"
|
||||
volumeMounts:
|
||||
- name: oci-config-vol
|
||||
mountPath: "/root/.oci"
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: oci-config-vol
|
||||
secret:
|
||||
secretName: oci-config
|
||||
items:
|
||||
- key: config
|
||||
path: config
|
||||
- key: api_key.pem
|
||||
path: api_key.pem
|
||||
|
|
@@ -0,0 +1,163 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["events", "endpoints"]
|
||||
verbs: ["create", "patch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods/eviction"]
|
||||
verbs: ["create"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods/status"]
|
||||
verbs: ["update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["endpoints"]
|
||||
resourceNames: ["cluster-autoscaler"]
|
||||
verbs: ["get", "update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes"]
|
||||
verbs: ["watch", "list", "get", "update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["namespaces"]
|
||||
verbs: ["list"]
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- "pods"
|
||||
- "services"
|
||||
- "replicationcontrollers"
|
||||
- "persistentvolumeclaims"
|
||||
- "persistentvolumes"
|
||||
verbs: ["watch", "list", "get"]
|
||||
- apiGroups: ["extensions"]
|
||||
resources: ["replicasets", "daemonsets"]
|
||||
verbs: ["watch", "list", "get"]
|
||||
- apiGroups: ["policy"]
|
||||
resources: ["poddisruptionbudgets"]
|
||||
verbs: ["watch", "list"]
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["statefulsets", "replicasets", "daemonsets"]
|
||||
verbs: ["watch", "list", "get"]
|
||||
- apiGroups: ["storage.k8s.io"]
|
||||
resources: ["storageclasses", "csinodes"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["jobs", "cronjobs"]
|
||||
verbs: ["watch", "list", "get"]
|
||||
- apiGroups: ["coordination.k8s.io"]
|
||||
resources: ["leases"]
|
||||
verbs: ["create"]
|
||||
- apiGroups: ["coordination.k8s.io"]
|
||||
resourceNames: ["cluster-autoscaler"]
|
||||
resources: ["leases"]
|
||||
verbs: ["get", "update"]
|
||||
- apiGroups: ["storage.k8s.io"]
|
||||
resources: ["csidrivers", "csistoragecapacities"]
|
||||
verbs: ["get", "list"]
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["configmaps"]
|
||||
verbs: ["create","list","watch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["configmaps"]
|
||||
resourceNames:
|
||||
- "cluster-autoscaler-status"
|
||||
- "cluster-autoscaler-priority-expander"
|
||||
verbs: ["delete", "get", "update", "watch"]
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: cluster-autoscaler
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-addon: cluster-autoscaler.addons.k8s.io
|
||||
k8s-app: cluster-autoscaler
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: cluster-autoscaler
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: cluster-autoscaler
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app: cluster-autoscaler
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: cluster-autoscaler
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: cluster-autoscaler
|
||||
spec:
|
||||
serviceAccountName: cluster-autoscaler
|
||||
containers:
|
||||
- image: docker.io/jlamillan/autoscaler:oci-pr-rc6
|
||||
name: cluster-autoscaler
|
||||
command:
|
||||
- ./cluster-autoscaler
|
||||
- --v=5
|
||||
- --logtostderr=true
|
||||
- --cloud-provider=oci
|
||||
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
|
||||
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaazldzcu4mi5spz56upbtwnsynz2nk6jvmx7zi4hsta4uggxbulbua
|
||||
- --nodes=1:10:ocid1.instancepool.oc1.phx.aaaaaaaal3jhoc32ljsfaeif4x2ssfa2a63oehjgqryiueivieee6yaqbkia
|
||||
- --scale-down-delay-after-add=10m
|
||||
- --scale-down-unneeded-time=10m
|
||||
- --namespace=kube-system
|
||||
imagePullPolicy: "Always"
|
||||
env:
|
||||
- name: OCI_USE_INSTANCE_PRINCIPAL
|
||||
value: "true"
|
||||
- name: OCI_REGION
|
||||
value: "us-phoenix-1"
|
||||
|
|
@@ -0,0 +1,6 @@
|
|||
[
|
||||
{
|
||||
"availabilityDomain": "hXgQ:PHX-AD-2",
|
||||
"primarySubnetId": "ocid1.subnet.oc1.phx.aaaaaaaaouihv645dp2xaee6w4uvx6emjwuscsrxcn3miwa6vmijtpdnqdeq"
|
||||
}
|
||||
]
|
||||
|
|
@@ -0,0 +1,175 @@
|
|||
/*
|
||||
Copyright 2021 Oracle and/or its affiliates.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package oci
|
||||
|
||||
import (
|
||||
"github.com/pkg/errors"
|
||||
apiv1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/config"
|
||||
caerrors "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
"k8s.io/client-go/tools/clientcmd"
|
||||
"k8s.io/klog/v2"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
ociUseInstancePrincipalEnvVar = "OCI_USE_INSTANCE_PRINCIPAL"
|
||||
ociCompartmentEnvVar = "OCI_COMPARTMENT_ID"
|
||||
ociRegionEnvVar = "OCI_REGION"
|
||||
ociRefreshInterval = "OCI_REFRESH_INTERVAL"
|
||||
ociAnnotationCompartmentID = "oci.oraclecloud.com/compartment-id"
|
||||
// ResourceGPU is the GPU resource type
|
||||
ResourceGPU apiv1.ResourceName = "nvidia.com/gpu"
|
||||
defaultRefreshInterval = 5 * time.Minute
|
||||
)
|
||||
|
||||
// OciCloudProvider implements the CloudProvider interface for OCI. It contains an
|
||||
// instance pool manager to interact with OCI instance pools.
|
||||
type OciCloudProvider struct {
|
||||
rl *cloudprovider.ResourceLimiter
|
||||
poolManager InstancePoolManager
|
||||
}
|
||||
|
||||
// CloudConfig holds the cloud config for OCI provider.
|
||||
type CloudConfig struct {
|
||||
Global struct {
|
||||
RefreshInterval time.Duration `gcfg:"refresh-interval"`
|
||||
CompartmentID string `gcfg:"compartment-id"`
|
||||
Region string `gcfg:"region"`
|
||||
UseInstancePrinciples bool `gcfg:"use-instance-principals"`
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns name of the cloud provider.
|
||||
func (ocp *OciCloudProvider) Name() string {
|
||||
return cloudprovider.OracleCloudProviderName
|
||||
}
|
||||
|
||||
// NodeGroups returns all node groups configured for this cloud provider.
|
||||
func (ocp *OciCloudProvider) NodeGroups() []cloudprovider.NodeGroup {
|
||||
nodePools := ocp.poolManager.GetInstancePools()
|
||||
result := make([]cloudprovider.NodeGroup, 0, len(nodePools))
|
||||
for _, nodePool := range nodePools {
|
||||
result = append(result, nodePool)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// NodeGroupForNode returns the node group for the given node, nil if the node
|
||||
// should not be processed by cluster autoscaler, or non-nil error if such
|
||||
// occurred. Must be implemented.
|
||||
func (ocp *OciCloudProvider) NodeGroupForNode(n *apiv1.Node) (cloudprovider.NodeGroup, error) {
|
||||
|
||||
ociRef, err := nodeToOciRef(n)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ng, err := ocp.poolManager.GetInstancePoolForInstance(ociRef)
|
||||
|
||||
// this instance may not be a part of an instance pool, or it may be part of an instance pool that the autoscaler does not manage
|
||||
if errors.Cause(err) == errInstanceInstancePoolNotFound {
|
||||
// should not be processed by cluster autoscaler
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return ng, err
|
||||
}
|
||||
|
||||
// Pricing returns pricing model for this cloud provider or error if not available.
|
||||
// Implementation optional.
|
||||
func (ocp *OciCloudProvider) Pricing() (cloudprovider.PricingModel, caerrors.AutoscalerError) {
|
||||
klog.Info("Pricing called")
|
||||
return nil, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
// GetAvailableMachineTypes gets all machine types that can be requested from the cloud provider.
|
||||
// Implementation optional.
|
||||
func (ocp *OciCloudProvider) GetAvailableMachineTypes() ([]string, error) {
|
||||
klog.Info("GetAvailableMachineTypes called")
|
||||
return nil, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||
// Implementation optional.
|
||||
func (ocp *OciCloudProvider) NewNodeGroup(machineType string,
|
||||
labels map[string]string,
|
||||
systemLabels map[string]string,
|
||||
taints []apiv1.Taint,
|
||||
extraResources map[string]resource.Quantity,
|
||||
) (cloudprovider.NodeGroup, error) {
|
||||
return nil, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
// GetResourceLimiter returns struct containing limits (max, min) for resources (cores, memory etc.).
|
||||
func (ocp *OciCloudProvider) GetResourceLimiter() (*cloudprovider.ResourceLimiter, error) {
|
||||
return ocp.rl, nil
|
||||
}
|
||||
|
||||
// GPULabel returns the label added to nodes with GPU resource.
|
||||
func (ocp *OciCloudProvider) GPULabel() string {
|
||||
// No labels, only taint: nvidia.com/gpu:NoSchedule
|
||||
return ""
|
||||
}
|
||||
|
||||
// GetAvailableGPUTypes return all available GPU types cloud provider supports.
|
||||
func (ocp *OciCloudProvider) GetAvailableGPUTypes() map[string]struct{} {
|
||||
return map[string]struct{}{}
|
||||
}
|
||||
|
||||
// Cleanup cleans up open resources before the cloud provider is destroyed, i.e. go routines etc.
|
||||
func (ocp *OciCloudProvider) Cleanup() error {
|
||||
return ocp.poolManager.Cleanup()
|
||||
}
|
||||
|
||||
// Refresh is called before every main loop and can be used to dynamically update cloud provider state.
|
||||
// In particular the list of node groups returned by NodeGroups can change as a result of CloudProvider.Refresh().
|
||||
func (ocp *OciCloudProvider) Refresh() error {
|
||||
return ocp.poolManager.Refresh()
|
||||
}
|
||||
|
||||
// BuildOCI constructs the OciCloudProvider object that implements the cloud provider interface, backed by an InstancePoolManager.
|
||||
func BuildOCI(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) *OciCloudProvider {
|
||||
|
||||
ipManager, err := CreateInstancePoolManager(opts.CloudConfig, do, createKubeClient(opts))
|
||||
if err != nil {
|
||||
klog.Fatalf("Could not create OCI cloud provider: %v", err)
|
||||
}
|
||||
return &OciCloudProvider{
|
||||
poolManager: ipManager,
|
||||
rl: rl,
|
||||
}
|
||||
}
|
||||
|
||||
func getKubeConfig(opts config.AutoscalingOptions) *rest.Config {
|
||||
klog.V(1).Infof("Using kubeconfig file: %s", opts.KubeConfigPath)
|
||||
kubeConfig, err := clientcmd.BuildConfigFromFlags("", opts.KubeConfigPath)
|
||||
if err != nil {
|
||||
klog.Fatalf("Failed to build kubeConfig: %v", err)
|
||||
}
|
||||
|
||||
return kubeConfig
|
||||
}
|
||||
|
||||
func createKubeClient(opts config.AutoscalingOptions) kubernetes.Interface {
|
||||
return kubernetes.NewForConfigOrDie(getKubeConfig(opts))
|
||||
}
|
||||
|
|
@@ -0,0 +1,234 @@
|
|||
/*
|
||||
Copyright 2021 Oracle and/or its affiliates.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package oci
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/pkg/errors"
|
||||
apiv1 "k8s.io/api/core/v1"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/config"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/klog/v2"
|
||||
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
)
|
||||
|
||||
const (
|
||||
ociInstanceIDAnnotation = "oci.oraclecloud.com/instance-id"
|
||||
ociInstancePoolIDAnnotation = "oci.oraclecloud.com/instancepool-id"
|
||||
ociInstancePoolResourceIdent = "instancepool"
|
||||
)
|
||||
|
||||
// InstancePoolNodeGroup implements the NodeGroup interface using OCI instance pools.
|
||||
type InstancePoolNodeGroup struct {
|
||||
manager InstancePoolManager
|
||||
kubeClient kubernetes.Interface
|
||||
id string
|
||||
minSize int
|
||||
maxSize int
|
||||
}
|
||||
|
||||
// MaxSize returns maximum size of the instance-pool based node group.
|
||||
func (ip *InstancePoolNodeGroup) MaxSize() int {
|
||||
return ip.maxSize
|
||||
}
|
||||
|
||||
// MinSize returns minimum size of the instance-pool based node group.
|
||||
func (ip *InstancePoolNodeGroup) MinSize() int {
|
||||
return ip.minSize
|
||||
}
|
||||
|
||||
// TargetSize returns the current target size of the instance-pool based node group. It is possible that the
|
||||
// number of nodes in Kubernetes is different at the moment but should be equal
|
||||
// to Size() once everything stabilizes (new nodes finish startup and registration or
|
||||
// removed nodes are deleted completely). Implementation required.
|
||||
func (ip *InstancePoolNodeGroup) TargetSize() (int, error) {
|
||||
return ip.manager.GetInstancePoolSize(*ip)
|
||||
}
|
||||
|
||||
// IncreaseSize increases the size of the instance-pool based node group. To delete a node you need
|
||||
// to explicitly name it and use DeleteNode. This function should wait until
|
||||
// instance-pool size is updated. Implementation required.
|
||||
func (ip *InstancePoolNodeGroup) IncreaseSize(delta int) error {
|
||||
if delta <= 0 {
|
||||
return fmt.Errorf("size increase must be positive")
|
||||
}
|
||||
|
||||
size, err := ip.manager.GetInstancePoolSize(*ip)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if size+delta > ip.MaxSize() {
|
||||
return fmt.Errorf("size increase too large - desired:%d max:%d", size+delta, ip.MaxSize())
|
||||
}
|
||||
|
||||
return ip.manager.SetInstancePoolSize(*ip, size+delta)
|
||||
}
|
||||
|
||||
// DeleteNodes deletes nodes from this instance-pool. Error is returned either on
|
||||
// failure or if the given node doesn't belong to this instance-pool. This function
|
||||
// should wait until instance-pool size is updated. Implementation required.
|
||||
func (ip *InstancePoolNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
|
||||
|
||||
// FYI, unregistered nodes come in with the provider id as the node name.
|
||||
|
||||
klog.Infof("DeleteNodes called with %d node(s)", len(nodes))
|
||||
|
||||
size, err := ip.manager.GetInstancePoolSize(*ip)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if size <= ip.MinSize() {
|
||||
return fmt.Errorf("min size reached, nodes will not be deleted")
|
||||
}
|
||||
|
||||
refs := make([]OciRef, 0, len(nodes))
|
||||
for _, node := range nodes {
|
||||
belongs, err := ip.Belongs(node)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !belongs {
|
||||
return fmt.Errorf("%s belong to a different instance-pool than %s", node.Name, ip.Id())
|
||||
}
|
||||
ociRef, err := nodeToOciRef(node)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
refs = append(refs, ociRef)
|
||||
}
|
||||
|
||||
return ip.manager.DeleteInstances(*ip, refs)
|
||||
}
|
||||
|
||||
// DecreaseTargetSize decreases the target size of the instance-pool based node group. This function
|
||||
// doesn't permit to delete any existing node and can be used only to reduce the
|
||||
// request for new nodes that have not been yet fulfilled. Delta should be negative.
|
||||
// It is assumed that cloud provider will not delete the existing nodes when there
|
||||
// is an option to just decrease the target. Implementation required.
|
||||
func (ip *InstancePoolNodeGroup) DecreaseTargetSize(delta int) error {
|
||||
if delta >= 0 {
|
||||
return fmt.Errorf("size decrease must be negative")
|
||||
}
|
||||
|
||||
size, err := ip.manager.GetInstancePoolSize(*ip)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
nodes, err := ip.manager.GetInstancePoolNodes(*ip)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if size+delta < len(nodes) {
|
||||
return fmt.Errorf("attempt to delete existing nodes targetSize:%d delta:%d existingNodes: %d",
|
||||
size, delta, len(nodes))
|
||||
}
|
||||
|
||||
return ip.manager.SetInstancePoolSize(*ip, size+delta)
|
||||
}
|
||||
|
||||
// Belongs returns true if the given node belongs to the InstancePoolNodeGroup.
|
||||
func (ip *InstancePoolNodeGroup) Belongs(node *apiv1.Node) (bool, error) {
|
||||
ref, err := nodeToOciRef(node)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
targetInstancePool, err := ip.manager.GetInstancePoolForInstance(ref)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
if targetInstancePool == nil {
|
||||
return false, fmt.Errorf("%s doesn't belong to a known instance-pool", node.Name)
|
||||
}
|
||||
|
||||
return targetInstancePool.Id() == ip.Id(), nil
|
||||
}
|
||||
|
||||
// Id returns a unique identifier of the instance-pool based node group.
|
||||
func (ip *InstancePoolNodeGroup) Id() string {
|
||||
return ip.id
|
||||
}
|
||||
|
||||
// Debug returns a string containing all information regarding this instance-pool.
|
||||
func (ip *InstancePoolNodeGroup) Debug() string {
|
||||
return fmt.Sprintf("%s (%d:%d)", ip.Id(), ip.MinSize(), ip.MaxSize())
|
||||
}
|
||||
|
||||
// Nodes returns a list of all nodes that belong to this instance-pool.
|
||||
// It is required that Instance objects returned by this method have Id field set.
|
||||
// Other fields are optional.
|
||||
// This list should also include instances that might not have become a Kubernetes node yet.
|
||||
func (ip *InstancePoolNodeGroup) Nodes() ([]cloudprovider.Instance, error) {
|
||||
return ip.manager.GetInstancePoolNodes(*ip)
|
||||
}
|
||||
|
||||
// TemplateNodeInfo returns a schedulerframework.NodeInfo structure of an empty
|
||||
// (as if just started) node. This will be used in scale-up simulations to
|
||||
// predict what a new node would look like if an instance-pool were expanded. The returned
|
||||
// NodeInfo is expected to have a fully populated Node object, with all of the labels,
|
||||
// capacity and allocatable information as well as all pods that are started on
|
||||
// the node by default, using manifest (most likely only kube-proxy). Implementation optional.
|
||||
func (ip *InstancePoolNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
|
||||
node, err := ip.manager.GetInstancePoolTemplateNode(*ip)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "unable to build node info template")
|
||||
}
|
||||
|
||||
nodeInfo := schedulerframework.NewNodeInfo(
|
||||
cloudprovider.BuildKubeProxy(ip.id),
|
||||
buildCSINodePod(),
|
||||
)
|
||||
nodeInfo.SetNode(node)
|
||||
return nodeInfo, nil
|
||||
}
|
||||
|
||||
// Exist checks if the instance-pool based node group really exists on the cloud provider side. Allows to tell the
|
||||
// theoretical instance-pool from the real one. Implementation required.
|
||||
func (ip *InstancePoolNodeGroup) Exist() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// Create creates the instance-pool based node group on the cloud provider side. Implementation optional.
|
||||
func (ip *InstancePoolNodeGroup) Create() (cloudprovider.NodeGroup, error) {
|
||||
return nil, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
// Delete deletes the instance-pool based node group on the cloud provider side.
|
||||
// This will be executed only for autoprovisioned instance-pools, once their size drops to 0.
|
||||
// Implementation optional.
|
||||
func (ip *InstancePoolNodeGroup) Delete() error {
|
||||
return cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
// GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular
|
||||
// InstancePoolNodeGroup. Returning a nil will result in using default options.
|
||||
func (ip *InstancePoolNodeGroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
|
||||
return nil, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
// Autoprovisioned returns true if the instance-pool based node group is autoprovisioned. An autoprovisioned group
|
||||
// was created by CA and can be deleted when scaled to 0.
|
||||
func (ip *InstancePoolNodeGroup) Autoprovisioned() bool {
|
||||
return false
|
||||
}
|
||||
|
|
@@ -0,0 +1,363 @@
|
|||
/*
|
||||
Copyright 2021 Oracle and/or its affiliates.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package oci
|
||||
|
||||
import (
|
||||
"context"
|
||||
"github.com/pkg/errors"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
|
||||
"k8s.io/klog/v2"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComputeMgmtClient wraps core.ComputeManagementClient exposing the functions we actually require.
|
||||
type ComputeMgmtClient interface {
|
||||
GetInstancePool(context.Context, core.GetInstancePoolRequest) (core.GetInstancePoolResponse, error)
|
||||
UpdateInstancePool(context.Context, core.UpdateInstancePoolRequest) (core.UpdateInstancePoolResponse, error)
|
||||
GetInstancePoolInstance(context.Context, core.GetInstancePoolInstanceRequest) (core.GetInstancePoolInstanceResponse, error)
|
||||
ListInstancePoolInstances(context.Context, core.ListInstancePoolInstancesRequest) (core.ListInstancePoolInstancesResponse, error)
|
||||
DetachInstancePoolInstance(context.Context, core.DetachInstancePoolInstanceRequest) (core.DetachInstancePoolInstanceResponse, error)
|
||||
}
|
||||
|
||||
// ComputeClient wraps core.ComputeClient exposing the functions we actually require.
|
||||
type ComputeClient interface {
|
||||
ListVnicAttachments(ctx context.Context, request core.ListVnicAttachmentsRequest) (core.ListVnicAttachmentsResponse, error)
|
||||
}
|
||||
|
||||
// VirtualNetworkClient wraps core.VirtualNetworkClient exposing the functions we actually require.
|
||||
type VirtualNetworkClient interface {
|
||||
GetVnic(context.Context, core.GetVnicRequest) (core.GetVnicResponse, error)
|
||||
}
|
||||
|
||||
type instancePoolCache struct {
|
||||
mu sync.Mutex
|
||||
poolCache map[string]*core.InstancePool
|
||||
instanceSummaryCache map[string]*[]core.InstanceSummary
|
||||
targetSize map[string]int
|
||||
unownedInstances map[OciRef]bool
|
||||
|
||||
computeManagementClient ComputeMgmtClient
|
||||
computeClient ComputeClient
|
||||
virtualNetworkClient VirtualNetworkClient
|
||||
}
|
||||
|
||||
func newInstancePoolCache(computeManagementClient ComputeMgmtClient, computeClient ComputeClient, virtualNetworkClient VirtualNetworkClient) *instancePoolCache {
|
||||
return &instancePoolCache{
|
||||
poolCache: map[string]*core.InstancePool{},
|
||||
instanceSummaryCache: map[string]*[]core.InstanceSummary{},
|
||||
targetSize: map[string]int{},
|
||||
unownedInstances: map[OciRef]bool{},
|
||||
computeManagementClient: computeManagementClient,
|
||||
computeClient: computeClient,
|
||||
virtualNetworkClient: virtualNetworkClient,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) InstancePools() map[string]*core.InstancePool {
|
||||
result := map[string]*core.InstancePool{}
|
||||
for k, v := range c.poolCache {
|
||||
result[k] = v
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) rebuild(staticInstancePools map[string]*InstancePoolNodeGroup, cfg CloudConfig) error {
|
||||
// Since we only support static instance-pools we don't need to worry about pruning.
|
||||
|
||||
for id := range staticInstancePools {
|
||||
resp, err := c.computeManagementClient.GetInstancePool(context.Background(), core.GetInstancePoolRequest{
|
||||
InstancePoolId: common.String(id),
|
||||
})
|
||||
if err != nil {
|
||||
klog.Errorf("get instance pool %s failed: %v", id, err)
|
||||
return err
|
||||
}
|
||||
klog.V(5).Infof("GetInstancePool() response %v", resp.InstancePool)
|
||||
|
||||
c.setInstancePool(&resp.InstancePool)
|
||||
|
||||
// OCI instance-pools do not contain individual instance objects so they must be fetched separately.
|
||||
listInstancesResponse, err := c.computeManagementClient.ListInstancePoolInstances(context.Background(), core.ListInstancePoolInstancesRequest{
|
||||
InstancePoolId: common.String(id),
|
||||
CompartmentId: common.String(cfg.Global.CompartmentID),
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
klog.V(5).Infof("ListInstancePoolInstances() response %v", listInstancesResponse.Items)
|
||||
c.setInstanceSummaries(*resp.InstancePool.Id, &listInstancesResponse.Items)
|
||||
}
|
||||
// Reset unowned instances cache.
|
||||
c.unownedInstances = make(map[OciRef]bool)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// removeInstance tries to remove the instance from the specified instance pool. If the instance isn't in the pool,
// it does nothing. removeInstance returns true if it actually removed the instance and reduced the size of
// the instance pool.
|
||||
func (c *instancePoolCache) removeInstance(instancePool InstancePoolNodeGroup, instanceID string) bool {
|
||||
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
if instanceID == "" {
|
||||
klog.Warning("instanceID is not set - skipping removal.")
|
||||
return false
|
||||
}
|
||||
|
||||
// This instance pool must be in state RUNNING in order to detach a particular instance.
|
||||
err := c.waitForInstancePoolState(context.Background(), instancePool.Id(), core.InstancePoolLifecycleStateRunning)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
_, err = c.computeManagementClient.DetachInstancePoolInstance(context.Background(), core.DetachInstancePoolInstanceRequest{
|
||||
InstancePoolId: common.String(instancePool.Id()),
|
||||
DetachInstancePoolInstanceDetails: core.DetachInstancePoolInstanceDetails{
|
||||
InstanceId: common.String(instanceID),
|
||||
IsDecrementSize: common.Bool(true),
|
||||
IsAutoTerminate: common.Bool(true),
|
||||
},
|
||||
})
|
||||
|
||||
if err == nil {
|
||||
// Decrease pool size in cache since IsDecrementSize was true
|
||||
c.targetSize[instancePool.Id()] -= 1
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// findInstanceByDetails attempts to find the given instance by searching
|
||||
// through the configured instance-pools (ListInstancePoolInstances) for a match.
|
||||
func (c *instancePoolCache) findInstanceByDetails(ociInstance OciRef) (*OciRef, error) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
// Minimum amount of information we need to make a positive match
|
||||
if ociInstance.InstanceID == "" && ociInstance.PrivateIPAddress == "" && ociInstance.PublicIPAddress == "" {
|
||||
return nil, errors.New("instance id or an IP address is required to resolve details")
|
||||
}
|
||||
|
||||
if c.unownedInstances[ociInstance] {
|
||||
// We already know this instance is not part of a configured pool. Return early and avoid additional API calls.
|
||||
klog.V(4).Infof("Node " + ociInstance.Name + " is known to not be a member of any of the specified instance pool(s)")
|
||||
return nil, errInstanceInstancePoolNotFound
|
||||
}
|
||||
|
||||
// Look for the instance in each of the specified pool(s)
|
||||
for _, nextInstancePool := range c.poolCache {
|
||||
// Skip searching this instance pool if its instance count is 0.
|
||||
if *nextInstancePool.Size == 0 {
|
||||
klog.V(4).Infof("skipping over instance pool %s since it is empty", *nextInstancePool.Id)
|
||||
continue
|
||||
}
|
||||
// List instances in the next pool
|
||||
listInstancePoolInstances, err := c.computeManagementClient.ListInstancePoolInstances(context.Background(), core.ListInstancePoolInstancesRequest{
|
||||
CompartmentId: common.String(ociInstance.CompartmentID),
|
||||
InstancePoolId: nextInstancePool.Id,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, poolMember := range listInstancePoolInstances.Items {
|
||||
// Skip comparing this instance if it is not in the Running state
|
||||
if !strings.EqualFold(*poolMember.State, string(core.InstanceLifecycleStateRunning)) {
|
||||
klog.V(4).Infof("skipping over instance %s: since it is not in the running state: %s", *poolMember.Id, *poolMember.State)
|
||||
continue
|
||||
}
|
||||
listVnicAttachments, err := c.computeClient.ListVnicAttachments(context.Background(), core.ListVnicAttachmentsRequest{
|
||||
CompartmentId: common.String(*poolMember.CompartmentId),
|
||||
InstanceId: poolMember.Id,
|
||||
})
|
||||
if err != nil {
|
||||
klog.Errorf("list vNIC attachments for %s failed: %v", *poolMember.Id, err)
|
||||
return nil, err
|
||||
}
|
||||
klog.V(5).Infof("ListVnicAttachments() response for %s: %v", *poolMember.Id, listVnicAttachments.Items)
|
||||
for _, vnicAttachment := range listVnicAttachments.Items {
|
||||
// Skip this attachment if the vNIC is not live
|
||||
if core.VnicAttachmentLifecycleStateAttached != vnicAttachment.LifecycleState {
|
||||
klog.V(4).Infof("skipping vNIC on instance %s: since it is not active", *poolMember.Id)
|
||||
continue
|
||||
}
|
||||
getVnicResp, err := c.virtualNetworkClient.GetVnic(context.Background(), core.GetVnicRequest{
|
||||
VnicId: vnicAttachment.VnicId,
|
||||
})
|
||||
if err != nil {
|
||||
klog.Errorf("get vNIC for %s failed: %v", *poolMember.Id, err)
|
||||
return nil, err
|
||||
}
|
||||
klog.V(5).Infof("GetVnic() response for vNIC %s: %v", *vnicAttachment.Id, getVnicResp.Vnic)
|
||||
// Preferably we match by instanceID, but we can match by private or public IP
|
||||
if *poolMember.Id == ociInstance.InstanceID ||
|
||||
(getVnicResp.Vnic.PrivateIp != nil && *getVnicResp.Vnic.PrivateIp == ociInstance.PrivateIPAddress) ||
|
||||
(getVnicResp.Vnic.PublicIp != nil && *getVnicResp.Vnic.PublicIp == ociInstance.PublicIPAddress) {
|
||||
klog.V(4).Info(*poolMember.DisplayName, " is a member of "+*nextInstancePool.Id)
|
||||
// Return a complete instance details.
|
||||
if ociInstance.Name == "" {
|
||||
ociInstance.Name = *poolMember.DisplayName
|
||||
}
|
||||
ociInstance.InstanceID = *poolMember.Id
|
||||
ociInstance.PoolID = *nextInstancePool.Id
|
||||
ociInstance.CompartmentID = *poolMember.CompartmentId
|
||||
ociInstance.AvailabilityDomain = strings.Split(*poolMember.AvailabilityDomain, ":")[1]
|
||||
ociInstance.Shape = *poolMember.Shape
|
||||
ociInstance.PrivateIPAddress = *getVnicResp.Vnic.PrivateIp
|
||||
// Public IP is optional
|
||||
if getVnicResp.Vnic.PublicIp != nil {
|
||||
ociInstance.PublicIPAddress = *getVnicResp.Vnic.PublicIp
|
||||
}
|
||||
return &ociInstance, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.unownedInstances[ociInstance] = true
|
||||
klog.V(4).Info(ociInstance.Name + " is not a member of any of the specified instance pool(s)")
|
||||
return nil, errInstanceInstancePoolNotFound
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) getInstancePool(id string) (*core.InstancePool, error) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
return c.getInstancePoolWithoutLock(id)
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) getInstancePoolWithoutLock(id string) (*core.InstancePool, error) {
|
||||
instancePool := c.poolCache[id]
|
||||
if instancePool == nil {
|
||||
return nil, errors.New("instance pool was not found in the cache")
|
||||
}
|
||||
|
||||
return instancePool, nil
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) setInstancePool(np *core.InstancePool) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
c.poolCache[*np.Id] = np
|
||||
c.targetSize[*np.Id] = *np.Size
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) getInstanceSummaries(poolID string) (*[]core.InstanceSummary, error) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
return c.getInstanceSummariesWithoutLock(poolID)
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) getInstanceSummariesWithoutLock(poolID string) (*[]core.InstanceSummary, error) {
|
||||
instanceSummaries := c.instanceSummaryCache[poolID]
|
||||
if instanceSummaries == nil {
|
||||
return nil, errors.New("instance summaries for instance pool id " + poolID + " were not found in cache")
|
||||
}
|
||||
|
||||
return instanceSummaries, nil
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) setInstanceSummaries(instancePoolID string, is *[]core.InstanceSummary) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
c.instanceSummaryCache[instancePoolID] = is
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) setSize(instancePoolID string, size int) error {
|
||||
|
||||
if instancePoolID == "" {
|
||||
return errors.New("instance-pool is required")
|
||||
}
|
||||
|
||||
getInstancePoolResp, err := c.computeManagementClient.GetInstancePool(context.Background(), core.GetInstancePoolRequest{
|
||||
InstancePoolId: common.String(instancePoolID),
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
updateDetails := core.UpdateInstancePoolDetails{
|
||||
Size: common.Int(size),
|
||||
InstanceConfigurationId: getInstancePoolResp.InstanceConfigurationId,
|
||||
}
|
||||
|
||||
_, err = c.computeManagementClient.UpdateInstancePool(context.Background(), core.UpdateInstancePoolRequest{
|
||||
InstancePoolId: common.String(instancePoolID),
|
||||
UpdateInstancePoolDetails: updateDetails,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
c.targetSize[instancePoolID] = size
|
||||
|
||||
return c.waitForInstancePoolState(context.Background(), instancePoolID, core.InstancePoolLifecycleStateRunning)
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) waitForInstancePoolState(ctx context.Context, instancePoolID string, desiredState core.InstancePoolLifecycleStateEnum) error {
|
||||
timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||
defer cancel()
|
||||
err := wait.PollImmediateUntil(
|
||||
// TODO we need a better implementation of this function
|
||||
internalPollInterval,
|
||||
func() (bool, error) {
|
||||
getInstancePoolResp, err := c.computeManagementClient.GetInstancePool(context.Background(), core.GetInstancePoolRequest{
|
||||
InstancePoolId: common.String(instancePoolID),
|
||||
})
|
||||
if err != nil {
|
||||
klog.Errorf("getInstancePool failed. Retrying: %+v", err)
|
||||
return false, err
|
||||
} else if getInstancePoolResp.LifecycleState != desiredState {
|
||||
klog.V(4).Infof("waiting for instance-pool %s to enter state: %s (current state: %s)", instancePoolID,
|
||||
desiredState, getInstancePoolResp.LifecycleState)
|
||||
return false, nil
|
||||
}
|
||||
klog.V(3).Infof("instance pool %s is in desired state: %s", instancePoolID, desiredState)
|
||||
|
||||
return true, nil
|
||||
}, timeoutCtx.Done())
|
||||
if err != nil {
|
||||
// may be wait.ErrWaitTimeout
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *instancePoolCache) getSize(id string) (int, error) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
size, ok := c.targetSize[id]
|
||||
if !ok {
|
||||
return -1, errors.New("target size not found")
|
||||
}
|
||||
|
||||
return size, nil
|
||||
}
|
||||
|
|
@ -0,0 +1,513 @@
|
|||
/*
|
||||
Copyright 2021 Oracle and/or its affiliates.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package oci
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gopkg.in/gcfg.v1"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
apiv1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/klog/v2"
|
||||
kubeletapis "k8s.io/kubelet/pkg/apis"
|
||||
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common/auth"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
|
||||
)
|
||||
|
||||
var (
|
||||
internalPollInterval = 1 * time.Minute
|
||||
errInstanceInstancePoolNotFound = errors.New("instance-pool not found for instance")
|
||||
)
|
||||
|
||||
// InstancePoolManager defines the operations required for an *instance-pool based* autoscaler.
|
||||
type InstancePoolManager interface {
|
||||
// Refresh triggers refresh of cached resources.
|
||||
Refresh() error
|
||||
// Cleanup cleans up open resources before the cloud provider is destroyed, i.e. go routines etc.
|
||||
Cleanup() error
|
||||
|
||||
// GetInstancePools returns list of registered InstancePools.
|
||||
GetInstancePools() []*InstancePoolNodeGroup
|
||||
// GetInstancePoolNodes returns InstancePool nodes.
|
||||
GetInstancePoolNodes(ip InstancePoolNodeGroup) ([]cloudprovider.Instance, error)
|
||||
// GetInstancePoolForInstance returns InstancePool to which the given instance belongs.
|
||||
GetInstancePoolForInstance(instance OciRef) (*InstancePoolNodeGroup, error)
|
||||
// GetInstancePoolTemplateNode returns a template node for InstancePool.
|
||||
GetInstancePoolTemplateNode(ip InstancePoolNodeGroup) (*apiv1.Node, error)
|
||||
// GetInstancePoolSize gets the InstancePool size.
|
||||
GetInstancePoolSize(ip InstancePoolNodeGroup) (int, error)
|
||||
// SetInstancePoolSize sets the InstancePool size.
|
||||
SetInstancePoolSize(ip InstancePoolNodeGroup, size int) error
|
||||
// DeleteInstances deletes the given instances. All instances must be controlled by the same InstancePool.
|
||||
DeleteInstances(ip InstancePoolNodeGroup, instances []OciRef) error
|
||||
}
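
// exampleScaleUp is a minimal, hypothetical sketch (not used by the provider)
// of how a caller could drive the InstancePoolManager interface above: read the
// current target size of a pool, then request one additional node.
func exampleScaleUp(m InstancePoolManager, ip InstancePoolNodeGroup) error {
	size, err := m.GetInstancePoolSize(ip)
	if err != nil {
		return err
	}
	// SetInstancePoolSize returns without waiting for the new instance to become active.
	return m.SetInstancePoolSize(ip, size+1)
}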
|
||||
|
||||
// InstancePoolManagerImpl is the implementation of an instance-pool based autoscaler on OCI.
|
||||
type InstancePoolManagerImpl struct {
|
||||
cfg *CloudConfig
|
||||
shapeGetter ShapeGetter
|
||||
staticInstancePools map[string]*InstancePoolNodeGroup
|
||||
lastRefresh time.Time
|
||||
// caches the instance pool and instance summary objects received from OCI.
|
||||
// All interactions with OCI's API should go through the poolCache.
|
||||
instancePoolCache *instancePoolCache
|
||||
}
|
||||
|
||||
// CreateInstancePoolManager constructs the InstancePoolManager object.
|
||||
func CreateInstancePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, kubeClient kubernetes.Interface) (InstancePoolManager, error) {
|
||||
|
||||
var err error
|
||||
var configProvider common.ConfigurationProvider
|
||||
var cloudConfig = &CloudConfig{}
|
||||
|
||||
// cloudConfigPath is the optional file of variables passed in with the --cloud-config flag, which takes precedence over environment variables
|
||||
if cloudConfigPath != "" {
|
||||
config, fileErr := os.Open(cloudConfigPath)
|
||||
if fileErr != nil {
|
||||
klog.Fatalf("could not open cloud provider configuration %s: %#v", cloudConfigPath, fileErr)
|
||||
}
|
||||
defer config.Close()
|
||||
if config != nil {
|
||||
if err := gcfg.ReadInto(cloudConfig, config); err != nil {
|
||||
klog.Errorf("could not read config: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
// Fall back to environment variables
|
||||
if cloudConfig.Global.CompartmentID == "" {
|
||||
cloudConfig.Global.CompartmentID = os.Getenv(ociCompartmentEnvVar)
|
||||
} else if !cloudConfig.Global.UseInstancePrinciples {
|
||||
if os.Getenv(ociUseInstancePrincipalEnvVar) == "true" {
|
||||
cloudConfig.Global.UseInstancePrinciples = true
|
||||
}
|
||||
if os.Getenv(ociRegionEnvVar) != "" {
|
||||
cloudConfig.Global.Region = os.Getenv(ociRegionEnvVar)
|
||||
}
|
||||
}
|
||||
if cloudConfig.Global.RefreshInterval == 0 {
|
||||
if os.Getenv(ociRefreshInterval) != "" {
|
||||
klog.V(4).Info("using a custom cache refresh interval %v...", os.Getenv(ociRefreshInterval))
|
||||
cloudConfig.Global.RefreshInterval, _ = time.ParseDuration(os.Getenv(ociRefreshInterval))
|
||||
} else {
|
||||
cloudConfig.Global.RefreshInterval = defaultRefreshInterval
|
||||
}
|
||||
}
|
||||
|
||||
clientConfig := common.CustomClientConfiguration{
|
||||
RetryPolicy: newRetryPolicy(),
|
||||
}
|
||||
|
||||
if os.Getenv(ociUseInstancePrincipalEnvVar) == "true" {
|
||||
klog.V(4).Info("using instance principals...")
|
||||
region := os.Getenv(ociRegionEnvVar)
|
||||
if region == "" {
|
||||
klog.Fatalf("OCI_REGION is required when OCI_USE_INSTANCE_PRINCIPAL is set to true")
|
||||
}
|
||||
configProvider, err = auth.InstancePrincipalConfigurationProviderForRegion(common.StringToRegion(region))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
klog.Info("using default configuration provider")
|
||||
configProvider = common.DefaultConfigProvider()
|
||||
}
|
||||
providerRegion, _ := configProvider.Region()
|
||||
klog.Infof("OCI provider region: %s ", providerRegion)
|
||||
|
||||
computeMgmtClient, err := core.NewComputeManagementClientWithConfigurationProvider(configProvider)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "unable to create compute management client")
|
||||
}
|
||||
computeMgmtClient.SetCustomClientConfiguration(clientConfig)
|
||||
|
||||
computeClient, err := core.NewComputeClientWithConfigurationProvider(configProvider)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "unable to create compute client")
|
||||
}
|
||||
computeClient.SetCustomClientConfiguration(clientConfig)
|
||||
|
||||
networkClient, err := core.NewVirtualNetworkClientWithConfigurationProvider(configProvider)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "unable to create virtual network client")
|
||||
}
|
||||
networkClient.SetCustomClientConfiguration(clientConfig)
|
||||
|
||||
cloudConfig.Global.CompartmentID = os.Getenv(ociCompartmentEnvVar)
|
||||
|
||||
// If the compartment ID was not passed via --cloud-config or an environment variable, attempt to use the tenancy ID as the compartment ID.
|
||||
if cloudConfig.Global.CompartmentID == "" {
|
||||
tenancyID, err := configProvider.TenancyOCID()
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "unable to retrieve tenancy ID")
|
||||
}
|
||||
cloudConfig.Global.CompartmentID = tenancyID
|
||||
}
|
||||
|
||||
ipManager := &InstancePoolManagerImpl{
|
||||
cfg: cloudConfig,
|
||||
staticInstancePools: map[string]*InstancePoolNodeGroup{},
|
||||
shapeGetter: createShapeGetter(ShapeClientImpl{computeMgmtClient: computeMgmtClient, computeClient: computeClient}),
|
||||
instancePoolCache: newInstancePoolCache(&computeMgmtClient, &computeClient, &networkClient),
|
||||
}
|
||||
|
||||
// Contains all the specs from the args that give us the pools.
|
||||
for _, arg := range discoveryOpts.NodeGroupSpecs {
|
||||
ip, err := instancePoolFromArg(arg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to construct instance pool from argument: %v", err)
|
||||
}
|
||||
|
||||
ip.manager = ipManager
|
||||
ip.kubeClient = kubeClient
|
||||
|
||||
ipManager.staticInstancePools[ip.Id()] = ip
|
||||
}
|
||||
|
||||
// wait until we have an initial full poolCache.
|
||||
err = wait.PollImmediateInfinite(
|
||||
10*time.Second,
|
||||
func() (bool, error) {
|
||||
err := ipManager.Refresh()
|
||||
if err != nil {
|
||||
klog.Errorf("unable to fill cache on startup. Retrying: %+v", err)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
return true, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return ipManager, nil
|
||||
}
|
||||
|
||||
// instancePoolFromArg parses an instance-pool spec in the form `<minSize>:<maxSize>:<ocid>` and produces an instance pool wrapper spec object.
|
||||
func instancePoolFromArg(value string) (*InstancePoolNodeGroup, error) {
|
||||
|
||||
if !strings.Contains(value, ociInstancePoolResourceIdent) {
|
||||
return nil, fmt.Errorf("instance pool manager does not work with resources of type: %s", value)
|
||||
}
|
||||
|
||||
tokens := strings.SplitN(value, ":", 3)
|
||||
if len(tokens) != 3 || !strings.HasPrefix(tokens[2], "ocid") {
|
||||
return nil, fmt.Errorf("incorrect instance configuration: %s", value)
|
||||
}
|
||||
|
||||
spec := &InstancePoolNodeGroup{}
|
||||
if size, err := strconv.Atoi(tokens[0]); err == nil {
|
||||
spec.minSize = size
|
||||
} else {
|
||||
return nil, fmt.Errorf("failed to set pool min size: %s %v", tokens[0], err)
|
||||
}
|
||||
|
||||
if size, err := strconv.Atoi(tokens[1]); err == nil {
|
||||
spec.maxSize = size
|
||||
} else {
|
||||
return nil, fmt.Errorf("failed to set pool max size: %s %v", tokens[1], err)
|
||||
}
|
||||
|
||||
spec.id = tokens[2]
|
||||
|
||||
klog.Infof("static instance pool wrapper spec constructed: %+v", spec)
|
||||
|
||||
return spec, nil
|
||||
}
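
// exampleInstancePoolFromArg is a hypothetical sketch (the OCID below is a
// placeholder, not a real pool) showing how a single --nodes value of the form
// <minSize>:<maxSize>:<ocid> becomes a static instance pool wrapper spec.
func exampleInstancePoolFromArg() {
	spec, err := instancePoolFromArg("1:10:ocid1.instancepool.oc1.phx.exampleonly")
	if err != nil {
		klog.Fatalf("invalid --nodes value: %v", err)
	}
	// spec.minSize == 1, spec.maxSize == 10, spec.id == "ocid1.instancepool.oc1.phx.exampleonly"
	klog.Infof("pool %s scales between %d and %d nodes", spec.id, spec.minSize, spec.maxSize)
}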
|
||||
|
||||
// Refresh triggers refresh of cached resources.
|
||||
func (m *InstancePoolManagerImpl) Refresh() error {
|
||||
if m.lastRefresh.Add(m.cfg.Global.RefreshInterval).After(time.Now()) {
|
||||
return nil
|
||||
}
|
||||
|
||||
return m.forceRefresh()
|
||||
}
|
||||
|
||||
func (m *InstancePoolManagerImpl) forceRefresh() error {
|
||||
if m.cfg == nil {
|
||||
return errors.New("instance pool manager does have a required config")
|
||||
}
|
||||
err := m.instancePoolCache.rebuild(m.staticInstancePools, *m.cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
m.lastRefresh = time.Now()
|
||||
klog.Infof("Refreshed instance-pool list, next refresh after %v", m.lastRefresh.Add(m.cfg.Global.RefreshInterval))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Cleanup cleans up open resources before the cloud provider is destroyed, i.e. go routines etc.
|
||||
func (m *InstancePoolManagerImpl) Cleanup() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetInstancePools returns list of registered InstancePools.
|
||||
func (m *InstancePoolManagerImpl) GetInstancePools() []*InstancePoolNodeGroup {
|
||||
var instancePools []*InstancePoolNodeGroup
|
||||
for _, np := range m.staticInstancePools {
|
||||
instancePools = append(instancePools, np)
|
||||
}
|
||||
return instancePools
|
||||
}
|
||||
|
||||
// GetInstancePoolNodes returns InstancePool nodes that are not in a terminal state.
|
||||
func (m *InstancePoolManagerImpl) GetInstancePoolNodes(ip InstancePoolNodeGroup) ([]cloudprovider.Instance, error) {
|
||||
|
||||
klog.V(4).Infof("getting instances for node pool: %q", ip.Id())
|
||||
|
||||
instanceSummaries, err := m.instancePoolCache.getInstanceSummaries(ip.Id())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var providerInstances []cloudprovider.Instance
|
||||
for _, instance := range *instanceSummaries {
|
||||
status := &cloudprovider.InstanceStatus{}
|
||||
switch *instance.State {
|
||||
case string(core.InstanceLifecycleStateRunning):
|
||||
status.State = cloudprovider.InstanceRunning
|
||||
case string(core.InstanceLifecycleStateCreatingImage):
|
||||
status.State = cloudprovider.InstanceCreating
|
||||
case string(core.InstanceLifecycleStateStarting):
|
||||
status.State = cloudprovider.InstanceCreating
|
||||
case string(core.InstanceLifecycleStateMoving):
|
||||
status.State = cloudprovider.InstanceCreating
|
||||
case string(core.InstanceLifecycleStateProvisioning):
|
||||
status.State = cloudprovider.InstanceCreating
|
||||
case string(core.InstanceLifecycleStateTerminating):
|
||||
status.State = cloudprovider.InstanceDeleting
|
||||
case string(core.InstanceLifecycleStateStopping):
|
||||
status.State = cloudprovider.InstanceDeleting
|
||||
}
|
||||
|
||||
// Instance not in a terminal or unknown state, ok to add.
|
||||
if status.State != 0 {
|
||||
providerInstances = append(providerInstances, cloudprovider.Instance{
|
||||
Id: *instance.Id,
|
||||
Status: status,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return providerInstances, nil
|
||||
}
|
||||
|
||||
// GetInstancePoolForInstance returns InstancePool to which the given instance belongs. If
|
||||
// PoolID is not set on the specified OciRef, we will look for a match.
|
||||
func (m *InstancePoolManagerImpl) GetInstancePoolForInstance(instanceDetails OciRef) (*InstancePoolNodeGroup, error) {
|
||||
|
||||
if instanceDetails.PoolID != "" {
|
||||
// It's possible that this instance belongs to an instance pool that was not specified via --nodes argument.
|
||||
return m.staticInstancePools[instanceDetails.PoolID], nil
|
||||
}
|
||||
|
||||
if instanceDetails.CompartmentID == "" {
|
||||
// cfg.Global.CompartmentID would be set to tenancy OCID at runtime if compartment was not set.
|
||||
instanceDetails.CompartmentID = m.cfg.Global.CompartmentID
|
||||
}
|
||||
|
||||
// Details are missing from this instance - including the pool ID.
|
||||
// Try to resolve them, though it may not be a member of an instance-pool we manage.
|
||||
resolvedInstanceDetails, err := m.instancePoolCache.findInstanceByDetails(instanceDetails)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if resolvedInstanceDetails == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
kubeClient := m.staticInstancePools[resolvedInstanceDetails.PoolID].kubeClient
|
||||
|
||||
// Optionally annotate & label the node so that it does not need to be searched for in subsequent iterations.
|
||||
_ = annotateNode(kubeClient, resolvedInstanceDetails.Name, ociInstanceIDAnnotation, resolvedInstanceDetails.InstanceID)
|
||||
_ = annotateNode(kubeClient, resolvedInstanceDetails.Name, ociInstancePoolIDAnnotation, resolvedInstanceDetails.PoolID)
|
||||
_ = annotateNode(kubeClient, resolvedInstanceDetails.Name, ociAnnotationCompartmentID, resolvedInstanceDetails.CompartmentID)
|
||||
_ = labelNode(kubeClient, resolvedInstanceDetails.Name, apiv1.LabelTopologyZone, resolvedInstanceDetails.AvailabilityDomain)
|
||||
_ = labelNode(kubeClient, resolvedInstanceDetails.Name, apiv1.LabelFailureDomainBetaZone, resolvedInstanceDetails.AvailabilityDomain)
|
||||
_ = labelNode(kubeClient, resolvedInstanceDetails.Name, apiv1.LabelInstanceType, resolvedInstanceDetails.Shape)
|
||||
_ = labelNode(kubeClient, resolvedInstanceDetails.Name, apiv1.LabelInstanceTypeStable, resolvedInstanceDetails.Shape)
|
||||
_ = setNodeProviderID(kubeClient, resolvedInstanceDetails.Name, resolvedInstanceDetails.InstanceID)
|
||||
|
||||
return m.staticInstancePools[resolvedInstanceDetails.PoolID], nil
|
||||
}
|
||||
|
||||
// GetInstancePoolTemplateNode returns a template node for the InstancePool.
|
||||
func (m *InstancePoolManagerImpl) GetInstancePoolTemplateNode(ip InstancePoolNodeGroup) (*apiv1.Node, error) {
|
||||
|
||||
instancePool, err := m.instancePoolCache.getInstancePool(ip.Id())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
node, err := m.buildNodeFromTemplate(instancePool)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return node, nil
|
||||
}
|
||||
|
||||
// GetInstancePoolSize gets the instance-pool size.
|
||||
func (m *InstancePoolManagerImpl) GetInstancePoolSize(ip InstancePoolNodeGroup) (int, error) {
|
||||
return m.instancePoolCache.getSize(ip.Id())
|
||||
}
|
||||
|
||||
// SetInstancePoolSize sets instance-pool size.
|
||||
func (m *InstancePoolManagerImpl) SetInstancePoolSize(np InstancePoolNodeGroup, size int) error {
|
||||
|
||||
err := m.instancePoolCache.setSize(np.Id(), size)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Interface says this function should wait until node group size is updated.
|
||||
|
||||
// We do not wait for the work request to finish or nodes become active on purpose. This allows
|
||||
// the autoscaler to make decisions quicker especially since the autoscaler is aware of
|
||||
// unregistered nodes in addition to registered nodes.
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeleteInstances deletes the given instances. All instances must be controlled by the same instance-pool.
|
||||
func (m *InstancePoolManagerImpl) DeleteInstances(instancePool InstancePoolNodeGroup, instances []OciRef) error {
|
||||
klog.Infof("DeleteInstances called on instance pool %s", instancePool.Id())
|
||||
|
||||
for _, instance := range instances {
|
||||
// removeInstance auto-decrements the instance pool size.
|
||||
detached := m.instancePoolCache.removeInstance(instancePool, instance.InstanceID)
|
||||
if !detached {
|
||||
return fmt.Errorf("could not delete instance %s from instance pool %s", instance.InstanceID, instancePool.Id())
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *InstancePoolManagerImpl) buildNodeFromTemplate(instancePool *core.InstancePool) (*apiv1.Node, error) {
|
||||
|
||||
node := apiv1.Node{}
|
||||
nodeName := fmt.Sprintf("%s-%d", "inst", 555555)
|
||||
|
||||
ocidParts := strings.Split(*instancePool.Id, ".")
|
||||
instanceIDPlaceholder := ocidParts[0] + "." + "instance" + "." + ocidParts[2] + "." + ocidParts[3] + ".tbd"
|
||||
|
||||
annotations := make(map[string]string)
|
||||
annotations[ociAnnotationCompartmentID] = *instancePool.CompartmentId
|
||||
annotations[ociInstancePoolIDAnnotation] = *instancePool.Id
|
||||
annotations[ociInstanceIDAnnotation] = instanceIDPlaceholder
|
||||
|
||||
node.ObjectMeta = metav1.ObjectMeta{
|
||||
Name: nodeName,
|
||||
Labels: map[string]string{},
|
||||
Annotations: annotations,
|
||||
}
|
||||
|
||||
node.Status = apiv1.NodeStatus{
|
||||
Capacity: apiv1.ResourceList{},
|
||||
}
|
||||
shape, err := m.shapeGetter.GetInstancePoolShape(instancePool)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if shape.GPU > 0 {
|
||||
node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{
|
||||
Key: "nvidia.com/gpu",
|
||||
Value: "",
|
||||
Effect: "NoSchedule",
|
||||
})
|
||||
}
|
||||
|
||||
node.Status.Capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
|
||||
node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(int64(shape.CPU), resource.DecimalSI)
|
||||
node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(int64(shape.MemoryInBytes), resource.DecimalSI)
|
||||
node.Status.Capacity[ResourceGPU] = *resource.NewQuantity(int64(shape.GPU), resource.DecimalSI)
|
||||
|
||||
node.Status.Allocatable = node.Status.Capacity
|
||||
|
||||
availabilityDomain, err := getInstancePoolAvailabilityDomain(instancePool)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
node.Labels = cloudprovider.JoinStringMaps(node.Labels, buildGenericLabelsForInstancePool(instancePool, nodeName, shape.Name, availabilityDomain))
|
||||
|
||||
node.Status.Conditions = cloudprovider.BuildReadyConditions()
|
||||
return &node, nil
|
||||
}
|
||||
|
||||
// getInstancePoolAvailabilityDomain determines the availability domain of the instance pool.
|
||||
// This breaks down if the customer specifies more than one placement configuration,
|
||||
// so the best practice is one node pool per AD if customers care about AD placement during scheduling.
|
||||
// If more than one AD is defined, we always return the first one.
|
||||
func getInstancePoolAvailabilityDomain(ip *core.InstancePool) (string, error) {
|
||||
if len(ip.PlacementConfigurations) == 0 {
|
||||
// At least one placement configuration is required for an instance pool, so we should not get here.
|
||||
return "", fmt.Errorf("instance-pool %q does not have the required placement configurations", *ip.Id)
|
||||
}
|
||||
|
||||
if len(ip.PlacementConfigurations) > 1 {
|
||||
klog.Warningf("instance-pool %q has more than 1 placement config so picking first availability domain", *ip.Id)
|
||||
}
|
||||
|
||||
// Get the availability domain which is by default in the format of `Uocm:PHX-AD-1`
|
||||
// and remove the hash prefix.
|
||||
availabilityDomain := strings.Split(*ip.PlacementConfigurations[0].AvailabilityDomain, ":")[1]
|
||||
return availabilityDomain, nil
|
||||
}
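
// exampleAvailabilityDomainParsing is a hypothetical sketch (not called by the
// provider) of the prefix stripping performed above: OCI returns availability
// domains such as "Uocm:PHX-AD-1", and only the part after the colon is used
// as the zone label value.
func exampleAvailabilityDomainParsing() {
	raw := "Uocm:PHX-AD-1"
	ad := strings.Split(raw, ":")[1]
	klog.Infof("zone label value: %s", ad) // logs "PHX-AD-1"
}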
|
||||
|
||||
func buildGenericLabelsForInstancePool(instancePool *core.InstancePool, nodeName, shape, availabilityDomain string) map[string]string {
|
||||
result := make(map[string]string)
|
||||
result[kubeletapis.LabelArch] = cloudprovider.DefaultArch
|
||||
result[apiv1.LabelArchStable] = cloudprovider.DefaultArch
|
||||
|
||||
result[kubeletapis.LabelOS] = cloudprovider.DefaultOS
|
||||
result[apiv1.LabelOSStable] = cloudprovider.DefaultOS
|
||||
|
||||
parts := strings.Split(*instancePool.Id, ".")
|
||||
if len(parts) == 5 {
|
||||
// backward compatibility with older pod labels
|
||||
result[apiv1.LabelZoneRegion] = parts[3]
|
||||
result[apiv1.LabelZoneRegionStable] = parts[3]
|
||||
}
|
||||
|
||||
result[apiv1.LabelInstanceType] = shape
|
||||
result[apiv1.LabelInstanceTypeStable] = shape
|
||||
|
||||
result[apiv1.LabelZoneFailureDomain] = availabilityDomain
|
||||
// backward compatibility with older pod labels
|
||||
result[apiv1.LabelZoneFailureDomainStable] = availabilityDomain
|
||||
|
||||
result[apiv1.LabelHostname] = nodeName
|
||||
|
||||
return result
|
||||
}
|
||||
|
|
@ -0,0 +1,511 @@
|
|||
package oci
|
||||
|
||||
import (
|
||||
"context"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
|
||||
)
|
||||
|
||||
type mockComputeManagementClient struct {
|
||||
err error
|
||||
getInstancePoolResponse core.GetInstancePoolResponse
|
||||
getInstancePoolInstanceResponse core.GetInstancePoolInstanceResponse
|
||||
listInstancePoolInstancesResponse core.ListInstancePoolInstancesResponse
|
||||
updateInstancePoolResponse core.UpdateInstancePoolResponse
|
||||
detachInstancePoolInstanceResponse core.DetachInstancePoolInstanceResponse
|
||||
}
|
||||
|
||||
type mockVirtualNetworkClient struct {
|
||||
err error
|
||||
getVnicResponse core.GetVnicResponse
|
||||
}
|
||||
|
||||
type mockComputeClient struct {
|
||||
err error
|
||||
listVnicAttachmentsResponse core.ListVnicAttachmentsResponse
|
||||
}
|
||||
|
||||
func (m *mockComputeClient) ListVnicAttachments(ctx context.Context, request core.ListVnicAttachmentsRequest) (core.ListVnicAttachmentsResponse, error) {
|
||||
return m.listVnicAttachmentsResponse, m.err
|
||||
}
|
||||
|
||||
func (m *mockVirtualNetworkClient) GetVnic(context.Context, core.GetVnicRequest) (core.GetVnicResponse, error) {
|
||||
return m.getVnicResponse, m.err
|
||||
}
|
||||
|
||||
func (m *mockComputeManagementClient) ListInstancePoolInstances(_ context.Context, _ core.ListInstancePoolInstancesRequest) (core.ListInstancePoolInstancesResponse, error) {
|
||||
return m.listInstancePoolInstancesResponse, m.err
|
||||
}
|
||||
|
||||
func (m *mockComputeManagementClient) GetInstancePool(context.Context, core.GetInstancePoolRequest) (core.GetInstancePoolResponse, error) {
|
||||
return m.getInstancePoolResponse, m.err
|
||||
}
|
||||
|
||||
func (m *mockComputeManagementClient) UpdateInstancePool(context.Context, core.UpdateInstancePoolRequest) (core.UpdateInstancePoolResponse, error) {
|
||||
return m.updateInstancePoolResponse, m.err
|
||||
}
|
||||
|
||||
func (m *mockComputeManagementClient) GetInstancePoolInstance(context.Context, core.GetInstancePoolInstanceRequest) (core.GetInstancePoolInstanceResponse, error) {
|
||||
return m.getInstancePoolInstanceResponse, m.err
|
||||
}
|
||||
|
||||
func (m *mockComputeManagementClient) DetachInstancePoolInstance(context.Context, core.DetachInstancePoolInstanceRequest) (core.DetachInstancePoolInstanceResponse, error) {
|
||||
return m.detachInstancePoolInstanceResponse, m.err
|
||||
}
|
||||
|
||||
var computeClient = &mockComputeClient{
|
||||
err: nil,
|
||||
listVnicAttachmentsResponse: core.ListVnicAttachmentsResponse{
|
||||
RawResponse: nil,
|
||||
Items: []core.VnicAttachment{{
|
||||
Id: common.String("ocid1.vnic.oc1.phx.abc"),
|
||||
LifecycleState: core.VnicAttachmentLifecycleStateAttached,
|
||||
}},
|
||||
},
|
||||
}
|
||||
|
||||
var computeManagementClient = &mockComputeManagementClient{
|
||||
err: nil,
|
||||
getInstancePoolResponse: core.GetInstancePoolResponse{
|
||||
InstancePool: core.InstancePool{
|
||||
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
|
||||
LifecycleState: core.InstancePoolLifecycleStateRunning,
|
||||
Size: common.Int(2),
|
||||
},
|
||||
},
|
||||
listInstancePoolInstancesResponse: core.ListInstancePoolInstancesResponse{
|
||||
RawResponse: nil,
|
||||
Items: []core.InstanceSummary{{
|
||||
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
|
||||
AvailabilityDomain: common.String("Uocm:PHX-AD-2"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
DisplayName: common.String("inst-1ncvn-ociinstancepool"),
|
||||
Shape: common.String("VM.Standard2.8"),
|
||||
State: common.String(string(core.InstanceLifecycleStateRunning)),
|
||||
}, {
|
||||
Id: common.String("ocid1.instance.oc1.phx.aaacachemiss"),
|
||||
AvailabilityDomain: common.String("Uocm:PHX-AD-2"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
DisplayName: common.String("inst-2ncvn-ociinstancepool"),
|
||||
Shape: common.String("VM.Standard2.8"),
|
||||
State: common.String(string(core.InstanceLifecycleStateRunning)),
|
||||
}},
|
||||
},
|
||||
}
|
||||
|
||||
var virtualNetworkClient = &mockVirtualNetworkClient{
|
||||
err: nil,
|
||||
getVnicResponse: core.GetVnicResponse{
|
||||
RawResponse: nil,
|
||||
Vnic: core.Vnic{
|
||||
Id: common.String("ocid1.vnic.oc1.phx.abyhqljsxigued23s7ywgcqlbpqfiysgnhxj672awzjluhoopzf7l7wvm6rq"),
|
||||
PrivateIp: common.String("10.0.20.59"),
|
||||
PublicIp: common.String("129.146.58.250"),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
func TestInstancePoolFromArgs(t *testing.T) {
|
||||
|
||||
value := `1:5:ocid1.instancepool.oc1.phx.aaaaaaaah`
|
||||
instanceNodePool, err := instancePoolFromArg(value)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
if instanceNodePool.minSize != 1 {
|
||||
t.Errorf("got minSize %d ; wanted minSize 1", instanceNodePool.minSize)
|
||||
}
|
||||
|
||||
if instanceNodePool.maxSize != 5 {
|
||||
t.Errorf("got maxSize %d ; wanted maxSize 1", instanceNodePool.maxSize)
|
||||
}
|
||||
|
||||
if instanceNodePool.id != "ocid1.instancepool.oc1.phx.aaaaaaaah" {
|
||||
t.Errorf("got ocid %q ; wanted id \"ocid1.instancepool.oc1.phx.aaaaaaaah\"", instanceNodePool.id)
|
||||
}
|
||||
|
||||
value = `1:5:ocid1.nodepool.oc1.phx.aaaaaaaah`
|
||||
_, err = instancePoolFromArg(value)
|
||||
if err == nil {
|
||||
t.Fatal("expected error processing an oke based node-pool")
|
||||
}
|
||||
|
||||
value = `1:5:incorrect:ocid1.instancepool.oc1.phx.aaaaaaaah`
|
||||
_, err = instancePoolFromArg(value)
|
||||
if err == nil {
|
||||
t.Fatal("expected error of an invalid instance pool")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetSetInstancePoolSize(t *testing.T) {
|
||||
nodePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
|
||||
nodePoolCache.targetSize["ocid1.instancepool.oc1.phx.aaaaaaaai"] = 5
|
||||
|
||||
manager := &InstancePoolManagerImpl{instancePoolCache: nodePoolCache}
|
||||
size, err := manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaai"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
if size != 5 {
|
||||
t.Errorf("got size %d ; wanted size 5", size)
|
||||
}
|
||||
|
||||
err = manager.SetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaai"}, 6)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
size, err = manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaai"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
if size != 6 {
|
||||
t.Errorf("got size %d ; wanted size 6", size)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestGetInstancePoolForInstance(t *testing.T) {
|
||||
|
||||
nodePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
|
||||
nodePoolCache.poolCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &core.InstancePool{
|
||||
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
|
||||
Size: common.Int(1),
|
||||
}
|
||||
|
||||
var cloudConfig = CloudConfig{}
|
||||
cloudConfig.Global.CompartmentID = "compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf"
|
||||
|
||||
manager := &InstancePoolManagerImpl{
|
||||
cfg: &cloudConfig,
|
||||
staticInstancePools: map[string]*InstancePoolNodeGroup{
|
||||
"ocid1.instancepool.oc1.phx.aaaaaaaa1": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"},
|
||||
"ocid1.instancepool.oc1.phx.aaaaaaaa2": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa2"},
|
||||
},
|
||||
instancePoolCache: nodePoolCache,
|
||||
}
|
||||
|
||||
// first verify instance pool can be found when only the instance id is specified.
|
||||
np, err := manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaa1"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
if np.Id() != "ocid1.instancepool.oc1.phx.aaaaaaaa1" {
|
||||
t.Fatalf("got unexpected ocid %q ; wanted \"ocid1.instancepool.oc1.phx.aaaaaaaa1\"", np.Id())
|
||||
}
|
||||
|
||||
// next, verify a valid instance can be found if it is currently missing from the cache.
|
||||
np, err = manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaacachemiss"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
if np.Id() != "ocid1.instancepool.oc1.phx.aaaaaaaa1" {
|
||||
t.Fatalf("got unexpected ocid %q ; wanted \"ocid1.instancepool.oc1.phx.aaaaaaaa1s\"", np.Id())
|
||||
}
|
||||
|
||||
// next, verify an invalid instance cannot be found if it is missing from the cache and pool.
|
||||
_, err = manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaadne"})
|
||||
if err != errInstanceInstancePoolNotFound {
|
||||
t.Fatalf("epected error looking for an invalid instance")
|
||||
}
|
||||
|
||||
// verify an unknown instance pool id returns a nil instance pool without an error.
|
||||
ip, err := manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaadne", PoolID: "ocid1.instancepool.oc1.phx.aaaaaaaadne"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
if ip != nil {
|
||||
t.Fatalf("expected nil looking for an instance with invalid instance & pool ids")
|
||||
}
|
||||
|
||||
// next verify instance pool can be found when the instance pool id is specified directly.
|
||||
_, err = manager.GetInstancePoolForInstance(OciRef{PoolID: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
// next verify instance pool can be found when only the private IP is specified.
|
||||
np, err = manager.GetInstancePoolForInstance(OciRef{
|
||||
PrivateIPAddress: "10.0.20.59"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
if np.Id() != "ocid1.instancepool.oc1.phx.aaaaaaaa1" {
|
||||
t.Fatalf("got unexpected ocid %q ; wanted \"ocid1.instancepool.oc1.phx.aaaaaaaa1\"", np.Id())
|
||||
}
|
||||
|
||||
// now verify the node pool can be found via a lookup by instance id in poolCache
|
||||
np, err = manager.GetInstancePoolForInstance(OciRef{InstanceID: "ocid1.instance.oc1.phx.aaa1"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
if np.Id() != "ocid1.instancepool.oc1.phx.aaaaaaaa1" {
|
||||
t.Fatalf("got unexpected ocid %q ; wanted \"ocid1.instancepool.oc1.phx.aaaaaaaa1\"", np.Id())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestGetInstancePoolNodes(t *testing.T) {
|
||||
|
||||
nodePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
|
||||
nodePoolCache.poolCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &core.InstancePool{
|
||||
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaa1"),
|
||||
LifecycleState: core.InstancePoolLifecycleStateRunning,
|
||||
}
|
||||
nodePoolCache.instanceSummaryCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &[]core.InstanceSummary{{
|
||||
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
|
||||
AvailabilityDomain: common.String("PHX-AD-2"),
|
||||
State: common.String(string(core.InstanceLifecycleStateRunning)),
|
||||
}, {
|
||||
Id: common.String("ocid1.instance.oc1.phx.aaa2"),
|
||||
AvailabilityDomain: common.String("PHX-AD-1"),
|
||||
State: common.String(string(core.InstanceLifecycleStateTerminating)),
|
||||
},
|
||||
}
|
||||
|
||||
expected := []cloudprovider.Instance{
|
||||
{
|
||||
Id: "ocid1.instance.oc1.phx.aaa1",
|
||||
Status: &cloudprovider.InstanceStatus{
|
||||
State: cloudprovider.InstanceRunning,
|
||||
},
|
||||
},
|
||||
{
|
||||
Id: "ocid1.instance.oc1.phx.aaa2",
|
||||
Status: &cloudprovider.InstanceStatus{
|
||||
State: cloudprovider.InstanceDeleting,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
manager := &InstancePoolManagerImpl{instancePoolCache: nodePoolCache, cfg: &CloudConfig{}}
|
||||
instances, err := manager.GetInstancePoolNodes(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
|
||||
if err != nil {
|
||||
t.Fatalf("received unexpected error; %+v", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(instances, expected) {
|
||||
t.Errorf("got %+v\nwanted %+v", instances, expected)
|
||||
}
|
||||
|
||||
err = manager.forceRefresh()
|
||||
if err != nil {
|
||||
t.Fatalf("received unexpected error refreshing cache; %+v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetInstancePoolAvailabilityDomain(t *testing.T) {
|
||||
testCases := map[string]struct {
|
||||
np *core.InstancePool
|
||||
result string
|
||||
expectedErr bool
|
||||
}{
|
||||
"single ad": {
|
||||
np: &core.InstancePool{
|
||||
Id: common.String("id"),
|
||||
LifecycleState: "",
|
||||
PlacementConfigurations: []core.InstancePoolPlacementConfiguration{{
|
||||
AvailabilityDomain: common.String("hash:US-ASHBURN-1"),
|
||||
PrimarySubnetId: common.String("ocid1.subnet.oc1.phx.aaaaaaaa1"),
|
||||
}},
|
||||
Size: common.Int(2),
|
||||
},
|
||||
result: "US-ASHBURN-1",
|
||||
},
|
||||
"multi-ad": {
|
||||
np: &core.InstancePool{
|
||||
Id: common.String("id"),
|
||||
LifecycleState: "",
|
||||
PlacementConfigurations: []core.InstancePoolPlacementConfiguration{{
|
||||
AvailabilityDomain: common.String("hash:US-ASHBURN-1"),
|
||||
PrimarySubnetId: common.String("ocid1.subnet.oc1.phx.aaaaaaaa1"),
|
||||
}, {
|
||||
AvailabilityDomain: common.String("hash:US-ASHBURN-2"),
|
||||
PrimarySubnetId: common.String("ocid1.subnet.oc1.phx.aaaaaaaa2"),
|
||||
}},
|
||||
Size: common.Int(2),
|
||||
},
|
||||
result: "US-ASHBURN-1",
|
||||
},
|
||||
}
|
||||
|
||||
for name, tc := range testCases {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
ad, err := getInstancePoolAvailabilityDomain(tc.np)
|
||||
if tc.expectedErr {
|
||||
if err == nil {
|
||||
t.Fatalf("expected err but not nil")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if ad != tc.result {
|
||||
t.Errorf("got %q ; wanted %q", ad, tc.result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetInstancePoolsAndInstances(t *testing.T) {
|
||||
|
||||
var computeManagementClient = &mockComputeManagementClient{
|
||||
getInstancePoolResponse: core.GetInstancePoolResponse{
|
||||
InstancePool: core.InstancePool{
|
||||
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
|
||||
PlacementConfigurations: nil,
|
||||
Size: common.Int(2),
|
||||
},
|
||||
},
|
||||
listInstancePoolInstancesResponse: core.ListInstancePoolInstancesResponse{
|
||||
Items: []core.InstanceSummary{{
|
||||
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
|
||||
AvailabilityDomain: common.String("Uocm:PHX-AD-2"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
DisplayName: common.String("inst-1ncvn-ociinstancepool"),
|
||||
Shape: common.String("VM.Standard2.8"),
|
||||
State: common.String(string(core.InstanceLifecycleStateRunning)),
|
||||
}, {
|
||||
Id: common.String("ocid1.instance.oc1.phx.aaaterminal"),
|
||||
AvailabilityDomain: common.String("Uocm:PHX-AD-2"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
DisplayName: common.String("inst-2ncvn-ociinstancepool"),
|
||||
Shape: common.String("VM.Standard2.8"),
|
||||
State: common.String(string(core.InstanceLifecycleStateTerminated)),
|
||||
}},
|
||||
},
|
||||
}
|
||||
|
||||
cloudConfig := &CloudConfig{}
|
||||
cloudConfig.Global.CompartmentID = "ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
manager := &InstancePoolManagerImpl{
|
||||
cfg: cloudConfig,
|
||||
staticInstancePools: map[string]*InstancePoolNodeGroup{
|
||||
"ocid1.instancepool.oc1.phx.aaaaaaaa1": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"},
|
||||
},
|
||||
instancePoolCache: newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient),
|
||||
}
|
||||
|
||||
// Populate cache(s) (twice to increase code coverage).
|
||||
_ = manager.Refresh()
|
||||
err := manager.Refresh()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
err = manager.forceRefresh()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
instancePoolNodeGroups := manager.GetInstancePools()
|
||||
if got := len(instancePoolNodeGroups); got != 1 {
|
||||
t.Fatalf("expected 1 (static) instance pool, got %d", got)
|
||||
}
|
||||
instances, err := manager.GetInstancePoolNodes(*instancePoolNodeGroups[0])
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
// Should not pick up terminated instance.
|
||||
if got := len(instances); got != 1 {
|
||||
t.Fatalf("expected 1 instance, got %d", got)
|
||||
}
|
||||
|
||||
instancePoolNodeGroup, err := manager.GetInstancePoolForInstance(OciRef{InstanceID: instances[0].Id})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
if !reflect.DeepEqual(instancePoolNodeGroup, instancePoolNodeGroups[0]) {
|
||||
t.Errorf("got %+v\nwanted %+v", instancePoolNodeGroup, instancePoolNodeGroups[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeleteInstances(t *testing.T) {
|
||||
|
||||
var computeManagementClient = &mockComputeManagementClient{
|
||||
getInstancePoolResponse: core.GetInstancePoolResponse{
|
||||
InstancePool: core.InstancePool{
|
||||
Id: common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
|
||||
LifecycleState: core.InstancePoolLifecycleStateRunning,
|
||||
Size: common.Int(2),
|
||||
},
|
||||
},
|
||||
listInstancePoolInstancesResponse: core.ListInstancePoolInstancesResponse{
|
||||
Items: []core.InstanceSummary{{
|
||||
Id: common.String("ocid1.instance.oc1.phx.aaa1"),
|
||||
AvailabilityDomain: common.String("Uocm:PHX-AD-1"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
DisplayName: common.String("inst-1ncvn-ociinstancepool"),
|
||||
Shape: common.String("VM.Standard2.16"),
|
||||
State: common.String(string(core.InstanceLifecycleStateRunning)),
|
||||
}, {
|
||||
Id: common.String("ocid1.instance.oc1.phx.aaa2"),
|
||||
AvailabilityDomain: common.String("Uocm:PHX-AD-1"),
|
||||
CompartmentId: common.String("ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
|
||||
DisplayName: common.String("inst-2ncvn-ociinstancepool"),
|
||||
Shape: common.String("VM.Standard2.16"),
|
||||
State: common.String(string(core.InstanceLifecycleStateRunning)),
|
||||
}},
|
||||
},
|
||||
}
|
||||
|
||||
cloudConfig := &CloudConfig{}
|
||||
cloudConfig.Global.CompartmentID = "ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
manager := &InstancePoolManagerImpl{
|
||||
cfg: cloudConfig,
|
||||
staticInstancePools: map[string]*InstancePoolNodeGroup{
|
||||
"ocid1.instancepool.oc1.phx.aaaaaaaa1": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"},
|
||||
},
|
||||
instancePoolCache: newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient),
|
||||
}
|
||||
// Populate cache(s).
|
||||
manager.Refresh()
|
||||
|
||||
instances, err := manager.GetInstancePoolNodes(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
// Both instances are running and should be returned.
|
||||
if got := len(instances); got != 2 {
|
||||
t.Fatalf("expected 2 instance, got %d", got)
|
||||
}
|
||||
// Check size before and after delete
|
||||
size, err := manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
if size != 2 {
|
||||
t.Errorf("got size %d ; wanted size 2 before delete", size)
|
||||
}
|
||||
|
||||
instanceToDelete := OciRef{
|
||||
AvailabilityDomain: "PHX-AD-1",
|
||||
Name: "inst-2ncvn-ociinstancepool",
|
||||
InstanceID: "ocid1.instance.oc1.phx.aaa2",
|
||||
PoolID: "ocid1.instancepool.oc1.phx.aaaaaaaa1",
|
||||
}
|
||||
err = manager.DeleteInstances(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"}, []OciRef{instanceToDelete})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
size, err = manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %+v", err)
|
||||
}
|
||||
if size != 1 {
|
||||
t.Errorf("got size %d ; wanted size 1 *after* delete", size)
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
/*
|
||||
Copyright 2021 Oracle and/or its affiliates.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package oci
|
||||
|
||||
import (
|
||||
apiv1 "k8s.io/api/core/v1"
|
||||
)
|
||||
|
||||
// OciRef contains a reference to some entity in the OCI world.
|
||||
type OciRef struct {
|
||||
AvailabilityDomain string
|
||||
Name string
|
||||
CompartmentID string
|
||||
InstanceID string
|
||||
PoolID string
|
||||
PrivateIPAddress string
|
||||
PublicIPAddress string
|
||||
Shape string
|
||||
}
|
||||
|
||||
func nodeToOciRef(n *apiv1.Node) (OciRef, error) {
|
||||
return OciRef{
|
||||
Name: n.ObjectMeta.Name,
|
||||
AvailabilityDomain: n.Labels[apiv1.LabelZoneFailureDomain],
|
||||
CompartmentID: n.Annotations[ociAnnotationCompartmentID],
|
||||
InstanceID: n.Annotations[ociInstanceIDAnnotation],
|
||||
PoolID: n.Annotations[ociInstancePoolIDAnnotation],
|
||||
PrivateIPAddress: getNodeInternalAddress(n),
|
||||
PublicIPAddress: getNodeExternalAddress(n),
|
||||
Shape: n.Labels[apiv1.LabelInstanceType],
|
||||
}, nil
|
||||
}
|
||||
|
||||
// getNodeInternalAddress returns the first private address of the node and an empty string if none are found.
|
||||
func getNodeInternalAddress(node *apiv1.Node) string {
|
||||
for _, address := range node.Status.Addresses {
|
||||
if address.Type == apiv1.NodeInternalIP {
|
||||
return address.Address
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// getNodeExternalAddress returns the first public address of the node and an empty string if none are found.
|
||||
func getNodeExternalAddress(node *apiv1.Node) string {
|
||||
for _, address := range node.Status.Addresses {
|
||||
if address.Type == apiv1.NodeExternalIP {
|
||||
return address.Address
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
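
// exampleNodeToOciRef is a hypothetical sketch (the OCIDs and IP are
// placeholders) of how a node's annotations and addresses map onto an OciRef
// via nodeToOciRef above.
func exampleNodeToOciRef() OciRef {
	node := &apiv1.Node{}
	node.Name = "inst-example"
	node.Annotations = map[string]string{
		ociInstanceIDAnnotation:     "ocid1.instance.oc1.phx.exampleonly",
		ociInstancePoolIDAnnotation: "ocid1.instancepool.oc1.phx.exampleonly",
	}
	node.Status.Addresses = []apiv1.NodeAddress{
		{Type: apiv1.NodeInternalIP, Address: "10.0.20.59"},
	}
	ref, _ := nodeToOciRef(node)
	// ref.InstanceID, ref.PoolID, and ref.PrivateIPAddress are now populated.
	return ref
}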
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
/*
Copyright 2021 Oracle and/or its affiliates.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package oci

import (
	"context"
	"fmt"

	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
	"k8s.io/klog/v2"
)

// ShapeGetter returns the OCI shape attributes for the pool.
type ShapeGetter interface {
	GetInstancePoolShape(pool *core.InstancePool) (*Shape, error)
}

// ShapeClient is an interface around the GetInstanceConfiguration and ListShapes calls.
type ShapeClient interface {
	GetInstanceConfiguration(context.Context, core.GetInstanceConfigurationRequest) (core.GetInstanceConfigurationResponse, error)
	ListShapes(context.Context, core.ListShapesRequest) (core.ListShapesResponse, error)
}

// ShapeClientImpl is the implementation for fetching shape information.
type ShapeClientImpl struct {
	// computeMgmtClient can fetch instance configurations (flexible shapes).
	computeMgmtClient core.ComputeManagementClient
	// computeClient can fetch shapes directly.
	computeClient core.ComputeClient
}

// GetInstanceConfiguration gets the instance configuration.
func (cc ShapeClientImpl) GetInstanceConfiguration(ctx context.Context, req core.GetInstanceConfigurationRequest) (core.GetInstanceConfigurationResponse, error) {
	return cc.computeMgmtClient.GetInstanceConfiguration(ctx, req)
}

// ListShapes lists the shapes.
func (cc ShapeClientImpl) ListShapes(ctx context.Context, req core.ListShapesRequest) (core.ListShapesResponse, error) {
	return cc.computeClient.ListShapes(ctx, req)
}

// Shape includes the resource attributes of a given shape which should be used
// for constructing node templates.
type Shape struct {
	Name          string
	CPU           float32
	GPU           int
	MemoryInBytes float32
}

// createShapeGetter creates a new OCI shape getter.
func createShapeGetter(shapeClient ShapeClient) ShapeGetter {
	return &shapeGetterImpl{
		shapeClient: shapeClient,
		cache:       map[string]*Shape{},
	}
}

type shapeGetterImpl struct {
	shapeClient ShapeClient
	cache       map[string]*Shape
}

// GetInstancePoolShape resolves the shape attributes of the given instance pool from its
// instance configuration: flexible shapes carry the values on their shape config, while
// static shapes are looked up via ListShapes.
func (osf *shapeGetterImpl) GetInstancePoolShape(ip *core.InstancePool) (*Shape, error) {
	shape := &Shape{}

	klog.V(5).Info("fetching shape configuration details for instance-pool " + *ip.Id)

	instanceConfig, err := osf.shapeClient.GetInstanceConfiguration(context.Background(), core.GetInstanceConfigurationRequest{
		InstanceConfigurationId: ip.InstanceConfigurationId,
	})
	if err != nil {
		return nil, err
	}

	if instanceConfig.InstanceDetails == nil {
		return nil, fmt.Errorf("instance configuration details for instance-pool %s have not been set", *ip.Id)
	}

	if instanceDetails, ok := instanceConfig.InstanceDetails.(core.ComputeInstanceDetails); ok {
		// Flexible shapes carry their resources on the shape config; otherwise, look up the static shape details below.
		if instanceDetails.LaunchDetails != nil && instanceDetails.LaunchDetails.ShapeConfig != nil {
			if instanceDetails.LaunchDetails.Shape != nil {
				shape.Name = *instanceDetails.LaunchDetails.Shape
			}
			if instanceDetails.LaunchDetails.ShapeConfig.Ocpus != nil {
				shape.CPU = *instanceDetails.LaunchDetails.ShapeConfig.Ocpus
				// Default to 1 GB of memory per OCPU unless memory is explicitly set below.
				shape.MemoryInBytes = *instanceDetails.LaunchDetails.ShapeConfig.Ocpus * 1024 * 1024 * 1024
			}
			if instanceDetails.LaunchDetails.ShapeConfig.MemoryInGBs != nil {
				shape.MemoryInBytes = *instanceDetails.LaunchDetails.ShapeConfig.MemoryInGBs * 1024 * 1024 * 1024
			}
		} else {
			// Guard against a missing shape name before dereferencing it in the loop below.
			if instanceDetails.LaunchDetails == nil || instanceDetails.LaunchDetails.Shape == nil {
				return nil, fmt.Errorf("shape information for instance-pool %s not found", *ip.Id)
			}
			allShapes, err := osf.shapeClient.ListShapes(context.Background(), core.ListShapesRequest{
				CompartmentId: instanceConfig.CompartmentId,
			})
			if err != nil {
				return nil, err
			}
			for _, nextShape := range allShapes.Items {
				if *nextShape.Shape == *instanceDetails.LaunchDetails.Shape {
					shape.Name = *nextShape.Shape
					if nextShape.Ocpus != nil {
						shape.CPU = *nextShape.Ocpus
					}
					if nextShape.MemoryInGBs != nil {
						shape.MemoryInBytes = *nextShape.MemoryInGBs * 1024 * 1024 * 1024
					}
					if nextShape.Gpus != nil {
						shape.GPU = *nextShape.Gpus
					}
				}
			}
		}
	} else {
		return nil, fmt.Errorf("(compute) instance configuration for instance-pool %s not found", *ip.Id)
	}

	// Didn't find a match.
	if shape.Name == "" {
		return nil, fmt.Errorf("shape information for instance-pool %s not found", *ip.Id)
	}
	return shape, nil
}
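As an illustration of how a resolved Shape might feed a template node, here is a hypothetical helper (not part of this change); it assumes the apiv1 (k8s.io/api/core/v1) and resource (k8s.io/apimachinery/pkg/api/resource) imports, and treats one OCPU as one Kubernetes CPU, which is an assumption rather than something this code defines.

// shapeToCapacity maps a resolved Shape onto a node capacity list (hypothetical helper).
func shapeToCapacity(shape *Shape) apiv1.ResourceList {
	capacity := apiv1.ResourceList{
		apiv1.ResourceCPU:    *resource.NewQuantity(int64(shape.CPU), resource.DecimalSI),
		apiv1.ResourceMemory: *resource.NewQuantity(int64(shape.MemoryInBytes), resource.DecimalSI),
	}
	if shape.GPU > 0 {
		// "nvidia.com/gpu" is the conventional GPU resource name; adjust if the cluster uses another.
		capacity[apiv1.ResourceName("nvidia.com/gpu")] = *resource.NewQuantity(int64(shape.GPU), resource.DecimalSI)
	}
	return capacity
}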
@@ -0,0 +1,237 @@
package oci

import (
	"context"
	"reflect"
	"strings"
	"testing"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/core"
	kubeletapis "k8s.io/kubelet/pkg/apis"
)

type mockShapeClient struct {
	err                   error
	listShapeResp         core.ListShapesResponse
	getInstanceConfigResp core.GetInstanceConfigurationResponse
}

func (m *mockShapeClient) ListShapes(_ context.Context, _ core.ListShapesRequest) (core.ListShapesResponse, error) {
	return m.listShapeResp, m.err
}

func (m *mockShapeClient) GetInstanceConfiguration(context.Context, core.GetInstanceConfigurationRequest) (core.GetInstanceConfigurationResponse, error) {
	return m.getInstanceConfigResp, m.err
}

var launchDetails = core.InstanceConfigurationLaunchInstanceDetails{
	CompartmentId:     nil,
	DisplayName:       nil,
	CreateVnicDetails: nil,
	Shape:             common.String("VM.Standard.E3.Flex"),
	ShapeConfig: &core.InstanceConfigurationLaunchInstanceShapeConfigDetails{
		Ocpus:       common.Float32(8),
		MemoryInGBs: common.Float32(128),
	},
	SourceDetails: nil,
}

var instanceDetails = core.ComputeInstanceDetails{
	LaunchDetails: &launchDetails,
}

var shapeClient = &mockShapeClient{
	err: nil,
	listShapeResp: core.ListShapesResponse{
		Items: []core.Shape{
			{
				Shape:       common.String("VM.Standard2.8"),
				Ocpus:       common.Float32(8),
				MemoryInGBs: common.Float32(120),
			},
		},
	},
	getInstanceConfigResp: core.GetInstanceConfigurationResponse{
		RawResponse: nil,
		InstanceConfiguration: core.InstanceConfiguration{
			CompartmentId:   nil,
			Id:              common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
			TimeCreated:     nil,
			DefinedTags:     nil,
			DisplayName:     nil,
			FreeformTags:    nil,
			InstanceDetails: instanceDetails,
			DeferredFields:  nil,
		},
		Etag:         nil,
		OpcRequestId: nil,
	},
}

func TestGetShape(t *testing.T) {
	testCases := map[string]struct {
		shape    string
		expected *Shape
	}{
		"flex shape": {
			shape: "VM.Standard.E3.Flex",
			expected: &Shape{
				Name:          "VM.Standard.E3.Flex",
				CPU:           8,
				MemoryInBytes: float32(128) * 1024 * 1024 * 1024,
				GPU:           0,
			},
		},
	}

	for name, tc := range testCases {
		shapeGetter := createShapeGetter(shapeClient)

		t.Run(name, func(t *testing.T) {
			shape, err := shapeGetter.GetInstancePoolShape(&core.InstancePool{Id: &tc.shape, InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1")})
			if err != nil {
				t.Fatal(err)
			}

			if !reflect.DeepEqual(shape, tc.expected) {
				t.Errorf("wanted %+v ; got %+v", tc.expected, shape)
			}

			if !strings.Contains(tc.shape, "Flex") {
				// Flex shapes are not cached, so only check the cache for non-flex shapes.
				cacheShape, ok := shapeGetter.(*shapeGetterImpl).cache[tc.shape]
				if !ok {
					t.Error("shape not found in cache")
				}

				if !reflect.DeepEqual(cacheShape, tc.expected) {
					t.Errorf("wanted %+v ; got %+v", tc.expected, cacheShape)
				}
			}
		})
	}
}

func TestGetInstancePoolTemplateNode(t *testing.T) {
	instancePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
	instancePoolCache.poolCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &core.InstancePool{
		Id:             common.String("ocid1.instancepool.oc1.phx.aaaaaaaa1"),
		CompartmentId:  common.String("ocid1.compartment.oc1..aaaaaaaa1"),
		LifecycleState: core.InstancePoolLifecycleStateRunning,
		PlacementConfigurations: []core.InstancePoolPlacementConfiguration{{
			AvailabilityDomain: common.String("hash:US-ASHBURN-1"),
			PrimarySubnetId:    common.String("ocid1.subnet.oc1.phx.aaaaaaaa1"),
		}},
	}
	instancePoolCache.instanceSummaryCache["ocid1.instancepool.oc1.phx.aaaaaaaa1"] = &[]core.InstanceSummary{{
		Id:                 common.String("ocid1.instance.oc1.phx.aaa1"),
		AvailabilityDomain: common.String("PHX-AD-2"),
		State:              common.String(string(core.InstanceLifecycleStateRunning)),
	},
	}

	cloudConfig := &CloudConfig{}
	cloudConfig.Global.CompartmentID = "ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
	var manager = &InstancePoolManagerImpl{
		cfg:         cloudConfig,
		shapeGetter: createShapeGetter(shapeClient),
		staticInstancePools: map[string]*InstancePoolNodeGroup{
			"ocid1.instancepool.oc1.phx.aaaaaaaa1": {id: "ocid1.instancepool.oc1.phx.aaaaaaaa1"},
		},
		instancePoolCache: instancePoolCache,
	}

	instancePoolNodeGroups := manager.GetInstancePools()

	if got := len(instancePoolNodeGroups); got != 1 {
		t.Fatalf("expected 1 (static) instance pool, got %d", got)
	}
	nodeTemplate, err := manager.GetInstancePoolTemplateNode(*instancePoolNodeGroups[0])
	if err != nil {
		t.Fatalf("received unexpected error refreshing cache; %+v", err)
	}
	labels := nodeTemplate.GetLabels()
	if labels == nil {
		t.Fatalf("expected labels on node object")
	}
	// Double check the shape label.
	if got := labels[apiv1.LabelInstanceTypeStable]; got != "VM.Standard.E3.Flex" {
		t.Fatalf("expected shape label %s to be set to VM.Standard.E3.Flex: %v", apiv1.LabelInstanceTypeStable, nodeTemplate.Labels)
	}

	// Also check the AD label for good measure.
	if got := labels[apiv1.LabelTopologyZone]; got != "US-ASHBURN-1" {
		t.Fatalf("expected AD zone label %s to be set to US-ASHBURN-1: %v", apiv1.LabelTopologyZone, nodeTemplate.Labels)
	}
}

func TestBuildGenericLabels(t *testing.T) {
	shapeName := "VM.Standard2.8"
	np := &core.InstancePool{
		Id:   common.String("ocid1.instancepool.oc1.phx.aaaaaaaah"),
		Size: common.Int(2),
	}

	nodeName := "node1"
	availabilityDomain := "US-ASHBURN-1"

	expected := map[string]string{
		kubeletapis.LabelArch:              cloudprovider.DefaultArch,
		apiv1.LabelArchStable:              cloudprovider.DefaultArch,
		kubeletapis.LabelOS:                cloudprovider.DefaultOS,
		apiv1.LabelOSStable:                cloudprovider.DefaultOS,
		apiv1.LabelZoneRegion:              "phx",
		apiv1.LabelZoneRegionStable:        "phx",
		apiv1.LabelInstanceType:            shapeName,
		apiv1.LabelInstanceTypeStable:      shapeName,
		apiv1.LabelZoneFailureDomain:       availabilityDomain,
		apiv1.LabelZoneFailureDomainStable: availabilityDomain,
		apiv1.LabelHostname:                nodeName,
	}

	launchDetails := core.InstanceConfigurationLaunchInstanceDetails{
		Shape: common.String("VM.Standard2.8"),
	}

	instanceDetails := core.ComputeInstanceDetails{
		LaunchDetails: &launchDetails,
	}

	// For list shapes (static shape lookup path).
	mockShapeClient := &mockShapeClient{
		err: nil,
		listShapeResp: core.ListShapesResponse{
			Items: []core.Shape{
				{Shape: common.String("VM.Standard2.4"), Ocpus: common.Float32(4), MemoryInGBs: common.Float32(60)},
				{Shape: common.String("VM.Standard2.8"), Ocpus: common.Float32(8), MemoryInGBs: common.Float32(120)},
			},
		},
		getInstanceConfigResp: core.GetInstanceConfigurationResponse{
			InstanceConfiguration: core.InstanceConfiguration{
				Id:              common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa1"),
				InstanceDetails: instanceDetails,
			},
		},
	}
	shapeGetter := createShapeGetter(mockShapeClient)

	manager := InstancePoolManagerImpl{
		shapeGetter: shapeGetter,
	}

	shape, err := manager.shapeGetter.GetInstancePoolShape(np)
	if err != nil {
		t.Fatalf("unexpected error: %+v", err)
	}

	output := buildGenericLabelsForInstancePool(np, nodeName, shape.Name, availabilityDomain)
	if !reflect.DeepEqual(output, expected) {
		t.Fatalf("got %+v\nwanted %+v", output, expected)
	}
}
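A sketch (not part of this change, test name and OCIDs hypothetical) of how the static-shape fallback path could also be covered: the instance configuration carries no ShapeConfig, so the getter falls back to ListShapes.

func TestGetShapeStaticFallback(t *testing.T) {
	staticClient := &mockShapeClient{
		listShapeResp: core.ListShapesResponse{
			Items: []core.Shape{{Shape: common.String("VM.Standard2.8"), Ocpus: common.Float32(8), MemoryInGBs: common.Float32(120)}},
		},
		getInstanceConfigResp: core.GetInstanceConfigurationResponse{
			InstanceConfiguration: core.InstanceConfiguration{
				Id: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa2"),
				InstanceDetails: core.ComputeInstanceDetails{
					// No ShapeConfig here, so the shape is resolved via ListShapes.
					LaunchDetails: &core.InstanceConfigurationLaunchInstanceDetails{
						Shape: common.String("VM.Standard2.8"),
					},
				},
			},
		},
	}

	getter := createShapeGetter(staticClient)
	shape, err := getter.GetInstancePoolShape(&core.InstancePool{
		Id:                      common.String("ocid1.instancepool.oc1.phx.aaaaaaaa2"),
		InstanceConfigurationId: common.String("ocid1.instanceconfiguration.oc1.phx.aaaaaaaa2"),
	})
	if err != nil {
		t.Fatal(err)
	}
	if shape.CPU != 8 || shape.MemoryInBytes != float32(120)*1024*1024*1024 {
		t.Errorf("unexpected shape: %+v", shape)
	}
}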
@@ -0,0 +1,211 @@
/*
Copyright 2021 Oracle and/or its affiliates.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package oci

import (
	"context"
	"fmt"
	"math"
	"math/rand"
	"net"
	"net/http"
	"time"

	"github.com/pkg/errors"
	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/oci-go-sdk/v43/common"
	"k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/apis/scheduling"
)

// IsRetryable returns true if the given error is retryable.
func IsRetryable(err error) bool {
	if err == nil {
		return false
	}

	err = errors.Cause(err)

	// Retry on network timeout errors.
	if err, ok := err.(net.Error); ok && err.Timeout() {
		return true
	}

	// Handle OCI service errors that are considered retryable.
	serviceErr, ok := common.IsServiceError(err)
	if !ok {
		return false
	}

	switch serviceErr.GetHTTPStatusCode() {
	case http.StatusTooManyRequests, http.StatusGatewayTimeout,
		http.StatusInternalServerError, http.StatusBadGateway:
		return true
	default:
		return false
	}
}

func newRetryPolicy() *common.RetryPolicy {
	return NewRetryPolicyWithMaxAttempts(uint(8))
}

// NewRetryPolicyWithMaxAttempts returns a RetryPolicy with the specified maximum number of retry attempts.
func NewRetryPolicyWithMaxAttempts(retryAttempts uint) *common.RetryPolicy {
	isRetryableOperation := func(r common.OCIOperationResponse) bool {
		return IsRetryable(r.Error)
	}

	nextDuration := func(r common.OCIOperationResponse) time.Duration {
		// Wait longer after each failed attempt: the backoff doubles every retry,
		// i.e. 1s, 2s, 4s, 8s, 16s, 32s, 64s, and so on.
		return time.Duration(math.Pow(float64(2), float64(r.AttemptNumber-1))) * time.Second
	}

	policy := common.NewRetryPolicy(
		retryAttempts, isRetryableOperation, nextDuration,
	)
	return &policy
}

// Note: kube-proxy is missing resource requests, and flannel is missing a priority.

// buildCSINodePod returns a static csi-oci-node pod definition.
func buildCSINodePod() *apiv1.Pod {
	priority := scheduling.SystemCriticalPriority
	return &apiv1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("csi-oci-node-%d", rand.Int63()),
			Namespace: "kube-system",
			Labels: map[string]string{
				"app": "csi-oci-node",
			},
		},
		Spec: apiv1.PodSpec{
			Containers: []apiv1.Container{
				{
					Image: "iad.ocir.io/oracle/cloud-provider-oci:latest",
				},
			},
			Priority: &priority,
		},
		Status: apiv1.PodStatus{
			Phase: apiv1.PodRunning,
			Conditions: []apiv1.PodCondition{
				{
					Type:   apiv1.PodReady,
					Status: apiv1.ConditionTrue,
				},
			},
		},
	}
}

// annotateNode sets the given annotation on the node if it is not already set to value.
func annotateNode(kubeClient kubernetes.Interface, nodeName string, key string, value string) error {
	if nodeName == "" {
		return errors.New("node name is required")
	}
	if kubeClient == nil {
		return errors.New("kubeconfig is required")
	}

	node, err := kubeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
	if err != nil {
		klog.Errorf("failed to get node %s %+v", nodeName, err)
		return err
	}

	annotations := node.GetAnnotations()
	if annotations == nil {
		annotations = map[string]string{}
		// Attach the new map to the node so the write below does not hit a nil map.
		node.Annotations = annotations
	}

	if v := annotations[key]; v != value {
		node.Annotations[key] = value
		_, err := kubeClient.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{})
		if err != nil {
			klog.Errorf("failed to annotate node %s %+v", nodeName, err)
			return err
		}
		klog.V(3).Infof("updated annotation %s=%s on node: %s", key, value, nodeName)
	}
	return nil
}

// labelNode sets the given label on the node if it is not already set to value.
func labelNode(kubeClient kubernetes.Interface, nodeName string, key string, value string) error {
	if nodeName == "" {
		return errors.New("node name is required")
	}
	if kubeClient == nil {
		return errors.New("kubeconfig is required")
	}

	node, err := kubeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
	if err != nil {
		klog.Errorf("failed to get node %s %+v", nodeName, err)
		return err
	}

	labels := node.GetLabels()
	if labels == nil {
		labels = map[string]string{}
		// Attach the new map to the node so the write below does not hit a nil map.
		node.Labels = labels
	}

	if v := labels[key]; v != value {
		node.Labels[key] = value
		_, err := kubeClient.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{})
		if err != nil {
			klog.Errorf("failed to label node %s %+v", nodeName, err)
			return err
		}
		klog.V(3).Infof("updated label %s=%s on node: %s", key, value, nodeName)
	}
	return nil
}

// setNodeProviderID sets the provider ID on the node spec if it differs from value.
func setNodeProviderID(kubeClient kubernetes.Interface, nodeName string, value string) error {
	if nodeName == "" {
		return errors.New("node name is required")
	}
	if kubeClient == nil {
		return errors.New("kubeconfig is required")
	}

	node, err := kubeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
	if err != nil {
		klog.Errorf("failed to get node %s %+v", nodeName, err)
		return err
	}

	if node.Spec.ProviderID != value {
		node.Spec.ProviderID = value
		_, err := kubeClient.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{})
		if err != nil {
			klog.Errorf("failed to update node's provider ID %s %+v", nodeName, err)
			return err
		}
		klog.V(3).Infof("updated provider ID on node: %s", nodeName)
	}
	return nil
}
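For illustration, a hedged sketch (not part of this change) of attaching one of these retry policies to an OCI SDK call; it assumes the core package import, that the request's RequestMetadata field accepts a *common.RetryPolicy (as in the vendored SDK), and the helper name is hypothetical.

// listShapesWithRetry issues ListShapes with an exponential-backoff retry policy attached.
func listShapesWithRetry(computeClient core.ComputeClient, compartmentID string) (core.ListShapesResponse, error) {
	return computeClient.ListShapes(context.Background(), core.ListShapesRequest{
		CompartmentId:   common.String(compartmentID),
		RequestMetadata: common.RequestMetadata{RetryPolicy: NewRetryPolicyWithMaxAttempts(5)},
	})
}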
@@ -0,0 +1,12 @@
package oci

import (
	"testing"
)

func TestSetProviderID(t *testing.T) {
	err := setNodeProviderID(nil, "", "")
	if err == nil {
		t.Fatal("expected error")
	}
}
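A sketch (not part of this change) of how annotateNode could be exercised against a fake clientset; it assumes the context, apiv1, metav1, and k8s.io/client-go/kubernetes/fake imports, and the instance OCID used as the annotation value is made up.

func TestAnnotateNodeWithFakeClient(t *testing.T) {
	node := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", Annotations: map[string]string{}}}
	kubeClient := fake.NewSimpleClientset(node)

	if err := annotateNode(kubeClient, "node1", ociInstanceIDAnnotation, "ocid1.instance.oc1.phx.example"); err != nil {
		t.Fatal(err)
	}

	updated, err := kubeClient.CoreV1().Nodes().Get(context.Background(), "node1", metav1.GetOptions{})
	if err != nil {
		t.Fatal(err)
	}
	if got := updated.Annotations[ociInstanceIDAnnotation]; got != "ocid1.instance.oc1.phx.example" {
		t.Errorf("annotation not set, got %q", got)
	}
}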
@@ -157,7 +157,8 @@ skipped_dirs = ['Godeps', 'third_party', '_gopath', '_output', '.git', 'cluster/
"cluster-autoscaler/cloudprovider/digitalocean/godo",
|
||||
"cluster-autoscaler/cloudprovider/magnum/gophercloud",
|
||||
"cluster-autoscaler/cloudprovider/ionoscloud/ionos-cloud-sdk-go",
|
||||
"cluster-autoscaler/cloudprovider/hetzner/hcloud-go"]
|
||||
"cluster-autoscaler/cloudprovider/hetzner/hcloud-go",
|
||||
"cluster-autoscaler/cloudprovider/oci"]
|
||||
|
||||
# list all the files contain 'DO NOT EDIT', but are not generated
|
||||
skipped_ungenerated_files = ['hack/build-ui.sh', 'hack/lib/swagger.sh',
|
||||
|
|
|
|||
|
|
@@ -23,4 +23,4 @@ DIR=$(dirname $0)
go install ${DIR}/../../../github.com/client9/misspell/cmd/misspell

# Spell checking
git ls-files --full-name | grep -v -e vendor | grep -v cluster-autoscaler/cloudprovider/magnum/gophercloud| grep -v cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud-sdk-go-v3 | grep -v cluster-autoscaler/cloudprovider/digitalocean/godo | grep -v cluster-autoscaler/cloudprovider/hetzner/hcloud-go | grep -v cluster-autoscaler/cloudprovider/bizflycloud/gobizfly | grep -E -v 'cluster-autoscaler/cloudprovider/brightbox/(go-cache|gobrightbox|k8ssdk|linkheader)' | xargs misspell -error -o stderr
git ls-files --full-name | grep -v -e vendor | grep -v cluster-autoscaler/cloudprovider/magnum/gophercloud| grep -v cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud-sdk-go-v3 | grep -v cluster-autoscaler/cloudprovider/digitalocean/godo | grep -v cluster-autoscaler/cloudprovider/hetzner/hcloud-go | grep -v cluster-autoscaler/cloudprovider/bizflycloud/gobizfly | grep -v cluster-autoscaler/cloudprovider/oci/oci-go-sdk | grep -E -v 'cluster-autoscaler/cloudprovider/brightbox/(go-cache|gobrightbox|k8ssdk|linkheader)' | xargs misspell -error -o stderr