{#
Copyright 2016 Google Inc. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
#}

{# Every resource name below is derived from the deployment name. #}
{% set NAME_PREFIX = env['deployment'] %}
{% set CLUSTER_NAME = NAME_PREFIX %}

{#
Node pool resource names embed the 'pool-version' property, so a different
pool-version value yields differently named node pool resources.
#}
{% set CPU_POOL = NAME_PREFIX + '-cpu-pool-' + properties['pool-version'] %}
{% set GPU_POOL = NAME_PREFIX + '-gpu-pool-' + properties['pool-version'] %}
{% set LARGE_POOL = NAME_PREFIX + '-large-pool-' + properties['pool-version'] %}

{#
Type names are the names to give to deployment manager type providers
that will be created to represent Kubernetes objects.
There is a type corresponding to each API endpoint.
NOTE(review): no type-provider names are defined in the visible part of this
template; confirm whether this comment is still needed.
#}

{# OAuth scopes passed to the node config of every node pool. #}
{% set VM_OAUTH_SCOPES = [
  'https://www.googleapis.com/auth/logging.write',
  'https://www.googleapis.com/auth/monitoring',
  'https://www.googleapis.com/auth/devstorage.read_only'
] %}

{#
Names for service accounts.
  -admin is to be used for admin tasks.
  -user is to be used by users for actual jobs.
  -vm is used for the VM service account attached to the GKE VMs.
#}
{% set KF_ADMIN_NAME = NAME_PREFIX + '-admin' %}
{% set KF_USER_NAME = NAME_PREFIX + '-user' %}
{% set KF_VM_SA_NAME = NAME_PREFIX + '-vm' %}

resources:
# Service account for Kubeflow admin actions.
- name: "{{ KF_ADMIN_NAME }}"
  type: iam.v1.serviceAccount
  properties:
    accountId: "{{ KF_ADMIN_NAME }}"
    displayName: 'Service Account used for Kubeflow admin actions.'

# Service account for Kubeflow user actions.
- name: "{{ KF_USER_NAME }}"
  type: iam.v1.serviceAccount
  properties:
    accountId: "{{ KF_USER_NAME }}"
    displayName: 'Service Account used for Kubeflow user actions.'
# Service account that the node pools below reference as their VM
# service account.
- name: "{{ KF_VM_SA_NAME }}"
  type: iam.v1.serviceAccount
  properties:
    accountId: "{{ KF_VM_SA_NAME }}"
    displayName: GCP Service Account to use as VM Service Account for Kubeflow Cluster VMs

# The GKE cluster together with its default CPU node pool.
# The resource type is selected by the gkeApiVersion property.
- name: "{{ CLUSTER_NAME }}"
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters
  {% else %}
  type: container.v1.cluster
  {% endif %}
  properties:
    parent: "projects/{{ env['project'] }}/locations/{{ properties['zone'] }}"
    zone: "{{ properties['zone'] }}"
    cluster:
      name: "{{ CLUSTER_NAME }}"
      # Quoted so that a version such as 1.10 stays the string "1.10"
      # instead of being parsed as the float 1.1.
      initialClusterVersion: "{{ properties['cluster-version'] }}"
      # We need 1.10.2 to support Stackdriver GKE.
      # loggingService: none
      # monitoringService: none
      {% if properties['gkeApiVersion'] == 'v1beta1' %}
      # Only emitted for the v1beta1 API.
      podSecurityPolicyConfig:
        enabled: {{ properties['securityConfig']['podSecurityPolicy'] }}
      {% endif %}
      {% if properties['securityConfig']['privatecluster'] %}
      # Private cluster settings; only emitted when
      # securityConfig.privatecluster is truthy.
      ipAllocationPolicy:
        createSubnetwork: true
        useIpAliases: true
      masterIpv4CidrBlock: "{{ properties['securityConfig']['masterIpv4CidrBlock'] }}"
      privateCluster: true
      masterAuthorizedNetworksConfig:
        enabled: {{ properties['securityConfig']['masterAuthorizedNetworksConfigEnabled'] }}
        {% if properties['securityConfig']['masterAuthorizedNetworksConfigEnabled'] %}
        cidrBlocks: {{ properties['securityConfig']['masterAuthorizedNetworksConfigCidr'] }}
        {% endif %}
      {% endif %}
      nodePools:
      - name: default-pool
        initialNodeCount: {{ properties['cpu-pool-initialNodeCount'] }}
        autoscaling:
          enabled: {{ properties['cpu-pool-enable-autoscaling'] }}
          {% if properties['cpu-pool-enable-autoscaling'] %}
          # Bounds are only emitted when autoscaling is enabled.
          minNodeCount: {{ properties['cpu-pool-min-nodes'] }}
          maxNodeCount: {{ properties['cpu-pool-max-nodes'] }}
          {% endif %}
        config:
          {% if properties['securityConfig']['secureNodeMetadata'] %}
          workloadMetadataConfig:
            nodeMetadata: SECURE
          {% endif %}
          machineType: n1-standard-8
          serviceAccount: "{{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com"
          oauthScopes: {{ VM_OAUTH_SCOPES }}
          # Set min cpu platform to ensure AVX2 is supported.
          minCpuPlatform: 'Intel Haswell'
  metadata:
    dependsOn:
    # The nodes run as the VM service account, so it must exist first.
    - "{{ KF_VM_SA_NAME }}"

# We manage the node pools as separate resources.
# We do this so that if we want to make changes we can delete the existing
# resource and then recreate it.
# Updating doesn't work so well because we are limited in what changes GKE's
# update method supports.
- name: "{{ GPU_POOL }}"
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
  {% else %}
  type: container.v1.nodePool
  {% endif %}
  properties:
    parent: "projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}"
    # Falls back to the deployment's project when securityConfig.project is
    # unset or empty, so that by default this matches the project used in
    # parent above. An explicitly configured value is still honoured.
    project: "{{ properties['securityConfig']['project'] | default(env['project'], true) }}"
    zone: "{{ properties['zone'] }}"
    clusterId: "{{ CLUSTER_NAME }}"
    nodePool:
      name: gpu-pool
      initialNodeCount: {{ properties['gpu-pool-initialNodeCount'] }}
      autoscaling:
        enabled: {{ properties['gpu-pool-enable-autoscaling'] }}
        {% if properties['gpu-pool-enable-autoscaling'] %}
        # Bounds are only emitted when autoscaling is enabled.
        minNodeCount: {{ properties['gpu-pool-min-nodes'] }}
        maxNodeCount: {{ properties['gpu-pool-max-nodes'] }}
        {% endif %}
      config:
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
        machineType: n1-standard-8
        serviceAccount: "{{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com"
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        # Set min cpu platform to ensure AVX2 is supported.
        minCpuPlatform: 'Intel Haswell'
        accelerators:
        - acceleratorCount: 1
          acceleratorType: nvidia-tesla-k80
  metadata:
    dependsOn:
    # We can only create 1 node pool at a time.
    - "{{ CLUSTER_NAME }}"

# Add a high memory pool because creating the search index requires a lot of memory.
- name: "{{ LARGE_POOL }}"
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
  {% else %}
  type: container.v1.nodePool
  {% endif %}
  properties:
    parent: "projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}"
    # Falls back to the deployment's project when securityConfig.project is
    # unset or empty, so that by default this matches the project used in
    # parent above. An explicitly configured value is still honoured.
    project: "{{ properties['securityConfig']['project'] | default(env['project'], true) }}"
    zone: "{{ properties['zone'] }}"
    clusterId: "{{ CLUSTER_NAME }}"
    nodePool:
      name: large-pool
      # NOTE(review): the pool starts with 0 nodes while the autoscaling
      # minimum is 1; confirm this combination is intended.
      initialNodeCount: 0
      autoscaling:
        enabled: true
        minNodeCount: 1
        maxNodeCount: 10
      config:
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
        machineType: n1-standard-32
        serviceAccount: "{{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com"
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        # Set min cpu platform to ensure AVX2 is supported.
        minCpuPlatform: 'Intel Haswell'
  metadata:
    dependsOn:
    # We can only create 1 node pool at a time.
    - "{{ GPU_POOL }}"

{# Project defaults to the project of the deployment. #}
# Global static IP address; its name comes from the ipName property.
- name: "{{ properties['ipName'] }}"
  type: compute.v1.globalAddress
  properties:
    description: "Static IP for Kubeflow ingress."