{#
examples/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja

196 lines
7.4 KiB
Django/Jinja
#}

{#
Copyright 2016 Google Inc. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
#}
{# Names shared by the resources below. Every name is derived from the
deployment name (env['deployment']).
Strings are joined with `~` rather than `+`: `~` converts each operand to a
string first, so a numeric `pool-version` property (for example
`pool-version: 2`) no longer raises a type error while rendering.
#}
{% set NAME_PREFIX = env['deployment'] %}
{% set CLUSTER_NAME = NAME_PREFIX %}
{# NOTE(review): CPU_POOL is not referenced anywhere else in this template;
the CPU nodes are declared inline as the cluster's `default-pool`. Confirm
whether it can be dropped. #}
{% set CPU_POOL = NAME_PREFIX ~ '-cpu-pool-' ~ properties['pool-version'] %}
{% set GPU_POOL = NAME_PREFIX ~ '-gpu-pool-' ~ properties['pool-version'] %}
{% set LARGE_POOL = NAME_PREFIX ~ '-large-pool-' ~ properties['pool-version'] %}
{# Type names are the names to give to deployment manager type providers
that will be created to represent Kubernetes objects.
There is a type corresponding to each API endpoint.
NOTE(review): no type names are defined in this template; this comment looks
like a leftover - confirm before removing.
#}
{# OAuth scopes rendered into the `oauthScopes` field of every node config. #}
{% set VM_OAUTH_SCOPES = [
  'https://www.googleapis.com/auth/logging.write',
  'https://www.googleapis.com/auth/monitoring',
  'https://www.googleapis.com/auth/devstorage.read_only',
] %}
{# Names for service accounts.
-admin is to be used for admin tasks
-user is to be used by users for actual jobs.
-vm is used for the VM service account attached to the GKE VMs.
#}
{% set KF_ADMIN_NAME = NAME_PREFIX ~ '-admin' %}
{% set KF_USER_NAME = NAME_PREFIX ~ '-user' %}
{% set KF_VM_SA_NAME = NAME_PREFIX ~ '-vm' %}
resources:
# IAM service account for Kubeflow admin actions; the account id is the
# deployment name with an "-admin" suffix.
- name: {{ KF_ADMIN_NAME }}
  type: iam.v1.serviceAccount
  properties:
    accountId: {{ KF_ADMIN_NAME }}
    displayName: 'Service Account used for Kubeflow admin actions.'
# IAM service account for Kubeflow user actions; the account id is the
# deployment name with a "-user" suffix.
- name: {{ KF_USER_NAME }}
  type: iam.v1.serviceAccount
  properties:
    accountId: {{ KF_USER_NAME }}
    displayName: 'Service Account used for Kubeflow user actions.'
# IAM service account that the node pools in this template reference through
# their `serviceAccount` field; the account id is the deployment name with a
# "-vm" suffix.
- name: {{ KF_VM_SA_NAME }}
  type: iam.v1.serviceAccount
  properties:
    accountId: {{ KF_VM_SA_NAME }}
    displayName: 'GCP Service Account to use as VM Service Account for Kubeflow Cluster VMs'
# GKE cluster with its built-in CPU node pool ("default-pool").
# The v1beta1 type provider is used when `gkeApiVersion` is 'v1beta1';
# any other value falls back to the v1 cluster type.
- name: {{ CLUSTER_NAME }}
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters
  {% else %}
  type: container.v1.cluster
  {% endif %}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}
    zone: {{ properties['zone'] }}
    cluster:
      name: {{ CLUSTER_NAME }}
      # Quoted on purpose: an unquoted version such as 1.10 would be parsed
      # by YAML as the float 1.1 instead of the string "1.10".
      initialClusterVersion: "{{ properties['cluster-version'] }}"
      # We need 1.10.2 to support Stackdriver GKE.
      # loggingService: none
      # monitoringService: none
      # Pod security policy is only emitted for the v1beta1 type.
      {% if properties['gkeApiVersion'] == 'v1beta1' %}
      podSecurityPolicyConfig:
        enabled: {{ properties['securityConfig']['podSecurityPolicy'] }}
      {% endif %}
      # Private-cluster settings, emitted only when
      # securityConfig.privatecluster is truthy.
      {% if properties['securityConfig']['privatecluster'] %}
      ipAllocationPolicy:
        createSubnetwork: true
        useIpAliases: true
      masterIpv4CidrBlock: {{ properties['securityConfig']['masterIpv4CidrBlock'] }}
      privateCluster: true
      masterAuthorizedNetworksConfig:
        enabled: {{ properties['securityConfig']['masterAuthorizedNetworksConfigEnabled'] }}
        {% if properties['securityConfig']['masterAuthorizedNetworksConfigEnabled'] %}
        # The property value is substituted as-is.
        # NOTE(review): presumably a list of CIDR block entries - confirm
        # against the deployment's config file.
        cidrBlocks:
          {{ properties['securityConfig']['masterAuthorizedNetworksConfigCidr'] }}
        {% endif %}
      {% endif %}
      nodePools:
      - name: default-pool
        initialNodeCount: {{ properties['cpu-pool-initialNodeCount'] }}
        autoscaling:
          enabled: {{ properties['cpu-pool-enable-autoscaling'] }}
          # Node-count bounds are only emitted when autoscaling is enabled.
          {% if properties['cpu-pool-enable-autoscaling'] %}
          minNodeCount: {{ properties['cpu-pool-min-nodes'] }}
          maxNodeCount: {{ properties['cpu-pool-max-nodes'] }}
          {% endif %}
        config:
          {% if properties['securityConfig']['secureNodeMetadata'] %}
          workloadMetadataConfig:
            nodeMetadata: SECURE
          {% endif %}
          machineType: n1-standard-8
          serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
          oauthScopes: {{ VM_OAUTH_SCOPES }}
          # Set min cpu platform to ensure AVX2 is supported.
          minCpuPlatform: 'Intel Haswell'
  metadata:
    # The VM service account is referenced by the node config above, so it is
    # declared as a dependency of the cluster.
    dependsOn:
    - {{ KF_VM_SA_NAME }}
# Node pools are declared as resources of their own rather than inside the
# cluster. That way a pool can be changed by deleting and recreating just that
# resource; in-place updates are limited to what GKE's update method supports.
#
# GPU node pool.
- name: {{ GPU_POOL }}
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
  {% else %}
  type: container.v1.nodePool
  {% endif %}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
    # NOTE(review): `project` is read from securityConfig while `parent` uses
    # env['project'] - confirm the two are meant to differ.
    project: {{ properties['securityConfig']['project'] }}
    zone: {{ properties['zone'] }}
    clusterId: {{ CLUSTER_NAME }}
    nodePool:
      name: gpu-pool
      initialNodeCount: {{ properties['gpu-pool-initialNodeCount'] }}
      autoscaling:
        enabled: {{ properties['gpu-pool-enable-autoscaling'] }}
        # Bounds are emitted only when autoscaling is switched on.
        {% if properties['gpu-pool-enable-autoscaling'] %}
        minNodeCount: {{ properties['gpu-pool-min-nodes'] }}
        maxNodeCount: {{ properties['gpu-pool-max-nodes'] }}
        {% endif %}
      config:
        machineType: n1-standard-8
        # AVX2 support requires at least this CPU platform.
        minCpuPlatform: 'Intel Haswell'
        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        accelerators:
        - acceleratorType: nvidia-tesla-k80
          acceleratorCount: 1
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
  metadata:
    dependsOn:
    # Only one node pool can be created at a time, so wait for the cluster.
    - {{ CLUSTER_NAME }}
# High-memory node pool: building the search index needs a lot of memory.
- name: {{ LARGE_POOL }}
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
  {% else %}
  type: container.v1.nodePool
  {% endif %}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
    project: {{ properties['securityConfig']['project'] }}
    zone: {{ properties['zone'] }}
    clusterId: {{ CLUSTER_NAME }}
    nodePool:
      name: large-pool
      # The pool is created with zero nodes; autoscaling is always enabled
      # with fixed bounds.
      initialNodeCount: 0
      autoscaling:
        enabled: true
        minNodeCount: 1
        maxNodeCount: 10
      config:
        machineType: n1-standard-32
        # AVX2 support requires at least this CPU platform.
        minCpuPlatform: 'Intel Haswell'
        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
  metadata:
    dependsOn:
    # Only one node pool can be created at a time, so this pool waits for the
    # GPU pool.
    - {{ GPU_POOL }}
{# No project is given for this resource; it defaults to the project of the deployment. #}
# Global static address named by the `ipName` property.
- name: {{ properties['ipName'] }}
  type: compute.v1.globalAddress
  properties:
    description: 'Static IP for Kubeflow ingress.'