examples/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja

{#
Copyright 2016 Google Inc. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
#}


{# Every resource name in this template is derived from the deployment name. #}
{% set NAME_PREFIX = env['deployment'] %}
{% set CLUSTER_NAME = NAME_PREFIX %}
{# Node pool resource names carry the configured pool-version.
   The `~` operator converts its operands to strings before joining them, so a
   numeric value such as `pool-version: 1` (which YAML loads as an int) still
   concatenates cleanly, whereas `+` raises a type error when mixing str and int.
   NOTE(review): CPU_POOL is not referenced in the visible part of this template. #}
{% set CPU_POOL = NAME_PREFIX ~ '-cpu-pool-' ~ properties['pool-version'] %}
{% set GPU_POOL = NAME_PREFIX ~ '-gpu-pool-' ~ properties['pool-version'] %}
{% set LARGE_POOL = NAME_PREFIX ~ '-large-pool-' ~ properties['pool-version'] %}

{# Type names are the names to give to Deployment Manager type providers
   that will be created to represent Kubernetes objects.
   There is a type corresponding to each API endpoint.
#}

{# OAuth scopes passed as `oauthScopes` in every node config below.
   Per the scope URLs: logging (write), monitoring, and devstorage (read-only). #}
{% set VM_OAUTH_SCOPES = [
  'https://www.googleapis.com/auth/logging.write',
  'https://www.googleapis.com/auth/monitoring',
  'https://www.googleapis.com/auth/devstorage.read_only'
] %}

{# Service account names; each one is the deployment name plus a role suffix. #}
{# -admin: to be used for admin tasks. #}
{% set KF_ADMIN_NAME = NAME_PREFIX ~ '-admin' %}
{# -user: to be used by users for actual jobs. #}
{% set KF_USER_NAME = NAME_PREFIX ~ '-user' %}
{# -vm: the VM service account attached to the GKE VMs. #}
{% set KF_VM_SA_NAME = NAME_PREFIX ~ '-vm' %}

resources:
# IAM service account for Kubeflow admin actions; the resource name and the
# account id are both the deployment name with an -admin suffix.
- name: {{ KF_ADMIN_NAME }}
  type: iam.v1.serviceAccount
  properties:
    accountId: {{ KF_ADMIN_NAME }}
    displayName: 'Service Account used for Kubeflow admin actions.'

# IAM service account for Kubeflow user actions; the resource name and the
# account id are both the deployment name with a -user suffix.
- name: {{ KF_USER_NAME }}
  type: iam.v1.serviceAccount
  properties:
    accountId: {{ KF_USER_NAME }}
    displayName: 'Service Account used for Kubeflow user actions.'

# IAM service account that the node configs below reference as their
# serviceAccount; the cluster resource depends on it.
- name: {{ KF_VM_SA_NAME }}
  type: iam.v1.serviceAccount
  properties:
    accountId: {{ KF_VM_SA_NAME }}
    displayName: 'GCP Service Account to use as VM Service Account for Kubeflow Cluster VMs'

# GKE cluster for this deployment.
# properties.gkeApiVersion selects the resource type: 'v1beta1' uses the
# gcp-types beta cluster type, any other value uses container.v1.cluster.
- name: {{ CLUSTER_NAME }}
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters
  {% else %}
  type: container.v1.cluster
  {% endif %}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}
    zone: {{ properties['zone'] }}
    cluster:
      name: {{ CLUSTER_NAME }}
      # Quoted so that a version such as 1.10 stays the string "1.10" instead
      # of being loaded by YAML as the float 1.1.
      initialClusterVersion: "{{ properties['cluster-version'] }}"
      # We need 1.10.2 to support Stackdriver GKE.
      # loggingService: none
      # monitoringService: none
      # The pod security policy setting is only emitted for the v1beta1 type.
      {% if properties['gkeApiVersion'] == 'v1beta1' %}
      podSecurityPolicyConfig:
        enabled: {{ properties['securityConfig']['podSecurityPolicy'] }}
      {% endif %}
      # Private-cluster settings, emitted only when securityConfig.privatecluster is set.
      {% if properties['securityConfig']['privatecluster'] %}
      ipAllocationPolicy:
        createSubnetwork: true
        useIpAliases: true
      masterIpv4CidrBlock: {{ properties['securityConfig']['masterIpv4CidrBlock'] }}
      privateCluster: true
      masterAuthorizedNetworksConfig:
        enabled: {{ properties['securityConfig']['masterAuthorizedNetworksConfigEnabled'] }}
        {% if properties['securityConfig']['masterAuthorizedNetworksConfigEnabled'] %}
        cidrBlocks:
          {{ properties['securityConfig']['masterAuthorizedNetworksConfigCidr'] }}
        {% endif %}
      {% endif %}
      nodePools:
      - name: default-pool
        initialNodeCount: {{ properties['cpu-pool-initialNodeCount'] }}
        autoscaling:
          enabled: {{ properties['cpu-pool-enable-autoscaling'] }}
          # Node count bounds are only emitted when autoscaling is enabled.
          {% if properties['cpu-pool-enable-autoscaling'] %}
          minNodeCount: {{ properties['cpu-pool-min-nodes'] }}
          maxNodeCount: {{ properties['cpu-pool-max-nodes'] }}
          {% endif %}
        config:
          {% if properties['securityConfig']['secureNodeMetadata'] %}
          workloadMetadataConfig:
            nodeMetadata: SECURE
          {% endif %}
          machineType: n1-standard-8
          serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
          oauthScopes: {{ VM_OAUTH_SCOPES }}
          # Set min cpu platform to ensure AVX2 is supported.
          minCpuPlatform: 'Intel Haswell'
  metadata:
    dependsOn:
    # Create the VM service account first; the node config above references it.
    - {{ KF_VM_SA_NAME }}

# We manage the node pools as separate resources.
# We do this so that if we want to make changes we can delete the existing resource and then recreate it.
# Updating doesn't work so well because we are limited in what changes GKE's update method supports.

# GPU node pool (named gpu-pool inside the cluster), declared as its own resource.
# The resource type follows properties.gkeApiVersion, as for the cluster.
- name: {{ GPU_POOL }}
  type: {{ 'gcp-types/container-v1beta1:projects.locations.clusters.nodePools'
           if properties['gkeApiVersion'] == 'v1beta1'
           else 'container.v1.nodePool' }}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
    # NOTE(review): project is read from securityConfig.project while parent
    # uses the deployment project - confirm this is intended.
    project: {{ properties['securityConfig']['project'] }}
    zone: {{ properties['zone'] }}
    clusterId: {{ CLUSTER_NAME }}
    nodePool:
      name: gpu-pool
      initialNodeCount: {{ properties['gpu-pool-initialNodeCount'] }}
      autoscaling:
        enabled: {{ properties['gpu-pool-enable-autoscaling'] }}
        # Node count bounds are only emitted when autoscaling is enabled.
        {% if properties['gpu-pool-enable-autoscaling'] %}
        minNodeCount: {{ properties['gpu-pool-min-nodes'] }}
        maxNodeCount: {{ properties['gpu-pool-max-nodes'] }}
        {% endif %}
      config:
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
        machineType: n1-standard-8
        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        # Pin a minimum CPU platform so that AVX2 is supported.
        minCpuPlatform: 'Intel Haswell'
        # One K80 accelerator per node.
        accelerators:
        - acceleratorCount: 1
          acceleratorType: nvidia-tesla-k80
  metadata:
    dependsOn:
    # Only one node pool can be created at a time, so wait for the cluster.
    - {{ CLUSTER_NAME }}

# High memory node pool (named large-pool inside the cluster); creating the
# search index requires a lot of memory.
# The resource type follows properties.gkeApiVersion, as for the cluster.
- name: {{ LARGE_POOL }}
  type: {{ 'gcp-types/container-v1beta1:projects.locations.clusters.nodePools'
           if properties['gkeApiVersion'] == 'v1beta1'
           else 'container.v1.nodePool' }}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
    # NOTE(review): project is read from securityConfig.project while parent
    # uses the deployment project - confirm this is intended.
    project: {{ properties['securityConfig']['project'] }}
    zone: {{ properties['zone'] }}
    clusterId: {{ CLUSTER_NAME }}
    nodePool:
      name: large-pool
      # NOTE(review): the pool starts at 0 nodes while the autoscaling minimum
      # is 1 - confirm this combination is intended.
      initialNodeCount: 0
      autoscaling:
        enabled: true
        minNodeCount: 1
        maxNodeCount: 10
      config:
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
        machineType: n1-standard-32
        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        # Pin a minimum CPU platform so that AVX2 is supported.
        minCpuPlatform: 'Intel Haswell'
  metadata:
    dependsOn:
    # Only one node pool can be created at a time, so wait for the GPU pool.
    - {{ GPU_POOL }}

{# No project is given here; it defaults to the project of the deployment. #}
# Global static address whose name comes from properties.ipName.
- name: {{ properties['ipName'] }}
  type: compute.v1.globalAddress
  properties:
    description: 'Static IP for Kubeflow ingress.'