Import rev 95381e1892a43a167b0894f583894da2a7067123 of fluxcd/flagger docs

- Add nav entries (frontmatter)
- Import diagrams and screens - use local images (not hotlink GitHub)
- Import redirects too

Addresses: #894

Signed-off-by: Daniel Holbach <daniel@weave.works>
This commit is contained in:
Daniel Holbach 2021-10-31 11:44:58 +01:00
parent 09221d4679
commit edca76e009
88 changed files with 10253 additions and 43 deletions

View File

@ -9,7 +9,7 @@ This repo houses the assets used to build the Flux project's landing page at <ht
> Project | Docs Site | Github Source
> ---------------- | ------------------------------------------| -------------
> Flux | <https://fluxcd.io/docs> | <https://github.com/fluxcd/website>
> Flagger | <https://docs.flagger.app> | <https://github.com/fluxcd/flagger>
> Flagger | <https://fluxcd.io/flagger> | <https://github.com/fluxcd/website>
> Flux (legacy) | <https://fluxcd.io/legacy/flux> | <https://github.com/fluxcd/website>
> Helm Operator | <https://fluxcd.io/legacy/helm-operator/> | <https://github.com/fluxcd/website>

View File

@ -1,7 +1,7 @@
adopters:
project: Flagger
description: >
Check out everything about Flagger on <https://flagger.app>.
Check out everything about Flagger on [its website](/flagger).
companies:
- name: Capra Consulting
url: https://www.capraconsulting.no

28
assets/icons/flagger.svg Normal file
View File

@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg width="64px" height="64px" viewBox="0 0 64 64" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<title>flagger-icon</title>
<g id="flagger-icon" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
<path d="M11.7870673,17.7791346 C10.7376442,17.0966346 10.7376442,15.5602885 11.7870673,14.8783654 L31.1659135,2.27951923 C31.4527085,2.09313128 31.7809164,2 32.109115,2 C32.1144833,2 32.1144833,30.6577081 32.1144833,30.6577081 C31.7844966,30.658555 31.4542722,30.565385 31.1659135,30.3779808 L11.7870673,17.7791346 Z" id="Fill-1" fill="#7BB09F"></path>
<path d="M1.78706731,24.2791346 C0.737644231,23.5966346 0.737644231,22.0602885 1.78706731,21.3783654 L21.1659135,8.77951923 C21.4527085,8.59313128 21.7809164,8.5 22.109147,8.5 C22.1144833,8.5 16.7504591,14.9302441 16.4621003,14.7428399 L1.78706731,24.2791346 Z" id="Fill-1-Copy" fill="#7BB09F"></path>
<path d="M32.1005414,32.7999243 C32.4314477,32.7984255 32.7627112,32.8915919 33.051875,33.0795192 L52.4312981,45.6783654 C53.4807212,46.3602885 53.4807212,47.8966346 52.4312981,48.5791346 L33.051875,61.1779808 C32.7723427,61.3596486 32.4534685,61.452763 32.1336066,61.4575 C32.1336066,61.4575 32.1005414,32.7999243 32.1005414,32.7999243 Z" id="Fill-1" fill="#7BB09F"></path>
<path d="M42.6594534,51.6821881 L62.4312981,39.1784411 C63.4807212,39.8603641 63.4807212,41.3967103 62.4312981,42.0792103 L43.051875,54.6780564 C42.7723427,54.8597243 42.4534685,54.9528387 42.1336066,54.9573995 C42.1336066,54.9573995 42.3702896,51.4942608 42.6594534,51.6821881 Z" id="Fill-1-Copy-2" fill="#7BB09F"></path>
<g id="Group" transform="translate(11.000000, 32.000000)" fill="#7BB09F">
<path d="M23.1648462,1.79890385 C22.7177486,1.57560653 21.9079818,0.775616463 21.12525,0.8 C20.4471662,0.8 20.1819808,1.07951923 19.08525,1.79890385 C20.6106346,2.45198077 26.2588846,4.40082692 27.8390769,4.83755769 C25.3252832,3.17403545 23.7672062,2.16115083 23.1648462,1.79890385 Z" id="Fill-5"></path>
<path d="M17.1629423,3.04898077 L15.4269808,4.17744231 C16.9371113,5.2881491 18.4473527,6.15609036 19.9577051,6.7812661 C23.0272183,8.05182074 26.9292047,8.86995809 27.8191154,9.08936538 C32.8706538,10.3349423 37.6418077,11.5107115 41.4783462,15.3478269 C41.6733462,15.54225 41.8562308,15.7407115 42.0373846,15.93975 C42.4308462,15.1880192 42.2335385,14.1957115 41.4466154,13.6845577 L33.8560385,8.74898077 C32.0133462,8.14090385 30.1360385,7.67590385 28.2806538,7.21898077 C26.5308462,6.78744231 23.5727919,6.02059898 21.9181765,5.49156052 C20.8150996,5.13886821 19.2300215,4.32467496 17.1629423,3.04898077 Z" id="Fill-7"></path>
<path d="M16.8750849,8.20483098 C15.6193488,7.62821266 14.2831279,6.83594208 12.8664221,5.82801923 L11.2119808,6.91782692 C12.8335577,8.16072333 14.2385577,9.07106402 15.4269808,9.648849 C17.2096154,10.5155265 24.0114808,12.3722308 24.4326346,12.4760769 C29.4841731,13.7210769 34.2553269,14.8968462 38.0924423,18.7339615 C38.0987885,18.7408846 38.1045577,18.7472308 38.1114808,18.7541538 L39.75225,17.6868462 C39.6524423,17.5824231 39.5584038,17.4751154 39.4545577,17.3718462 C35.2384038,13.1551154 29.9791731,11.8587692 24.8941731,10.6051154 C24.3137885,10.4620385 18.7586891,9.06975845 16.8750849,8.20483098 Z" id="Fill-11"></path>
<path d="M23.1650769,16.3935 C27.1175769,17.4140769 30.8341154,18.6342692 33.9823846,21.4381154 L35.6439231,20.3581154 C32.0465509,16.9762696 28.005918,15.3433009 23.4551919,14.4789663 C19.3757148,13.7041376 16.6984434,12.7436057 13.6520967,11.5324737 C11.621199,10.7250524 9.97648083,9.72718297 8.71794231,8.53886538 L7.04717308,9.62521154 C9.06194661,11.3700006 10.8411363,12.5664934 12.3847422,13.2146899 C14.4228927,14.0705575 18.0163376,15.1301608 23.1650769,16.3935 Z" id="Fill-17"></path>
<path d="M4.57875,11.2299231 L2.92990385,12.3018462 C2.98759615,12.3612692 3.04009615,12.423 3.09951923,12.4818462 C7.31625,16.6985769 12.5743269,17.9949231 17.6599038,19.2485769 C22.0641346,20.3337692 26.2543269,21.3687692 29.7989423,24.1581923 L31.4893269,23.0591538 C27.4958654,19.6968462 22.7385577,18.5158846 18.1214423,17.3781923 C13.1206731,16.1453077 8.39567308,14.9758846 4.57875,11.2299231" id="Fill-19"></path>
<path d="M1.07555769,14.5060962 C0.883442308,14.3139808 0.702865385,14.1184038 0.524019231,13.9216731 C-0.227711538,14.6745577 -0.139442308,15.9726346 0.80325,16.5853269 L6.50959615,20.2955192 C9.03536538,21.3409038 11.6765192,21.9945577 14.2738269,22.6349423 C18.3284423,23.6341731 22.2019038,24.5924423 25.5578654,26.9157115 L27.2834423,25.7930192 C23.4676731,22.9245577 19.0403654,21.8255192 14.7347885,20.7639808 C9.68382692,19.5189808 4.91267308,18.3432115 1.07555769,14.5060962" id="Fill-21"></path>
<path d="M19.6441154,28.8342692 C20.0243077,29.0188846 20.3998846,29.2133077 20.7691154,29.4221538 C21.2093077,29.5150385 21.6771923,29.4383077 22.0683462,29.1838846 L23.0260385,28.5613846 C19.9493077,26.5035 16.5287308,25.461 13.1196923,24.5927308 L19.6441154,28.8342692 Z" id="Fill-23"></path>
</g>
<g id="Group-Copy" transform="translate(11.000000, 1.200000)" fill="#7BB09F">
<path d="M23.1648462,1.79890385 C22.7177486,1.57560653 21.9079818,0.775616463 21.12525,0.8 C20.4471662,0.8 20.1819808,1.07951923 19.08525,1.79890385 C20.6106346,2.45198077 26.2588846,4.40082692 27.8390769,4.83755769 C25.3252832,3.17403545 23.7672062,2.16115083 23.1648462,1.79890385 Z" id="Fill-5"></path>
<path d="M17.1629423,3.04898077 L15.4269808,4.17744231 C16.9371113,5.2881491 18.4473527,6.15609036 19.9577051,6.7812661 C23.0272183,8.05182074 26.9292047,8.86995809 27.8191154,9.08936538 C32.8706538,10.3349423 37.6418077,11.5107115 41.4783462,15.3478269 C41.6733462,15.54225 41.8562308,15.7407115 42.0373846,15.93975 C42.4308462,15.1880192 42.2335385,14.1957115 41.4466154,13.6845577 L33.8560385,8.74898077 C32.0133462,8.14090385 30.1360385,7.67590385 28.2806538,7.21898077 C26.5308462,6.78744231 23.5727919,6.02059898 21.9181765,5.49156052 C20.8150996,5.13886821 19.2300215,4.32467496 17.1629423,3.04898077 Z" id="Fill-7"></path>
<path d="M16.8750849,8.20483098 C15.6193488,7.62821266 14.2831279,6.83594208 12.8664221,5.82801923 L11.2119808,6.91782692 C12.8335577,8.16072333 14.2385577,9.07106402 15.4269808,9.648849 C17.2096154,10.5155265 24.0114808,12.3722308 24.4326346,12.4760769 C29.4841731,13.7210769 34.2553269,14.8968462 38.0924423,18.7339615 C38.0987885,18.7408846 38.1045577,18.7472308 38.1114808,18.7541538 L39.75225,17.6868462 C39.6524423,17.5824231 39.5584038,17.4751154 39.4545577,17.3718462 C35.2384038,13.1551154 29.9791731,11.8587692 24.8941731,10.6051154 C24.3137885,10.4620385 18.7586891,9.06975845 16.8750849,8.20483098 Z" id="Fill-11"></path>
<path d="M23.1650769,16.3935 C27.1175769,17.4140769 30.8341154,18.6342692 33.9823846,21.4381154 L35.6439231,20.3581154 C32.0465509,16.9762696 28.005918,15.3433009 23.4551919,14.4789663 C19.3757148,13.7041376 16.6984434,12.7436057 13.6520967,11.5324737 C11.621199,10.7250524 9.97648083,9.72718297 8.71794231,8.53886538 L7.04717308,9.62521154 C9.06194661,11.3700006 10.8411363,12.5664934 12.3847422,13.2146899 C14.4228927,14.0705575 18.0163376,15.1301608 23.1650769,16.3935 Z" id="Fill-17"></path>
<path d="M4.57875,11.2299231 L2.92990385,12.3018462 C2.98759615,12.3612692 3.04009615,12.423 3.09951923,12.4818462 C7.31625,16.6985769 12.5743269,17.9949231 17.6599038,19.2485769 C22.0641346,20.3337692 26.2543269,21.3687692 29.7989423,24.1581923 L31.4893269,23.0591538 C27.4958654,19.6968462 22.7385577,18.5158846 18.1214423,17.3781923 C13.1206731,16.1453077 8.39567308,14.9758846 4.57875,11.2299231" id="Fill-19"></path>
<path d="M1.07555769,14.5060962 C0.883442308,14.3139808 0.702865385,14.1184038 0.524019231,13.9216731 C-0.227711538,14.6745577 -0.139442308,15.9726346 0.80325,16.5853269 L6.50959615,20.2955192 C9.03536538,21.3409038 11.6765192,21.9945577 14.2738269,22.6349423 C18.3284423,23.6341731 22.2019038,24.5924423 25.5578654,26.9157115 L27.2834423,25.7930192 C23.4676731,22.9245577 19.0403654,21.8255192 14.7347885,20.7639808 C9.68382692,19.5189808 4.91267308,18.3432115 1.07555769,14.5060962" id="Fill-21"></path>
<path d="M19.6441154,28.8342692 C20.0243077,29.0188846 20.3998846,29.2133077 20.7691154,29.4221538 C21.2093077,29.5150385 21.6771923,29.4383077 22.0683462,29.1838846 L23.0260385,28.5613846 C19.9493077,26.5035 16.5287308,25.461 13.1196923,24.5927308 L19.6441154,28.8342692 Z" id="Fill-23"></path>
</g>
</g>
</svg>


View File

@ -153,14 +153,6 @@ enable = true
icon = "fab fa-youtube"
desc = "Recordings of past Flux dev meetings"
[[menus.main]]
url = "/docs"
name = "Docs"
weight = 1
[[menus.main]]
identifier = "Project"
name = "Project"

View File

@ -255,6 +255,15 @@ What we achieved up until today is only possible because of <a href="/community"
{{% blocks/section color="white" %}}
<p id="flagger"></p>
{{% blocks/project title="Flagger"
image="/img/flagger-gitops.png" image-align="right" bg-color="green"
button1-url="/flagger" %}}
Kubernetes Operator for the automation of promoting canary deployments using Istio, Linkerd, App Mesh, NGINX, Skipper, Contour, Gloo or Traefik routing for traffic shifting, and Prometheus metrics for canary analysis.
The canary analysis can be extended with webhooks for running system integration/acceptance tests, load tests, or any other custom validation.
{{% /blocks/project %}}
<p id="gitops-toolkit"></p>
{{% blocks/project title="GitOps Toolkit"
image="/img/building-blocks.svg" image-align="left" bg-color="blue"
@ -263,15 +272,6 @@ What we achieved up until today is only possible because of <a href="/community"
The set of APIs and controllers that make up the runtime for Flux. You can use the GitOps Toolkit to extend Flux, and to build your own systems for continuous delivery.
{{% /blocks/project %}}
{{% blocks/project title="Flagger"
image="/img/flagger-gitops.png" image-align="right" bg-color="green"
button1-url="https://docs.flagger.app/"
button2-url="https://flagger.app/" %}}
Kubernetes Operator for the automation of promoting canary deployments using Istio, Linkerd, App Mesh, NGINX, Skipper, Contour, Gloo or Traefik routing for traffic shifting, and Prometheus metrics for canary analysis.
The canary analysis can be extended with webhooks for running system integration/acceptance tests, load tests, or any other custom validation.
{{% /blocks/project %}}
{{% blocks/project title="Flux v1 and Helm Operator"
image="/img/logos/flux-horizontal-color.png" image-align="right" bg-color="white"
button1-url="/legacy/flux" button1-caption="Flux v1 Documentation" button1-color="blue"

View File

@ -121,7 +121,7 @@ We are very happy to announce the v1.6 release of Flagger. This release
includes:
- Support for A/B testing using [Gloo
Edge](https://docs.flagger.app/tutorials/gloo-progressive-delivery)
Edge](/flagger/tutorials/gloo-progressive-delivery)
HTTP headers based routing.
- Extended support for Istio\'s `HTTPMatchRequest` and `VirtualService`
delegation.

View File

@ -119,7 +119,7 @@ history, the move itself and what it means:
## Flagger v1.7.0 is out
This release comes with support for manually approving the traffic weight increase. Starting with this version, Flagger can be used with Linkerd v2.10 and its new Viz addon, please see [the updated guide](https://docs.flagger.app/tutorials/linkerd-progressive-delivery). Thanks to the Linkerd team for contributing to Flagger.
This release comes with support for manually approving the traffic weight increase. Starting with this version, Flagger can be used with Linkerd v2.10 and its new Viz addon, please see [the updated guide](/flagger/tutorials/linkerd-progressive-delivery). Thanks to the Linkerd team for contributing to Flagger.
## Upcoming events

View File

@ -106,7 +106,7 @@ where they can be reviewed and approved before going to production.
## Flagger v1.8.0
Until now [Flagger](https://flagger.app) was compatible with Linkerd
Until now [Flagger](/flagger) was compatible with Linkerd
which implements the [Service Mesh Interface](https://smi-spec.io) (SMI) `v1alpha1`.
Starting with v1.8.0, Flagger extends the SMI support for the
`v1alpha2` and `v1alpha3` APIs.

View File

@ -96,7 +96,7 @@ Kubernetes 1.19 is supported.
This release comes with support for [Kuma Service
Mesh](https://kuma.io/). For more details see the [Kuma
Progressive Delivery
tutorial](https://docs.flagger.app/tutorials/kuma-progressive-delivery).
tutorial](/flagger/tutorials/kuma-progressive-delivery).
{{< imgproc flagger-kuma-canary Resize 600x >}}
Kuma Progressive Delivery with Flagger

View File

@ -118,7 +118,7 @@ services are adjusted accordingly.
![Flagger canary](flagger-canary-steps.png)
If you want to get started right away, have a look at [our
tutorial](https://docs.flagger.app/tutorials/gatewayapi-progressive-delivery),
tutorial](/flagger/tutorials/gatewayapi-progressive-delivery),
which shows you how to use Contour's Gateway API implementation and Flagger to automate canary
deployments. It won't take long to follow, but will convey how powerful
this integration is.

View File

@ -77,7 +77,7 @@ project within Kubernetes. Be sure to check out the blog post to find
out how to integrate this into your setups.
The Flux community is happy and proud that
[Flagger](https://docs.flagger.app/) is part of our effort
[Flagger](/flagger/) is part of our effort
to bring GitOps solutions to the world.
### Flux "Maintainers' Focus" Project Board

View File

@ -1,6 +1,5 @@
---
title: Flagger
manualLink: https://docs.flagger.app
manualLinkTarget: _blank
manualLink: /flagger
weight: 150
---

View File

@ -52,7 +52,7 @@ New app release can be automatically delivered to staging using Flux's [image up
For production, you may choose to manually approve app version bumps by configuring Flux
to push the changes to a new branch from which you can create a pull request.
You can limit the impact of an issue that escaped staging testing by using Flagger's
[canary releases](https://docs.flagger.app/).
[canary releases](/flagger/).
Changes to infrastructure can be disruptive and could cause cluster-wide outages.
These config changes can be made in steps, first you merge a change to staging,

View File

@ -1269,7 +1269,7 @@ If you are on GitHub, and are struggling to get started using GitHub Actions, or
[kubecfg yaml parser]: https://github.com/bitnami/kubecfg/blob/master/lib/kubecfg.libsonnet#L25
[bitnami-labs/kube-libsonnet]: https://github.com/bitnami-labs/kube-libsonnet
[example 10.3 jsonnet]: https://github.com/kingdonb/any_old_app/blob/release/0.10.3/manifests/example.jsonnet
[Manual Gating]: https://docs.flagger.app/usage/webhooks#manual-gating
[Manual Gating]: /flagger/usage/webhooks#manual-gating
[flux create tenant]: /cmd/flux_create_tenant
[Flux 2 Multi-Tenancy Guide]: https://github.com/fluxcd/flux2-multi-tenancy
[Mozilla SOPS]: /guides/mozilla-sops/

View File

@ -0,0 +1,45 @@
# Table of contents
* [Introduction](README.md)
* [FAQ](faq.md)
## Install
* [Flagger Install on Kubernetes](install/flagger-install-on-kubernetes.md)
* [Flagger Install on GKE Istio](install/flagger-install-on-google-cloud.md)
* [Flagger Install on EKS App Mesh](install/flagger-install-on-eks-appmesh.md)
* [Flagger Install on Alibaba ServiceMesh](install/flagger-install-on-alibaba-servicemesh.md)
## Usage
* [How it works](usage/how-it-works.md)
* [Deployment Strategies](usage/deployment-strategies.md)
* [Metrics Analysis](usage/metrics.md)
* [Webhooks](usage/webhooks.md)
* [Alerting](usage/alerting.md)
* [Monitoring](usage/monitoring.md)
## Tutorials
* [Istio Canary Deployments](tutorials/istio-progressive-delivery.md)
* [Istio A/B Testing](tutorials/istio-ab-testing.md)
* [Linkerd Canary Deployments](tutorials/linkerd-progressive-delivery.md)
* [App Mesh Canary Deployments](tutorials/appmesh-progressive-delivery.md)
* [Contour Canary Deployments](tutorials/contour-progressive-delivery.md)
* [Gloo Canary Deployments](tutorials/gloo-progressive-delivery.md)
* [NGINX Canary Deployments](tutorials/nginx-progressive-delivery.md)
* [Skipper Canary Deployments](tutorials/skipper-progressive-delivery.md)
* [Traefik Canary Deployments](tutorials/traefik-progressive-delivery.md)
* [Open Service Mesh Deployments](tutorials/osm-progressive-delivery.md)
* [Kuma Canary Deployments](tutorials/kuma-progressive-delivery.md)
* [Gateway API Canary Deployments](tutorials/gatewayapi-progressive-delivery.md)
* [Blue/Green Deployments](tutorials/kubernetes-blue-green.md)
* [Canary analysis with Prometheus Operator](tutorials/prometheus-operator.md)
* [Zero downtime deployments](tutorials/zero-downtime-deployments.md)
## Dev
* [Development Guide](dev/dev-guide.md)
* [Release Guide](dev/release-guide.md)
* [Upgrade Guide](dev/upgrade-guide.md)

View File

@ -0,0 +1,55 @@
---
description: Flagger is a progressive delivery Kubernetes operator
type: docs
weight: 1
title: Flagger
---
[Flagger](https://github.com/fluxcd/flagger) is a progressive delivery tool that automates the release
process for applications running on Kubernetes. It reduces the risk of introducing a new software
version in production by gradually shifting traffic to the new version while measuring metrics
and running conformance tests.
Flagger implements several deployment strategies (Canary releases, A/B testing, Blue/Green mirroring)
using a service mesh (App Mesh, Istio, Linkerd, Kuma, Open Service Mesh)
or an ingress controller (Contour, Gloo, NGINX, Skipper, Traefik) for traffic routing.
For release analysis, Flagger can query Prometheus, InfluxDB, Datadog, New Relic, CloudWatch, Stackdriver
or Graphite, and for alerting it uses Slack, MS Teams, Discord and Rocket.
![Flagger overview diagram](/img/diagrams/flagger-overview.png)
Flagger can be configured with Kubernetes custom resources and is compatible with
any CI/CD solutions made for Kubernetes. Since Flagger is declarative and reacts to Kubernetes events,
it can be used in **GitOps** pipelines together with tools like Flux, JenkinsX, Carvel, Argo, etc.
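For orientation, here is a minimal sketch of such a custom resource; it assumes the Istio provider and a hypothetical `podinfo` deployment, and the exact fields are covered in the usage docs and FAQ:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: podinfo
  namespace: test
spec:
  # workload that Flagger will clone into a primary and progressively promote
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: podinfo
  # ClusterIP services and mesh/ingress routing are generated from this section
  service:
    port: 9898
  # canary analysis: traffic is shifted in steps while metrics are checked
  analysis:
    interval: 1m
    threshold: 5
    maxWeight: 50
    stepWeight: 10
    metrics:
      - name: request-success-rate
        thresholdRange:
          min: 99
        interval: 1m
```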
Flagger is a [Cloud Native Computing Foundation](https://cncf.io/) project
and part of the [Flux](/) family of GitOps tools.
## Getting started
To get started with Flagger, choose one of the supported routing providers and
[install](install/flagger-install-on-kubernetes.md) Flagger with Helm or Kustomize.
After installing Flagger, you can follow one of these tutorials to get started:
**Service mesh tutorials**
* [Istio](tutorials/istio-progressive-delivery.md)
* [Linkerd](tutorials/linkerd-progressive-delivery.md)
* [AWS App Mesh](tutorials/appmesh-progressive-delivery.md)
* [Open Service Mesh](tutorials/osm-progressive-delivery.md)
* [Kuma](tutorials/kuma-progressive-delivery.md)
**Ingress controller tutorials**
* [Contour](tutorials/contour-progressive-delivery.md)
* [Gloo](tutorials/gloo-progressive-delivery.md)
* [NGINX Ingress](tutorials/nginx-progressive-delivery.md)
* [Skipper Ingress](tutorials/skipper-progressive-delivery.md)
* [Traefik](tutorials/traefik-progressive-delivery.md)
**Hands-on GitOps workshops**
* [Istio](https://github.com/stefanprodan/gitops-istio)
* [Linkerd](https://helm.workshop.flagger.dev)
* [AWS App Mesh](https://eks.handson.flagger.dev)

View File

@ -0,0 +1,5 @@
---
type: docs
title: Dev
weight: 5
---

View File

@ -0,0 +1,212 @@
---
type: docs
title: Development Guide
weight: 1
---
This document describes how to build, test and run Flagger from source.
## Setup dev environment
Flagger is written in Go and uses Go modules for dependency management.
On your dev machine install the following tools:
* go >= 1.17
* git >= 2.20
* bash >= 5.0
* make >= 3.81
* kubectl >= 1.22
* kustomize >= 4.4
* helm >= 3.0
* docker >= 19.03
You'll also need a Kubernetes cluster for testing Flagger.
You can use Minikube, Kind, Docker Desktop or any remote cluster (AKS/EKS/GKE/etc.) running Kubernetes version 1.22 or newer.
To start contributing to Flagger, fork the [repository](https://github.com/fluxcd/flagger) on GitHub.
Create a dir inside your `GOPATH`:
```bash
mkdir -p $GOPATH/src/github.com/fluxcd
```
Clone your fork:
```bash
cd $GOPATH/src/github.com/fluxcd
git clone https://github.com/YOUR_USERNAME/flagger
cd flagger
```
Set Flagger repository as upstream:
```bash
git remote add upstream https://github.com/fluxcd/flagger.git
```
Sync your fork regularly to keep it up-to-date with upstream:
```bash
git fetch upstream
git checkout main
git merge upstream/main
```
## Build
Download Go modules:
```bash
go mod download
```
Build Flagger binary:
```bash
make build
```
Build load tester binary:
```bash
make loadtester-build
```
## Code changes
We require all commits to be signed. By signing off with your signature, you
certify that you wrote the patch or otherwise have the right to contribute the
material by the rules of the [DCO](https://raw.githubusercontent.com/fluxcd/flagger/main/DCO).
If your `user.name` and `user.email` are configured in your Git config,
you can sign your commit automatically with:
```bash
git commit -s
```
Before submitting a PR, make sure your changes are covered by unit tests.
If you made changes to `go.mod` run:
```bash
go mod tidy
```
If you made changes to `pkg/apis` regenerate Kubernetes client sets with:
```bash
make codegen
```
Run code formatters:
```bash
go install golang.org/x/tools/cmd/goimports@latest
make fmt
```
Run unit tests:
```bash
make test
```
## API changes
If you made changes to `pkg/apis` regenerate the Kubernetes client sets with:
```bash
make codegen
```
Update the validation spec in `artifacts/flagger/crd.yaml` and run:
```bash
make crd
```
Note that any change to the CRDs must be accompanied by an update to the Open API schema.
## Manual testing
Install a service mesh and/or an ingress controller on your cluster
and deploy Flagger using one of the install options
[listed here](/flagger/install/flagger-install-on-kubernetes).
If you made changes to the CRDs, apply your local copy with:
```bash
kubectl apply -f artifacts/flagger/crd.yaml
```
Shut down the Flagger instance installed on your cluster (replace the namespace with your mesh/ingress one):
```bash
kubectl -n istio-system scale deployment/flagger --replicas=0
```
Port forward to your Prometheus instance:
```bash
kubectl -n istio-system port-forward svc/prometheus 9090:9090
```
Run Flagger locally against your remote cluster by specifying a kubeconfig path:
```bash
go run cmd/flagger/ -kubeconfig=$HOME/.kube/config \
-log-level=info \
-mesh-provider=istio \
-metrics-server=http://localhost:9090
```
Another option to manually test your changes is to build and push the image to your container registry:
```bash
make build
docker build -t <YOUR-DOCKERHUB-USERNAME>/flagger:<YOUR-TAG> .
docker push <YOUR-DOCKERHUB-USERNAME>/flagger:<YOUR-TAG>
```
Deploy your image on the cluster and scale up Flagger:
```bash
kubectl -n istio-system set image deployment/flagger flagger=<YOUR-DOCKERHUB-USERNAME>/flagger:<YOUR-TAG>
kubectl -n istio-system scale deployment/flagger --replicas=1
```
Now you can use one of the [tutorials](/flagger/) to manually test your changes.
## Integration testing
Flagger end-to-end tests can be run locally with [Kubernetes Kind](https://github.com/kubernetes-sigs/kind).
Create a Kind cluster:
```bash
kind create cluster
```
Build Flagger container image and load it on the cluster:
```bash
make build
docker build -t test/flagger:latest .
kind load docker-image test/flagger:latest
```
Run the Istio e2e tests:
```bash
./test/istio/run.sh
```
For each service mesh and ingress controller, there is a dedicated e2e test suite,
choose one that matches your changes from this [list](https://github.com/fluxcd/flagger/tree/main/test).
When you open a pull request on the Flagger repo, the unit and integration tests will be run in CI.

View File

@ -0,0 +1,38 @@
---
type: docs
title: Release Guide
weight: 1
---
This document describes how to release Flagger.
## Release
To release a new Flagger version (e.g. `2.0.0`) follow these steps:
* create a branch `git checkout -b prep-2.0.0`
* set the version in code and manifests `TAG=2.0.0 make version-set`
* commit changes and merge PR
* check out main `git checkout main && git pull`
* tag main `make release`
## CI
After the tag has been pushed to GitHub, the CI release pipeline does the following:
* creates a GitHub release
* pushes the Flagger binary and change log to GitHub release
* pushes the Flagger container image to Docker Hub
* pushes the Helm chart to the `github-pages` branch
* GitHub Pages publishes the new chart version to the Helm repository
## Docs
The documentation [website](/flagger) is built from the `docs` branch.
After a Flagger release, publish the docs with:
* `git checkout main && git pull`
* `git checkout docs`
* `git rebase main`
* `git push origin docs`

View File

@ -0,0 +1,94 @@
---
type: docs
weight: 3
title: Upgrade Guide
---
This document describes how to upgrade Flagger.
## Upgrade canaries v1alpha3 to v1beta1
Canary CRD changes in `canaries.flagger.app/v1beta1`:
* the `spec.canaryAnalysis` field has been deprecated and replaced with `spec.analysis`
* the `spec.analysis.interval` and `spec.analysis.threshold` fields are required
* the `status.lastAppliedSpec` and `status.lastPromotedSpec` hashing algorithm changed to `hash/fnv`
* the `spec.analysis.alerts` array can reference `alertproviders.flagger.app/v1beta1` resources
* the `spec.analysis.metrics[].templateRef` can reference a `metrictemplate.flagger.app/v1beta1` resource
* the `metric.threshold` field has been deprecated and replaced with `metric.thresholdRange`
* the `metric.query` field has been deprecated and replaced with `metric.templateRef`
* the `spec.ingressRef.apiVersion` accepts `networking.k8s.io/v1beta1`
* the `spec.targetRef` can reference `DaemonSet` kind
* the `spec.service.meshName` field has been deprecated and no longer used for `provider: appmesh:v1beta2`
Upgrade procedure:
* install the `v1beta1` CRDs
* update Flagger deployment
* replace `apiVersion: flagger.app/v1alpha3` with `apiVersion: flagger.app/v1beta1` in all canary manifests
* replace `spec.canaryAnalysis` with `spec.analysis` in all canary manifests
* update canary manifests in cluster
**Note** that after upgrading Flagger, all canaries will be triggered as the hash value used for tracking changes is computed differently. You can set `spec.skipAnalysis: true` in all canary manifests before upgrading Flagger, do the upgrade, wait for Flagger to finish the no-op promotions and finally set `skipAnalysis` to `false`.
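For illustration, a hypothetical canary named `podinfo` would change roughly like this (only the fields affected by the rename are shown):
```yaml
# before the upgrade
apiVersion: flagger.app/v1alpha3
kind: Canary
metadata:
  name: podinfo
spec:
  canaryAnalysis:
    interval: 1m
    threshold: 5
---
# after the upgrade
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: podinfo
spec:
  analysis:
    interval: 1m
    threshold: 5
```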
Update builtin metrics:
* replace `threshold` with `thresholdRange.min` for request-success-rate
* replace `threshold` with `thresholdRange.max` for request-duration
```yaml
metrics:
- name: request-success-rate
thresholdRange:
min: 99
interval: 1m
- name: request-duration
thresholdRange:
max: 500
interval: 1m
```
## Istio telemetry v2
Istio 1.5 comes with a breaking change for Flagger users. In Istio telemetry v2 the metric `istio_request_duration_seconds_bucket` has been removed and replaced with `istio_request_duration_milliseconds_bucket`, and this breaks the `request-duration` metric check.
If you are using **Istio 1.4**, you can create a metric template using the old duration metric like this:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: latency
namespace: istio-system
spec:
provider:
type: prometheus
address: http://prometheus.istio-system:9090
query: |
histogram_quantile(
0.99,
sum(
rate(
istio_request_duration_seconds_bucket{
reporter="destination",
destination_workload_namespace="{{ namespace }}",
destination_workload=~"{{ target }}"
}[{{ interval }}]
)
) by (le)
)
```
In the canary manifests, replace the `request-duration` metric with `latency`:
```yaml
metrics:
- name: latency
templateRef:
name: latency
namespace: istio-system
thresholdRange:
max: 0.500
interval: 1m
```

947
content/en/flagger/faq.md Normal file
View File

@ -0,0 +1,947 @@
---
type: docs
weight: 10
title: FAQ
---
## Deployment Strategies
#### Which deployment strategies are supported by Flagger?
Flagger implements the following deployment strategies:
* [Canary Release](usage/deployment-strategies.md#canary-release)
* [A/B Testing](usage/deployment-strategies.md#ab-testing)
* [Blue/Green](usage/deployment-strategies.md#bluegreen-deployments)
* [Blue/Green Mirroring](usage/deployment-strategies.md#bluegreen-with-traffic-mirroring)
#### When should I use A/B testing instead of progressive traffic shifting?
For frontend applications that require session affinity, you should use HTTP headers or
cookie match conditions to ensure a set of users will stay on the same version for
the whole duration of the canary analysis.
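As a rough sketch (assuming the Istio provider and a hypothetical `x-canary` header and `canary` cookie), an A/B test replaces weight-based traffic shifting with a fixed number of iterations and HTTP match conditions:
```yaml
analysis:
  # total number of iterations for the A/B test
  iterations: 10
  # requests matching these conditions are routed to the canary
  match:
    - headers:
        x-canary:
          exact: "insider"
    - headers:
        cookie:
          regex: "^(.*?;)?(canary=always)(;.*)?$"
```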
#### Can I use Flagger to manage applications that live outside of a service mesh?
For applications that are not deployed on a service mesh,
Flagger can orchestrate Blue/Green style deployments with Kubernetes L4 networking.
#### When can I use traffic mirroring?
Traffic mirroring can be used for Blue/Green deployment strategy or a pre-stage in a Canary release.
Traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service.
Mirroring should be used for requests that are **idempotent**
or capable of being processed twice (once by the primary and once by the canary).
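With the Istio provider, mirroring is enabled on the analysis spec; a minimal sketch (field values are illustrative):
```yaml
analysis:
  # number of iterations to run with mirrored traffic before promotion
  iterations: 10
  # copy requests to the canary while users are still served by the primary
  mirror: true
  # percentage of traffic to mirror (defaults to 100)
  mirrorWeight: 100
```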
#### How to retry a failed release?
A canary analysis is triggered by changes in any of the following objects:
* Deployment/DaemonSet PodSpec (metadata, container image, command, ports, env, resources, etc)
* ConfigMaps mounted as volumes or mapped to environment variables
* Secrets mounted as volumes or mapped to environment variables
To retry a release you can add or change an annotation on the pod template:
```yaml
apiVersion: apps/v1
kind: Deployment
spec:
template:
metadata:
annotations:
timestamp: "2020-03-10T14:24:48+0000"
```
#### How to change replicas for a deployment when not using HPA?
To change replicas for a deployment when not using HPA, you have to update the canary deployment with the desired replica count
and trigger an analysis by annotating the template. After the analysis finishes, Flagger will promote the `spec.replicas` changes to the primary deployment.
Example:
```yaml
apiVersion: apps/v1
kind: Deployment
spec:
replicas: 4 #update replicas
template:
metadata:
annotations:
timestamp: "2022-02-10T14:24:48+0000" #add annotation to trigger analysis
```
#### Why is there a window of downtime during the canary initializing process when analysis is disabled?
A window of downtime is the intended behavior when the analysis is disabled. This allows instant rollback and also mimics the way
a Kubernetes deployment initialization works. To avoid this, enable the analysis (`skipAnalysis: false`), wait for the initialization
to finish, and disable it afterward (`skipAnalysis: true`).
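A sketch of the field in question, on a hypothetical `podinfo` canary:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: podinfo
spec:
  # false keeps the analysis enabled and avoids the downtime window during
  # initialization; switch back to true afterward for instant rollbacks
  skipAnalysis: false
```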
## Kubernetes services
#### How is an application exposed inside the cluster?
Assuming the app name is `podinfo`, you can define a canary like:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
service:
# service name (optional)
name: podinfo
# ClusterIP port number (required)
port: 9898
# container port name or number
targetPort: http
# port name can be http or grpc (default http)
portName: http
```
If the `service.name` is not specified, then `targetRef.name` is used for
the apex domain and canary/primary services name prefix.
You should treat the service name as an immutable field; changing it could result in routing conflicts.
Based on the canary spec service, Flagger generates the following Kubernetes ClusterIP service:
* `<service.name>.<namespace>.svc.cluster.local`
selector `app=<name>-primary`
* `<service.name>-primary.<namespace>.svc.cluster.local`
selector `app=<name>-primary`
* `<service.name>-canary.<namespace>.svc.cluster.local`
selector `app=<name>`
This ensures that traffic coming from a namespace outside the mesh to `podinfo.test:9898`
will be routed to the latest stable release of your app.
```yaml
apiVersion: v1
kind: Service
metadata:
name: podinfo
spec:
type: ClusterIP
selector:
app: podinfo-primary
ports:
- name: http
port: 9898
protocol: TCP
targetPort: http
---
apiVersion: v1
kind: Service
metadata:
name: podinfo-primary
spec:
type: ClusterIP
selector:
app: podinfo-primary
ports:
- name: http
port: 9898
protocol: TCP
targetPort: http
---
apiVersion: v1
kind: Service
metadata:
name: podinfo-canary
spec:
type: ClusterIP
selector:
app: podinfo
ports:
- name: http
port: 9898
protocol: TCP
targetPort: http
```
The `podinfo-canary.test:9898` address is available only during the canary analysis
and can be used for conformance testing or load testing.
## Multiple ports
#### My application listens on multiple ports. How can I expose them inside the cluster?
If port discovery is enabled, Flagger scans the deployment spec and extracts the container ports, excluding
the port specified in the canary service and the Envoy sidecar ports.
These ports will be used when generating the ClusterIP services.
For a deployment that exposes two ports:
```yaml
apiVersion: apps/v1
kind: Deployment
spec:
template:
metadata:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9899"
spec:
containers:
- name: app
ports:
- containerPort: 8080
- containerPort: 9090
```
You can enable port discovery so that Prometheus will be able to reach port `9090` over mTLS:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
spec:
service:
# container port used for canary analysis
port: 8080
# port name can be http or grpc (default http)
portName: http
# add all the other container ports
# to the ClusterIP services (default false)
portDiscovery: true
trafficPolicy:
tls:
mode: ISTIO_MUTUAL
```
Both ports `8080` and `9090` will be added to the ClusterIP services.
## Label selectors
#### Which label selectors are supported by Flagger?
The target deployment must have a single label selector in the format `app: <DEPLOYMENT-NAME>`:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: podinfo
spec:
selector:
matchLabels:
app: podinfo
template:
metadata:
labels:
app: podinfo
```
Besides `app`, Flagger supports `name` and `app.kubernetes.io/name` selectors.
If you use a different convention, you can specify your label with the `-selector-labels` flag.
For example:
```
flagger \
-selector-labels=service,name,app.kubernetes.io/name \
...
```
#### Are pod affinity and anti-affinity supported?
Flagger will rewrite the first value in each match expression,
defined in the target deployment's pod anti-affinity and topology spread constraints,
satisfying the following two requirements when creating, or updating, the primary deployment:
* The key in the match expression must be one of the labels specified by the parameter selector-labels.
The default labels are `app`,`name`,`app.kubernetes.io/name`.
* The value must match the name of the target deployment.
The rewrite done by Flagger in these cases is to suffix the value with `-primary`.
This rewrite can be used to spread the pods created by the canary
and primary deployments across different availability zones.
Example target deployment:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: podinfo
spec:
selector:
matchLabels:
app: podinfo
template:
metadata:
labels:
app: podinfo
spec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- podinfo
topologyKey: topology.kubernetes.io/zone
```
Example of generated primary deployment:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: podinfo-primary
spec:
selector:
matchLabels:
app: podinfo-primary
template:
metadata:
labels:
app: podinfo-primary
spec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- podinfo-primary
topologyKey: topology.kubernetes.io/zone
```
It is also possible to use a label other than `app`, `name` or `app.kubernetes.io/name`.
Anti-affinity example (using a different label):
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: podinfo
spec:
selector:
matchLabels:
app: podinfo
affinity: podinfo
template:
metadata:
labels:
app: podinfo
affinity: podinfo
spec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchLabels:
affinity: podinfo
topologyKey: topology.kubernetes.io/zone
```
## Metrics
#### How does Flagger measure the request success rate and duration?
By default, Flagger measures the request success rate and duration using Prometheus queries.
#### HTTP requests success rate percentage
Spec:
```yaml
analysis:
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
```
Istio query:
```javascript
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"$namespace",
destination_workload=~"$workload",
response_code!~"5.*"
}[$interval]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace=~"$namespace",
destination_workload=~"$workload"
}[$interval]
)
)
```
Envoy query (App Mesh):
```javascript
sum(
rate(
envoy_cluster_upstream_rq{
kubernetes_namespace="$namespace",
kubernetes_pod_name=~"$workload",
envoy_response_code!~"5.*"
}[$interval]
)
)
/
sum(
rate(
envoy_cluster_upstream_rq{
kubernetes_namespace="$namespace",
kubernetes_pod_name=~"$workload"
}[$interval]
)
)
```
Envoy query (Contour and Gloo):
```javascript
sum(
rate(
envoy_cluster_upstream_rq{
envoy_cluster_name=~"$namespace-$workload",
envoy_response_code!~"5.*"
}[$interval]
)
)
/
sum(
rate(
envoy_cluster_upstream_rq{
envoy_cluster_name=~"$namespace-$workload",
}[$interval]
)
)
```
#### HTTP requests milliseconds duration P99
Spec:
```yaml
analysis:
metrics:
- name: request-duration
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
interval: 1m
```
Istio query:
```javascript
histogram_quantile(0.99,
sum(
irate(
istio_request_duration_milliseconds_bucket{
reporter="destination",
destination_workload=~"$workload",
destination_workload_namespace=~"$namespace"
}[$interval]
)
) by (le)
)
```
Envoy query (App Mesh, Contour and Gloo):
```javascript
histogram_quantile(0.99,
sum(
irate(
envoy_cluster_upstream_rq_time_bucket{
kubernetes_pod_name=~"$workload",
kubernetes_namespace=~"$namespace"
}[$interval]
)
) by (le)
)
```
> **Note** that the metric interval should be lower than or equal to the control loop interval.
#### Can I use custom metrics?
The analysis can be extended with metrics provided by Prometheus, Datadog, AWS CloudWatch, New Relic and Graphite.
For more details on how custom metrics can be used, please read the [metrics docs](usage/metrics.md).
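As a rough sketch of the mechanics (the metric name, query and Prometheus address below are placeholders), a custom metric is defined as a `MetricTemplate` and referenced from the canary analysis:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
  name: error-rate
  namespace: test
spec:
  provider:
    type: prometheus
    address: http://prometheus.monitoring:9090
  query: |
    100 - sum(rate(http_requests_total{namespace="{{ namespace }}",status!~"5.."}[{{ interval }}]))
    /
    sum(rate(http_requests_total{namespace="{{ namespace }}"}[{{ interval }}])) * 100
---
# referenced from the canary analysis (other required fields omitted)
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: podinfo
spec:
  analysis:
    metrics:
      - name: error-rate
        templateRef:
          name: error-rate
          namespace: test
        thresholdRange:
          max: 1
        interval: 1m
```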
#### Istio Gateway API
If you're using Istio with Gateway API, the Prometheus query needs to include `reporter="source"`. For example, to calculate HTTP requests error percentage, the query would be:
```javascript
100 - sum(
rate(
istio_requests_total{
reporter="source",
destination_workload_namespace=~"$namespace",
destination_workload=~"$workload",
response_code!~"5.*"
}[$interval]
)
)
/
sum(
rate(
istio_requests_total{
reporter="source",
destination_workload_namespace=~"$namespace",
destination_workload=~"$workload"
}[$interval]
)
) * 100
```
## Istio routing
#### How does Flagger interact with Istio?
Flagger creates an Istio Virtual Service and Destination Rules based on the Canary service spec.
The service configuration lets you expose an app inside or outside the mesh. You can also define traffic policies,
HTTP match conditions, URI rewrite rules, CORS policies, timeout and retries.
The following spec exposes the `frontend` workload inside the mesh on `frontend.test.svc.cluster.local:9898`
and outside the mesh on `frontend.example.com`. You'll have to specify an Istio ingress gateway for external hosts.
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: frontend
namespace: test
spec:
service:
# container port
port: 9898
# service port name (optional, will default to "http")
portName: http-frontend
# Istio gateways (optional)
gateways:
- public-gateway.istio-system.svc.cluster.local
- mesh
# Istio virtual service host names (optional)
hosts:
- frontend.example.com
# Istio traffic policy
trafficPolicy:
tls:
# use ISTIO_MUTUAL when mTLS is enabled
mode: DISABLE
# HTTP match conditions (optional)
match:
- uri:
prefix: /
# HTTP rewrite (optional)
rewrite:
uri: /
# Istio retry policy (optional)
retries:
attempts: 3
perTryTimeout: 1s
retryOn: "gateway-error,connect-failure,refused-stream"
# Add headers (optional)
headers:
request:
add:
x-some-header: "value"
# cross-origin resource sharing policy (optional)
corsPolicy:
allowOrigin:
- example.com
allowMethods:
- GET
allowCredentials: false
allowHeaders:
- x-some-header
maxAge: 24h
```
For the above spec Flagger will generate the following virtual service:
```yaml
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
name: frontend
namespace: test
ownerReferences:
- apiVersion: flagger.app/v1beta1
blockOwnerDeletion: true
controller: true
kind: Canary
name: podinfo
uid: 3a4a40dd-3875-11e9-8e1d-42010a9c0fd1
spec:
gateways:
- public-gateway.istio-system.svc.cluster.local
- mesh
hosts:
- frontend.example.com
- frontend
http:
- corsPolicy:
allowHeaders:
- x-some-header
allowMethods:
- GET
allowOrigin:
- example.com
maxAge: 24h
headers:
request:
add:
x-some-header: "value"
match:
- uri:
prefix: /
rewrite:
uri: /
route:
- destination:
host: podinfo-primary
weight: 100
- destination:
host: podinfo-canary
weight: 0
retries:
attempts: 3
perTryTimeout: 1s
retryOn: "gateway-error,connect-failure,refused-stream"
```
For each destination in the virtual service, a destination rule is generated:
```yaml
apiVersion: networking.istio.io/v1alpha3
kind: DestinationRule
metadata:
name: frontend-primary
namespace: test
spec:
host: frontend-primary
trafficPolicy:
tls:
mode: DISABLE
---
apiVersion: networking.istio.io/v1alpha3
kind: DestinationRule
metadata:
name: frontend-canary
namespace: test
spec:
host: frontend-canary
trafficPolicy:
tls:
mode: DISABLE
```
Flagger keeps the virtual service and destination rules in sync with the canary service spec.
Any direct modification to the virtual service spec will be overwritten.
To expose a workload inside the mesh on `http://backend.test.svc.cluster.local:9898`,
the service spec can contain only the container port and the traffic policy:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: backend
namespace: test
spec:
service:
port: 9898
trafficPolicy:
tls:
mode: DISABLE
```
Based on the above spec, Flagger will create several ClusterIP services like:
```yaml
apiVersion: v1
kind: Service
metadata:
name: backend-primary
ownerReferences:
- apiVersion: flagger.app/v1beta1
blockOwnerDeletion: true
controller: true
kind: Canary
name: backend
uid: 2ca1a9c7-2ef6-11e9-bd01-42010a9c0145
spec:
type: ClusterIP
ports:
- name: http
port: 9898
protocol: TCP
targetPort: 9898
selector:
app: backend-primary
```
Flagger works for user facing apps exposed outside the cluster via an ingress gateway and for backend HTTP APIs
that are accessible only from inside the mesh.
If `delegation` is enabled, Flagger generates an Istio VirtualService without hosts and gateways,
making the service compatible with Istio delegation.
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: backend
namespace: test
spec:
service:
delegation: true
port: 9898
targetRef:
apiVersion: v1
kind: Deployment
name: podinfo
analysis:
interval: 15s
threshold: 15
maxWeight: 30
stepWeight: 10
```
Based on the above spec, Flagger will create the following virtual service:
```yaml
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
name: backend
namespace: test
ownerReferences:
- apiVersion: flagger.app/v1beta1
blockOwnerDeletion: true
controller: true
kind: Canary
name: backend
uid: 58562662-5e10-4512-b269-2b789c1b30fe
spec:
http:
- route:
- destination:
host: podinfo-primary
weight: 100
- destination:
host: podinfo-canary
weight: 0
```
A parent virtual service like the following can then forward traffic for `/podinfo` to the delegate VirtualService above.
```yaml
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
name: frontend
namespace: test
spec:
gateways:
- public-gateway.istio-system.svc.cluster.local
- mesh
hosts:
- frontend.example.com
- frontend
http:
- match:
- uri:
prefix: /podinfo
rewrite:
uri: /
delegate:
name: backend
namespace: test
```
Note that the pilot environment variable `PILOT_ENABLE_VIRTUAL_SERVICE_DELEGATE` must also be set.
For the use of Istio Delegation, you can refer to the documentation of
[Virtual Service](https://istio.io/latest/docs/reference/config/networking/virtual-service/#Delegate)
and [pilot environment variables](https://istio.io/latest/docs/reference/commands/pilot-discovery/#envvars).
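If you manage Istio with the `IstioOperator` API, one way to set that variable is sketched below (adapt this to however you install Istio):
```yaml
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
spec:
  components:
    pilot:
      k8s:
        env:
          # required for VirtualService delegation support
          - name: PILOT_ENABLE_VIRTUAL_SERVICE_DELEGATE
            value: "true"
```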
## Istio Ingress Gateway
#### How can I expose multiple canaries on the same external domain?
Assuming you have two apps -- one that serves the main website and one that serves its REST API --
you can define a canary object for each app as:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: website
spec:
service:
port: 8080
gateways:
- public-gateway.istio-system.svc.cluster.local
hosts:
- my-site.com
match:
- uri:
prefix: /
rewrite:
uri: /
---
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: webapi
spec:
service:
port: 8080
gateways:
- public-gateway.istio-system.svc.cluster.local
hosts:
- my-site.com
match:
- uri:
prefix: /api
rewrite:
uri: /
```
Based on the above configuration, Flagger will create two virtual services bound
to the same ingress gateway and external host.
Istio Pilot will
[merge](https://istio.io/help/ops/traffic-management/deploy-guidelines/#multiple-virtual-services-and-destination-rules-for-the-same-host)
the two services and the website rule will be moved to the end of the list in the merged configuration.
Note that host merging only works if the canaries are bound to an ingress gateway other than the `mesh` gateway.
## Istio Mutual TLS
#### How can I enable mTLS for a canary?
When deploying Istio with global mTLS enabled, you have to set the TLS mode to `ISTIO_MUTUAL`:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
spec:
service:
trafficPolicy:
tls:
mode: ISTIO_MUTUAL
```
If you run Istio in permissive mode, you can disable TLS:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
spec:
service:
trafficPolicy:
tls:
mode: DISABLE
```
#### If Flagger is outside of the mesh, how can it start the load test?
In order for Flagger to be able to call the load tester service from outside the mesh,
you need to disable mTLS:
```yaml
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: flagger-loadtester
namespace: test
spec:
host: "flagger-loadtester.test.svc.cluster.local"
trafficPolicy:
tls:
mode: DISABLE
---
apiVersion: security.istio.io/v1beta1
kind: PeerAuthentication
metadata:
name: flagger-loadtester
namespace: test
spec:
selector:
matchLabels:
app: flagger-loadtester
mtls:
mode: DISABLE
```
## ExternalDNS
### Can I use annotations?
Flagger propagates annotations (and labels) to all the generated apex,
primary and canary objects. This allows using external-dns annotations.
You can configure Flagger to set annotations with:
```yaml
spec:
service:
apex:
annotations:
external-dns.alpha.kubernetes.io/hostname: "mydomain.com"
primary:
annotations:
external-dns.alpha.kubernetes.io/hostname: "primary.mydomain.com"
canary:
annotations:
external-dns.alpha.kubernetes.io/hostname: "canary.mydomain.com"
```
### Multiple sources and Istio
**/!\\** The apex annotations are added to both the generated Kubernetes Services and the generated Istio
VirtualServices objects. If you have configured external-dns to use both sources,
this will create conflicts!
```yaml
spec:
containers:
args:
- --source=service # choose only one
- --source=istio-virtualservice # of these two
```
[Check out the ExternalDNS documentation](https://github.com/kubernetes-sigs/external-dns/blob/master/docs/tutorials/istio.md)

View File

@ -0,0 +1,5 @@
---
type: docs
weight: 3
title: Install
---

View File

@ -0,0 +1,140 @@
---
type: docs
weight: 4
title: Flagger Install on Alibaba ServiceMesh
---
This guide walks you through setting up Flagger on Alibaba ServiceMesh.
## Prerequisites
- Created an ACK ([Alibaba Cloud Container Service for Kubernetes](https://cs.console.aliyun.com)) cluster instance.
- Created an ASM ([Alibaba ServiceMesh](https://servicemesh.console.aliyun.com)) instance and added the ACK cluster to it.
### Variables declaration
- `$ACK_CONFIG`: the kubeconfig file path of ACK, which will be treated as `$HOME/.kube/config` in the rest of this guide.
- `$MESH_CONFIG`: the kubeconfig file path of ASM.
- `$ISTIO_RELEASE`: see https://github.com/istio/istio/releases
- `$FLAGGER_SRC`: see https://github.com/fluxcd/flagger
## Install Prometheus
Install Prometheus:
```bash
kubectl apply -f $ISTIO_RELEASE/samples/addons/prometheus.yaml
```
which is the same as the command below:
```bash
kubectl --kubeconfig "$ACK_CONFIG" apply -f $ISTIO_RELEASE/samples/addons/prometheus.yaml
```
Append the configuration below to `scrape_configs` in the Prometheus ConfigMap to support telemetry:
```yaml
scrape_configs:
# Mixer scraping. Defaults to Prometheus and Mixer in the same namespace.
- job_name: 'istio-mesh'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-telemetry;prometheus
# Scrape config for envoy stats
- job_name: 'envoy-stats'
metrics_path: /stats/prometheus
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: '.*-envoy-prom'
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:15090
target_label: __address__
- action: labeldrop
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: pod_name
- job_name: 'istio-policy'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-policy;http-policy-monitoring
- job_name: 'istio-telemetry'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-telemetry;http-monitoring
- job_name: 'pilot'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istiod;http-monitoring
- source_labels: [__meta_kubernetes_service_label_app]
target_label: app
- job_name: 'sidecar-injector'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-sidecar-injector;http-monitoring
```
## Install Flagger
Add Flagger Helm repository:
```bash
helm repo add flagger https://flagger.app
helm repo update
```
Install Flagger's Canary CRD:
```bash
kubectl apply -f $FLAGGER_SRC/artifacts/flagger/crd.yaml
```
Deploy Flagger for Alibaba ServiceMesh:
```bash
cp $MESH_CONFIG kubeconfig
kubectl -n istio-system create secret generic istio-kubeconfig --from-file kubeconfig
kubectl -n istio-system label secret istio-kubeconfig istio/multiCluster=true
helm upgrade -i flagger flagger/flagger \
--namespace=istio-system \
--set crd.create=false \
--set meshProvider=istio \
--set metricsServer=http://prometheus:9090 \
--set istio.kubeconfig.secretName=istio-kubeconfig \
--set istio.kubeconfig.key=kubeconfig
```

View File

@ -0,0 +1,155 @@
---
type: docs
weight: 3
title: Flagger Install on EKS App Mesh
---
This guide walks you through setting up Flagger and AWS App Mesh on EKS.
## App Mesh
The App Mesh integration with EKS is made up of the following components:
* Kubernetes custom resources
* `mesh.appmesh.k8s.aws` defines a logical boundary for network traffic between the services
* `virtualnode.appmesh.k8s.aws` defines a logical pointer to a Kubernetes workload
* `virtualservice.appmesh.k8s.aws` defines the routing rules for a workload inside the mesh
* CRD controller - keeps the custom resources in sync with the App Mesh control plane
* Admission controller - injects the Envoy sidecar and assigns Kubernetes pods to App Mesh virtual nodes
* Telemetry service - Prometheus instance that collects and stores Envoy's metrics
## Create a Kubernetes cluster
In order to create an EKS cluster you can use [eksctl](https://eksctl.io).
Eksctl is an open source command-line utility made by Weaveworks in collaboration with Amazon.
On MacOS you can install eksctl with Homebrew:
```bash
brew tap weaveworks/tap
brew install weaveworks/tap/eksctl
```
Create an EKS cluster with:
```bash
eksctl create cluster --name=appmesh \
--region=us-west-2 \
--nodes 3 \
--node-volume-size=120 \
--appmesh-access
```
The above command will create a three-node cluster with the
App Mesh [IAM policy](https://docs.aws.amazon.com/app-mesh/latest/userguide/MESH_IAM_user_policies.html)
attached to the EKS node instance role.
Verify the install with:
```bash
kubectl get nodes
```
## Install Helm
Install the [Helm](https://docs.helm.sh/using_helm/#installing-helm) v3 command-line tool:
```text
brew install helm
```
Add the EKS repository to Helm:
```bash
helm repo add eks https://aws.github.io/eks-charts
```
## Enable horizontal pod auto-scaling
Install the Horizontal Pod Autoscaler (HPA) metrics provider:
```bash
helm upgrade -i metrics-server stable/metrics-server \
--namespace kube-system \
--set args[0]=--kubelet-preferred-address-types=InternalIP
```
After a minute, the metrics API should report CPU and memory usage for pods. You can verify the metrics API with:
```bash
kubectl -n kube-system top pods
```
## Install the App Mesh components
Install the App Mesh CRDs:
```bash
kubectl apply -k github.com/aws/eks-charts/stable/appmesh-controller//crds?ref=master
```
Create the `appmesh-system` namespace:
```bash
kubectl create ns appmesh-system
```
Install the App Mesh controller:
```bash
helm upgrade -i appmesh-controller eks/appmesh-controller \
--wait --namespace appmesh-system
```
In order to collect the App Mesh metrics that Flagger needs to run the canary analysis,
you'll need to set up a Prometheus instance to scrape the Envoy sidecars.
Install the App Mesh Prometheus:
```bash
helm upgrade -i appmesh-prometheus eks/appmesh-prometheus \
--wait --namespace appmesh-system
```
## Install Flagger
Add Flagger Helm repository:
```bash
helm repo add flagger https://flagger.app
```
Install Flagger's Canary CRD:
```bash
kubectl apply -f https://raw.githubusercontent.com/fluxcd/flagger/main/artifacts/flagger/crd.yaml
```
Deploy Flagger in the _**appmesh-system**_ namespace:
```bash
helm upgrade -i flagger flagger/flagger \
--namespace=appmesh-system \
--set crd.create=false \
--set meshProvider=appmesh:v1beta2 \
--set metricsServer=http://appmesh-prometheus:9090
```
## Install Grafana
Deploy App Mesh Grafana that comes with a dashboard for monitoring Flagger's canary releases:
```bash
helm upgrade -i appmesh-grafana eks/appmesh-grafana \
--namespace appmesh-system
```
You can access Grafana using port forwarding:
```bash
kubectl -n appmesh-system port-forward svc/appmesh-grafana 3000:3000
```
Now that you have Flagger running, you can try the
[App Mesh canary deployments tutorial](/flagger/tutorials/appmesh-progressive-delivery).

View File

@ -0,0 +1,404 @@
---
type: docs
weight: 2
title: Flagger Install on GKE Istio
---
This guide walks you through setting up Flagger and Istio on Google Kubernetes Engine.
![GKE Cluster Overview](/img/diagrams/flagger-gke-istio.png)
## Prerequisites
You will be creating a cluster on Google's Kubernetes Engine \(GKE\). If you don't have an account, you can sign up [here](https://cloud.google.com/free/) for free credits.
Log in to Google Cloud, create a project and enable billing for it.
Install the [gcloud](https://cloud.google.com/sdk/) command line utility and configure your project with `gcloud init`.
Set the default project \(replace `PROJECT_ID` with your own project\):
```text
gcloud config set project PROJECT_ID
```
Set the default compute region and zone:
```text
gcloud config set compute/region us-central1
gcloud config set compute/zone us-central1-a
```
Enable the Kubernetes and Cloud DNS services for your project:
```text
gcloud services enable container.googleapis.com
gcloud services enable dns.googleapis.com
```
Install the kubectl command-line tool:
```text
gcloud components install kubectl
```
## GKE cluster setup
Create a cluster with the Istio add-on:
```bash
K8S_VERSION=$(gcloud container get-server-config --format=json \
| jq -r '.validMasterVersions[0]')
gcloud beta container clusters create istio \
--cluster-version=${K8S_VERSION} \
--zone=us-central1-a \
--num-nodes=2 \
--machine-type=n1-highcpu-4 \
--preemptible \
--no-enable-cloud-logging \
--no-enable-cloud-monitoring \
--disk-size=30 \
--enable-autorepair \
--addons=HorizontalPodAutoscaling,Istio \
--istio-config=auth=MTLS_PERMISSIVE
```
The above command will create a default node pool consisting of two `n1-highcpu-4` \(vCPU: 4, RAM 3.60GB, DISK: 30GB\) preemptible VMs. Preemptible VMs are up to 80% cheaper than regular instances and are terminated and replaced after a maximum of 24 hours.
Set up credentials for `kubectl`:
```bash
gcloud container clusters get-credentials istio
```
Create a cluster admin role binding:
```bash
kubectl create clusterrolebinding "cluster-admin-$(whoami)" \
--clusterrole=cluster-admin \
--user="$(gcloud config get-value core/account)"
```
Validate your setup with:
```bash
kubectl -n istio-system get svc
```
In a couple of seconds GCP should allocate an external IP to the `istio-ingressgateway` service.
## Cloud DNS setup
You will need an internet domain and access to the registrar to change the name servers to Google Cloud DNS.
Create a managed zone named `istio` in Cloud DNS \(replace `example.com` with your domain\):
```bash
gcloud dns managed-zones create \
--dns-name="example.com." \
--description="Istio zone" "istio"
```
Look up your zone's name servers:
```bash
gcloud dns managed-zones describe istio
```
Update your registrar's name server records with the records returned by the above command.
Wait for the name servers to change \(replace `example.com` with your domain\):
```bash
watch dig +short NS example.com
```
Create a static IP address named `istio-gateway` using the Istio ingress IP:
```bash
export GATEWAY_IP=$(kubectl -n istio-system get svc/istio-ingressgateway -ojson \
| jq -r .status.loadBalancer.ingress[0].ip)
gcloud compute addresses create istio-gateway --addresses ${GATEWAY_IP} --region us-central1
```
Create the following DNS records \(replace `example.com` with your domain\):
```bash
DOMAIN="example.com"
gcloud dns record-sets transaction start --zone=istio
gcloud dns record-sets transaction add --zone=istio \
--name="${DOMAIN}" --ttl=300 --type=A ${GATEWAY_IP}
gcloud dns record-sets transaction add --zone=istio \
--name="www.${DOMAIN}" --ttl=300 --type=A ${GATEWAY_IP}
gcloud dns record-sets transaction add --zone=istio \
--name="*.${DOMAIN}" --ttl=300 --type=A ${GATEWAY_IP}
gcloud dns record-sets transaction execute --zone istio
```
Verify that the wildcard DNS is working \(replace `example.com` with your domain\):
```bash
watch host test.example.com
```
## Install Helm
Install the [Helm](https://docs.helm.sh/using_helm/#installing-helm) command-line tool:
```text
brew install kubernetes-helm
```
Create a service account and a cluster role binding for Tiller:
```bash
kubectl -n kube-system create sa tiller
kubectl create clusterrolebinding tiller-cluster-rule \
--clusterrole=cluster-admin \
--serviceaccount=kube-system:tiller
```
Deploy Tiller in the `kube-system` namespace:
```bash
helm init --service-account tiller
```
You should consider using SSL between Helm and Tiller, for more information on securing your Helm installation see [docs.helm.sh](https://docs.helm.sh/using_helm/#securing-your-helm-installation).
## Install cert-manager
Jetstack's [cert-manager](https://github.com/jetstack/cert-manager) is a Kubernetes operator that automatically creates and manages TLS certs issued by Let's Encrypt.
You'll be using cert-manager to provision a wildcard certificate for the Istio ingress gateway.
Install cert-manager's CRDs:
```bash
CERT_REPO=https://raw.githubusercontent.com/jetstack/cert-manager
kubectl apply -f ${CERT_REPO}/release-0.10/deploy/manifests/00-crds.yaml
```
Create the cert-manager namespace and disable resource validation:
```bash
kubectl create namespace cert-manager
kubectl label namespace cert-manager certmanager.k8s.io/disable-validation=true
```
Install cert-manager with Helm:
```bash
helm repo add jetstack https://charts.jetstack.io && \
helm repo update && \
helm upgrade -i cert-manager \
--namespace cert-manager \
--version v0.10.0 \
jetstack/cert-manager
```
## Istio Gateway TLS setup
![Istio Let's Encrypt](/img/diagrams/istio-cert-manager-gke.png)
Create a generic Istio Gateway to expose services outside the mesh on HTTPS:
```bash
REPO=https://raw.githubusercontent.com/fluxcd/flagger/main
kubectl apply -f ${REPO}/artifacts/gke/istio-gateway.yaml
```
Create a service account with Cloud DNS admin role \(replace `my-gcp-project` with your project ID\):
```bash
GCP_PROJECT=my-gcp-project
gcloud iam service-accounts create dns-admin \
--display-name=dns-admin \
--project=${GCP_PROJECT}
gcloud iam service-accounts keys create ./gcp-dns-admin.json \
--iam-account=dns-admin@${GCP_PROJECT}.iam.gserviceaccount.com \
--project=${GCP_PROJECT}
gcloud projects add-iam-policy-binding ${GCP_PROJECT} \
--member=serviceAccount:dns-admin@${GCP_PROJECT}.iam.gserviceaccount.com \
--role=roles/dns.admin
```
Create a Kubernetes secret with the GCP Cloud DNS admin key:
```bash
kubectl create secret generic cert-manager-credentials \
--from-file=./gcp-dns-admin.json \
--namespace=istio-system
```
Create a Let's Encrypt issuer for Cloud DNS \(replace `email@example.com` with a valid email address and `my-gcp-project` with your project ID\):
```yaml
apiVersion: certmanager.k8s.io/v1alpha1
kind: Issuer
metadata:
name: letsencrypt-prod
namespace: istio-system
spec:
acme:
server: https://acme-v02.api.letsencrypt.org/directory
email: email@example.com
privateKeySecretRef:
name: letsencrypt-prod
dns01:
providers:
- name: cloud-dns
clouddns:
serviceAccountSecretRef:
name: cert-manager-credentials
key: gcp-dns-admin.json
project: my-gcp-project
```
Save the above resource as letsencrypt-issuer.yaml and then apply it:
```text
kubectl apply -f ./letsencrypt-issuer.yaml
```
Create a wildcard certificate \(replace `example.com` with your domain\):
```yaml
apiVersion: certmanager.k8s.io/v1alpha1
kind: Certificate
metadata:
name: istio-gateway
namespace: istio-system
spec:
secretName: istio-ingressgateway-certs
issuerRef:
name: letsencrypt-prod
commonName: "*.example.com"
acme:
config:
- dns01:
provider: cloud-dns
domains:
- "*.example.com"
- "example.com"
```
Save the above resource as istio-gateway-cert.yaml and then apply it:
```text
kubectl apply -f ./istio-gateway-cert.yaml
```
In a couple of seconds cert-manager should fetch a wildcard certificate from letsencrypt.org:
```text
kubectl -n istio-system describe certificate istio-gateway
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal CertIssued 1m52s cert-manager Certificate issued successfully
```
Recreate Istio ingress gateway pods:
```bash
kubectl -n istio-system delete pods -l istio=ingressgateway
```
Note that the Istio gateway doesn't reload the certificates from the TLS secret when cert-manager renews them. Since the GKE cluster is made up of preemptible VMs, the gateway pods will be replaced once every 24 hours. If you're not using preemptible nodes, you need to manually delete the gateway pods every two months, before the certificate expires.
## Install Prometheus
The GKE Istio add-on does not include a Prometheus instance that scrapes the Istio telemetry service. Because Flagger uses the Istio HTTP metrics to run the canary analysis, you have to deploy the following Prometheus configuration, which is similar to the one that comes with the official Istio Helm chart.
Find the GKE Istio version with:
```bash
kubectl -n istio-system get deploy istio-pilot -oyaml | grep image:
```
Install Prometheus in istio-system namespace:
```bash
kubectl -n istio-system apply -f \
https://storage.googleapis.com/gke-release/istio/release/1.0.6-gke.3/patches/install-prometheus.yaml
```
## Install Flagger and Grafana
Add Flagger Helm repository:
```bash
helm repo add flagger https://flagger.app
```
Install Flagger's Canary CRD:
```bash
kubectl apply -f https://raw.githubusercontent.com/fluxcd/flagger/main/artifacts/flagger/crd.yaml
```
Deploy Flagger in the `istio-system` namespace with Slack notifications enabled:
```bash
helm upgrade -i flagger flagger/flagger \
--namespace=istio-system \
--set crd.create=false \
--set metricsServer=http://prometheus.istio-system:9090 \
--set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
--set slack.channel=general \
--set slack.user=flagger
```
Deploy Grafana in the `istio-system` namespace:
```bash
helm upgrade -i flagger-grafana flagger/grafana \
--namespace=istio-system \
--set url=http://prometheus.istio-system:9090 \
--set user=admin \
--set password=replace-me
```
Expose Grafana through the public gateway by creating a virtual service \(replace `example.com` with your domain\):
```yaml
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
name: grafana
namespace: istio-system
spec:
hosts:
- "grafana.example.com"
gateways:
- public-gateway.istio-system.svc.cluster.local
http:
- route:
- destination:
host: flagger-grafana
```
Save the above resource as grafana-virtual-service.yaml and then apply it:
```bash
kubectl apply -f ./grafana-virtual-service.yaml
```
Navigate to `http://grafana.example.com` in your browser and you should be redirected to the HTTPS version.

View File

@ -0,0 +1,260 @@
---
type: docs
weight: 1
title: Flagger Install on Kubernetes
---
This guide walks you through setting up Flagger on a Kubernetes cluster with Helm v3 or Kustomize.
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer.
## Install Flagger with Helm
Add Flagger Helm repository:
```bash
helm repo add flagger https://flagger.app
```
Install Flagger's Canary CRD:
```bash
kubectl apply -f https://raw.githubusercontent.com/fluxcd/flagger/main/artifacts/flagger/crd.yaml
```
Deploy Flagger for Istio:
```bash
helm upgrade -i flagger flagger/flagger \
--namespace=istio-system \
--set crd.create=false \
--set meshProvider=istio \
--set metricsServer=http://prometheus:9090
```
Note that Flagger depends on Istio telemetry and Prometheus. If you're installing
Istio with istioctl, you should use the
[default profile](https://istio.io/docs/setup/additional-setup/config-profiles/).
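For reference, with recent Istio releases a default profile install might look like this (a sketch; it assumes `istioctl` is already on your PATH and pointing at the right cluster):
```bash
# Install Istio with the default profile, which enables the telemetry
# that Flagger's Prometheus queries rely on.
istioctl install --set profile=default
```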
For an Istio multi-cluster shared control plane, you can install Flagger on each remote cluster and set the
Istio control plane host cluster kubeconfig:
```bash
helm upgrade -i flagger flagger/flagger \
--namespace=istio-system \
--set crd.create=false \
--set meshProvider=istio \
--set metricsServer=http://istio-cluster-prometheus:9090 \
--set istio.kubeconfig.secretName=istio-kubeconfig \
--set istio.kubeconfig.key=kubeconfig
```
Note that the Istio kubeconfig must be stored in a Kubernetes secret with a data key named `kubeconfig`.
For more details on how to configure Istio multi-cluster
credentials read the [Istio docs](https://istio.io/docs/setup/install/multicluster/shared-vpn/#credentials).
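Such a secret can be created from a local kubeconfig file, for example (a sketch; `./host-cluster.kubeconfig` is a placeholder for the kubeconfig of the Istio control plane host cluster):
```bash
# Store the host cluster kubeconfig under the data key "kubeconfig",
# matching the istio.kubeconfig.secretName/key values used above.
kubectl -n istio-system create secret generic istio-kubeconfig \
  --from-file=kubeconfig=./host-cluster.kubeconfig
```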
Deploy Flagger for Linkerd:
```bash
helm upgrade -i flagger flagger/flagger \
--namespace=linkerd \
--set crd.create=false \
--set meshProvider=linkerd \
--set metricsServer=http://linkerd-prometheus:9090
```
Deploy Flagger for App Mesh:
```bash
helm upgrade -i flagger flagger/flagger \
--namespace=appmesh-system \
--set crd.create=false \
--set meshProvider=appmesh \
--set metricsServer=http://appmesh-prometheus:9090
```
Deploy Flagger for **Open Service Mesh (OSM)** (requires OSM to have been installed with Prometheus):
```console
$ helm upgrade -i flagger flagger/flagger \
--namespace=osm-system \
--set crd.create=false \
--set meshProvider=osm \
--set metricsServer=http://osm-prometheus.osm-system.svc:7070
```
You can install Flagger in any namespace as long as it can talk to the Prometheus service on port 9090.
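For example, a sketch of installing Flagger in its own namespace against an existing Prometheus (the `flagger` namespace and the Prometheus URL are assumptions; adjust them to your setup):
```bash
kubectl create ns flagger
helm upgrade -i flagger flagger/flagger \
  --namespace=flagger \
  --set crd.create=false \
  --set meshProvider=istio \
  --set metricsServer=http://prometheus.monitoring:9090
```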
For ingress controllers, the install instructions are:
* [Contour](/flagger/tutorials/contour-progressive-delivery)
* [Gloo](/flagger/tutorials/gloo-progressive-delivery)
* [NGINX](/flagger/tutorials/nginx-progressive-delivery)
* [Skipper](/flagger/tutorials/skipper-progressive-delivery)
* [Traefik](/flagger/tutorials/traefik-progressive-delivery)
Alternatively, you can use the `helm template` command and apply the generated YAML with kubectl:
```bash
# generate
helm fetch --untar --untardir . flagger/flagger &&
helm template flagger ./flagger \
--namespace=istio-system \
--set metricsServer=http://prometheus.istio-system:9090 \
> flagger.yaml
# apply
kubectl apply -f flagger.yaml
```
To uninstall the Flagger release with Helm run:
```text
helm delete flagger
```
The command removes all the Kubernetes components associated with the chart and deletes the release.
> **Note** that on uninstall the Canary CRD will not be removed. Deleting the CRD will make Kubernetes
> remove all the objects owned by Flagger like Istio virtual services, Kubernetes deployments and ClusterIP services.
If you want to remove all the objects created by Flagger, you have to delete the Canary CRD with kubectl:
```text
kubectl delete crd canaries.flagger.app
```
## Install Grafana with Helm
Flagger comes with a Grafana dashboard made for monitoring the canary analysis.
Deploy Grafana in the _**istio-system**_ namespace:
```bash
helm upgrade -i flagger-grafana flagger/grafana \
--namespace=istio-system \
--set url=http://prometheus.istio-system:9090 \
--set user=admin \
--set password=change-me
```
Or use the `helm template` command and apply the generated YAML with kubectl:
```bash
# generate
helm fetch --untar --untardir . flagger/grafana &&
helm template flagger-grafana ./grafana \
--namespace=istio-system \
> flagger-grafana.yaml
# apply
kubectl apply -f flagger-grafana.yaml
```
You can access Grafana using port forwarding:
```bash
kubectl -n istio-system port-forward svc/flagger-grafana 3000:80
```
## Install Flagger with Kustomize
As an alternative to Helm, Flagger can be installed with Kustomize **3.5.0** or newer.
**Service mesh specific installers**
Install Flagger for Istio:
```bash
kustomize build https://github.com/fluxcd/flagger/kustomize/istio?ref=main | kubectl apply -f -
```
Install Flagger for AWS App Mesh:
```bash
kustomize build https://github.com/fluxcd/flagger/kustomize/appmesh?ref=main | kubectl apply -f -
```
This deploys Flagger and sets the metrics server URL to App Mesh's Prometheus instance.
Install Flagger for Linkerd:
```bash
kustomize build https://github.com/fluxcd/flagger/kustomize/linkerd?ref=main | kubectl apply -f -
```
This deploys Flagger in the `linkerd` namespace and sets the metrics server URL to Linkerd's Prometheus instance.
Install Flagger for Open Service Mesh:
```bash
kustomize build https://github.com/fluxcd/flagger/kustomize/osm?ref=main | kubectl apply -f -
```
This deploys Flagger in the `osm-system` namespace and sets the metrics server URL to OSM's Prometheus instance.
If you want to install a specific Flagger release, add the version number to the URL:
```bash
kustomize build https://github.com/fluxcd/flagger/kustomize/linkerd?ref=v1.0.0 | kubectl apply -f -
```
**Generic installer**
Install Flagger and Prometheus for Contour, Gloo, NGINX, Skipper, or Traefik ingress:
```bash
kustomize build https://github.com/fluxcd/flagger/kustomize/kubernetes?ref=main | kubectl apply -f -
```
This deploys Flagger and Prometheus in the `flagger-system` namespace,
sets the metrics server URL to `http://flagger-prometheus.flagger-system:9090` and the mesh provider to `kubernetes`.
The Prometheus instance has a two-hour data retention and is configured to scrape all pods in your cluster
that have the `prometheus.io/scrape: "true"` annotation.
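For example, a pod template only needs the scrape annotation to be picked up by this Prometheus instance (a minimal sketch):
```yaml
# Pod template metadata for a workload that should be scraped (sketch)
metadata:
  annotations:
    prometheus.io/scrape: "true"
```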
To target a different provider you can specify it in the canary custom resource:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: app
namespace: test
spec:
# can be: kubernetes, istio, linkerd, appmesh, nginx, skipper, gloo, traefik, osm
# use the kubernetes provider for Blue/Green style deployments
provider: nginx
```
**Customized installer**
Create a kustomization file using Flagger as base and patch the container args:
```bash
cat > kustomization.yaml <<EOF
namespace: istio-system
bases:
- https://github.com/fluxcd/flagger/kustomize/kubernetes?ref=main
patches:
- target:
kind: Deployment
name: flagger
patch: |-
apiVersion: apps/v1
kind: Deployment
metadata:
name: flagger
spec:
template:
spec:
containers:
- name: flagger
args:
- -mesh-provider=istio
- -metrics-server=http://prometheus.istio-system:9090
- -include-label-prefix=app.kubernetes.io
EOF
```

View File

@ -0,0 +1,5 @@
---
type: docs
weight: 4
title: Tutorials
---

View File

@ -0,0 +1,438 @@
---
type: docs
weight: 4
title: App Mesh Canary Deployments
---
This guide shows you how to use App Mesh and Flagger to automate canary deployments.
You'll need an EKS cluster (Kubernetes >= 1.16) configured with App Mesh;
you can find the installation guide [here](/flagger/install/flagger-install-on-eks-appmesh).
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
then creates a series of objects (Kubernetes deployments, ClusterIP services,
App Mesh virtual nodes and services).
These objects expose the application on the mesh and drive the canary analysis and promotion.
The only App Mesh object you need to create by yourself is the mesh resource.
Create a mesh called `global`:
```bash
cat << EOF | kubectl apply -f -
apiVersion: appmesh.k8s.aws/v1beta2
kind: Mesh
metadata:
name: global
spec:
namespaceSelector:
matchLabels:
appmesh.k8s.aws/sidecarInjectorWebhook: enabled
EOF
```
Create a test namespace with App Mesh sidecar injection enabled:
```bash
cat << EOF | kubectl apply -f -
apiVersion: v1
kind: Namespace
metadata:
name: test
labels:
appmesh.k8s.aws/sidecarInjectorWebhook: enabled
EOF
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the canary analysis:
```bash
helm upgrade -i flagger-loadtester flagger/loadtester \
--namespace=test \
--set appmesh.enabled=true \
--set "appmesh.backends[0]=podinfo" \
--set "appmesh.backends[1]=podinfo-canary"
```
Create a canary definition:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
annotations:
# Enable Envoy access logging to stdout.
appmesh.flagger.app/accesslog: enabled
name: podinfo
namespace: test
spec:
# App Mesh API reference
provider: appmesh:v1beta2
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
# container port
port: 9898
# App Mesh ingress timeout (optional)
timeout: 15s
# App Mesh retry policy (optional)
retries:
attempts: 3
perTryTimeout: 5s
retryOn: "gateway-error,client-error,stream-error"
# App Mesh URI settings
match:
- uri:
prefix: /
rewrite:
uri: /
# define the canary analysis timing and KPIs
analysis:
# schedule interval (default 60s)
interval: 1m
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
# App Mesh Prometheus checks
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
- name: request-duration
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
interval: 30s
# testing (optional)
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary.test:9898/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
```
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
canary.flagger.app/podinfo
# generated Kubernetes objects
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
# generated App Mesh objects
virtualnode.appmesh.k8s.aws/podinfo-canary
virtualnode.appmesh.k8s.aws/podinfo-primary
virtualrouter.appmesh.k8s.aws/podinfo
virtualrouter.appmesh.k8s.aws/podinfo-canary
virtualservice.appmesh.k8s.aws/podinfo
virtualservice.appmesh.k8s.aws/podinfo-canary
```
After the bootstrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test`
will be routed to the primary pods.
During the canary analysis, the `podinfo-canary.test` address can be used to target the canary pods directly.
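For example, once the analysis is running you can hit the canary address from inside the mesh (a sketch using the load tester pod deployed earlier):
```bash
# Call the canary service directly from the load tester pod
kubectl -n test exec -it deploy/flagger-loadtester -- \
  curl -s http://podinfo-canary.test:9898/
```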
App Mesh blocks all egress traffic by default.
If your application needs to call another service, you have to create an App Mesh virtual service for it
and add the virtual service name to the backend list.
```yaml
service:
port: 9898
backends:
- backend1
- arn:aws:appmesh:eu-west-1:12345678910:mesh/my-mesh/virtualService/backend2
```
## Setup App Mesh Gateway (optional)
In order to expose the podinfo app outside the mesh you can use the App Mesh Gateway.
Deploy the App Mesh Gateway behind an AWS NLB:
```bash
helm upgrade -i appmesh-gateway eks/appmesh-gateway \
--namespace test
```
Find the gateway public address:
```bash
export URL="http://$(kubectl -n test get svc/appmesh-gateway -ojson | jq -r ".status.loadBalancer.ingress[].hostname")"
echo $URL
```
Wait for the NLB to become active:
```bash
watch curl -sS $URL
```
Create a gateway route that points to the podinfo virtual service:
```yaml
cat << EOF | kubectl apply -f -
apiVersion: appmesh.k8s.aws/v1beta2
kind: GatewayRoute
metadata:
name: podinfo
namespace: test
spec:
httpRoute:
match:
prefix: "/"
action:
target:
virtualService:
virtualServiceRef:
name: podinfo
EOF
```
Open your browser and navigate to the ingress address to access podinfo UI.
## Automated canary promotion
A canary deployment is triggered by changes in any of the following objects:
* Deployment PodSpec (container image, command, ports, env, resources, etc)
* ConfigMaps and Secrets mounted as volumes or mapped to environment variables
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
New revision detected! Scaling up podinfo.test
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Advance podinfo.test canary weight 25
Advance podinfo.test canary weight 30
Advance podinfo.test canary weight 35
Advance podinfo.test canary weight 40
Advance podinfo.test canary weight 45
Advance podinfo.test canary weight 50
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Routing all traffic to primary
Promotion completed! Scaling down podinfo.test
```
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
During the analysis, the canary's progress can be monitored with Grafana.
The App Mesh dashboard URL is
[http://localhost:3000/d/flagger-appmesh/appmesh-canary?refresh=10s&orgId=1&var-namespace=test&var-primary=podinfo-primary&var-canary=podinfo](http://localhost:3000/d/flagger-appmesh/appmesh-canary?refresh=10s&orgId=1&var-namespace=test&var-primary=podinfo-primary&var-canary=podinfo).
![App Mesh Canary Dashboard](/img/screens/flagger-grafana-appmesh.png)
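The `localhost:3000` address assumes Grafana is reachable locally. If you installed the App Mesh Grafana from the install guide, you can port forward it first:
```bash
kubectl -n appmesh-system port-forward svc/appmesh-grafana 3000:3000
```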
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT
test podinfo Progressing 15
prod frontend Succeeded 0
prod backend Failed 0
```
If you've enabled the Slack notifications, you should receive the following messages:
![Flagger Slack Notifications](/img/screens/slack-canary-notifications.png)
## Automated rollback
During the canary analysis you can generate HTTP 500 errors or high latency to test if Flagger pauses the rollout.
Trigger a canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it deploy/flagger-loadtester bash
```
Generate HTTP 500 errors:
```bash
hey -z 1m -c 5 -q 5 http://podinfo-canary.test:9898/status/500
```
Generate latency:
```bash
watch -n 1 curl http://podinfo-canary.test:9898/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary,
the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n appmesh-system logs deploy/flagger -f | jq .msg
New revision detected! progressing canary analysis for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement success rate 69.17% < 99%
Halt podinfo.test advancement success rate 61.39% < 99%
Halt podinfo.test advancement success rate 55.06% < 99%
Halt podinfo.test advancement request duration 1.20s > 0.5s
Halt podinfo.test advancement request duration 1.45s > 0.5s
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
If you've enabled the Slack notifications, you'll receive a message if the progress deadline is exceeded,
or if the analysis reaches the maximum number of failed checks:
![Flagger Slack Notifications](/img/screens/slack-canary-failed.png)
## A/B Testing
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions.
In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users.
This is particularly useful for frontend applications that require session affinity.
![Flagger A/B Testing Stages](/img/diagrams/flagger-abtest-steps.png)
Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
```yaml
analysis:
interval: 1m
threshold: 5
iterations: 10
match:
- headers:
x-canary:
exact: "insider"
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 1m -q 10 -c 2 -H 'X-Canary: insider' http://podinfo.test:9898/"
```
The above configuration will run an analysis for ten minutes targeting users that have an `X-Canary: insider` header.
You can also use an HTTP cookie. To target all users with a `canary` cookie set to `insider`, the match condition should be:
```yaml
match:
- headers:
cookie:
regex: "^(.*?;)?(canary=insider)(;.*)?$"
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 1m -q 10 -c 2 -H 'Cookie: canary=insider' http://podinfo.test:9898/"
```
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
```
Flagger detects that the deployment revision changed and starts the A/B test:
```text
kubectl -n appmesh-system logs deploy/flagger -f | jq .msg
New revision detected! progressing canary analysis for podinfo.test
Advance podinfo.test canary iteration 1/10
Advance podinfo.test canary iteration 2/10
Advance podinfo.test canary iteration 3/10
Advance podinfo.test canary iteration 4/10
Advance podinfo.test canary iteration 5/10
Advance podinfo.test canary iteration 6/10
Advance podinfo.test canary iteration 7/10
Advance podinfo.test canary iteration 8/10
Advance podinfo.test canary iteration 9/10
Advance podinfo.test canary iteration 10/10
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Routing all traffic to primary
Promotion completed! Scaling down podinfo.test
```
The above procedure can be extended with
[custom metrics](../usage/metrics.md) checks,
[webhooks](../usage/webhooks.md),
[manual promotion](../usage/webhooks.md#manual-gating) approval and
[Slack or MS Teams](../usage/alerting.md) notifications.

View File

@ -0,0 +1,431 @@
---
type: docs
weight: 5
title: Contour Canary Deployments
---
This guide shows you how to use [Contour](https://projectcontour.io/) ingress controller and Flagger to automate canary releases and A/B testing.
![Flagger Contour Overview](/img/diagrams/flagger-contour-overview.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Contour **v1.0** or newer.
Install Contour on a cluster with LoadBalancer support:
```bash
kubectl apply -f https://projectcontour.io/quickstart/contour.yaml
```
The above command will deploy Contour and an Envoy daemonset in the `projectcontour` namespace.
Install Flagger using Kustomize (kubectl >= 1.14) in the `projectcontour` namespace:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/contour?ref=main
```
The above command will deploy Flagger and Prometheus configured to scrape the Contour's Envoy instances.
Or you can install Flagger using Helm v3:
```bash
helm repo add flagger https://flagger.app
helm upgrade -i flagger flagger/flagger \
--namespace projectcontour \
--set meshProvider=contour \
--set ingressClass=contour \
--set prometheus.install=true
```
You can also enable Slack, Discord, Rocket or MS Teams notifications; see the alerting [docs](../usage/alerting.md).
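For example, Slack alerts can be enabled with a few extra Helm values (a sketch reusing the Slack settings shown in the GKE guide; replace the webhook URL and channel with your own):
```bash
helm upgrade -i flagger flagger/flagger \
  --namespace projectcontour \
  --reuse-values \
  --set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
  --set slack.channel=general \
  --set slack.user=flagger
```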
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services and Contour HTTPProxy\). These objects expose the application in the cluster and drive the canary analysis and promotion.
Create a test namespace:
```bash
kubectl create ns test
```
Install the load testing service to generate traffic during the canary analysis:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Create a canary custom resource \(replace `app.example.com` with your own domain\):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# HPA reference
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
# service port
port: 80
# container port
targetPort: 9898
# Contour request timeout
timeout: 15s
# Contour retry policy
retries:
attempts: 3
perTryTimeout: 5s
# define the canary analysis timing and KPIs
analysis:
# schedule interval (default 60s)
interval: 30s
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
# Contour Prometheus checks
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
- name: request-duration
# maximum req duration P99 in milliseconds
thresholdRange:
max: 500
interval: 30s
# testing
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary.test/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
type: rollout
timeout: 5s
metadata:
cmd: "hey -z 1m -q 10 -c 2 -host app.example.com http://envoy.projectcontour"
```
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute.
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
httpproxy.projectcontour.io/podinfo
```
After the bootstrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target the canary pods directly.
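To see how Flagger shifts traffic during the analysis, you can inspect the generated HTTPProxy and watch its weighted services change (a sketch):
```bash
kubectl -n test get httpproxy podinfo -oyaml
```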
## Expose the app outside the cluster
Find the external address of Contour's Envoy load balancer:
```bash
export ADDRESS="$(kubectl -n projectcontour get svc/envoy -ojson \
| jq -r ".status.loadBalancer.ingress[].hostname")"
echo $ADDRESS
```
Configure your DNS server with a CNAME record \(AWS\) or A record \(GKE/AKS/DOKS\) and point a domain e.g. `app.example.com` to the LB address.
Create a HTTPProxy definition and include the podinfo proxy generated by Flagger \(replace `app.example.com` with your own domain\):
```yaml
apiVersion: projectcontour.io/v1
kind: HTTPProxy
metadata:
name: podinfo-ingress
namespace: test
spec:
virtualhost:
fqdn: app.example.com
includes:
- name: podinfo
namespace: test
conditions:
- prefix: /
```
Save the above resource as podinfo-ingress.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-ingress.yaml
```
Verify that Contour processed the proxy definition with:
```bash
kubectl -n test get httpproxies
NAME FQDN STATUS
podinfo valid
podinfo-ingress app.example.com valid
```
Now you can access podinfo UI using your domain address.
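If DNS hasn't propagated yet, you can test the routing with a Host header against the load balancer address (a sketch using the `ADDRESS` variable exported above; replace `app.example.com` with your domain):
```bash
curl -sS -H 'Host: app.example.com' "http://${ADDRESS}"
```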
Note that you should be using HTTPS when exposing production workloads on the internet. You can obtain free TLS certs from Let's Encrypt; read this [guide](https://github.com/stefanprodan/eks-contour-ingress) on how to configure cert-manager to secure Contour with TLS certificates.
## Automated canary promotion
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
A canary deployment is triggered by changes in any of the following objects:
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
* ConfigMaps and Secrets mounted as volumes or mapped to environment variables
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
New revision detected! Scaling up podinfo.test
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Advance podinfo.test canary weight 25
Advance podinfo.test canary weight 30
Advance podinfo.test canary weight 35
Advance podinfo.test canary weight 40
Advance podinfo.test canary weight 45
Advance podinfo.test canary weight 50
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Routing all traffic to primary
Promotion completed! Scaling down podinfo.test
```
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2019-12-20T14:05:07Z
```
If you've enabled the Slack notifications, you should receive the following messages:
![Flagger Slack Notifications](/img/screens/slack-canary-notifications.png)
## Automated rollback
During the canary analysis you can generate HTTP 500 errors or high latency to test if Flagger pauses the rollout.
Trigger a canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it deploy/flagger-loadtester bash
```
Generate HTTP 500 errors:
```bash
hey -z 1m -c 5 -q 5 http://app.example.com/status/500
```
Generate latency:
```bash
watch -n 1 curl http://app.example.com/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n projectcontour logs deploy/flagger -f | jq .msg
New revision detected! progressing canary analysis for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement success rate 69.17% < 99%
Halt podinfo.test advancement success rate 61.39% < 99%
Halt podinfo.test advancement success rate 55.06% < 99%
Halt podinfo.test advancement request duration 1.20s > 500ms
Halt podinfo.test advancement request duration 1.45s > 500ms
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
If you've enabled the Slack notifications, you'll receive a message if the progress deadline is exceeded, or if the analysis reaches the maximum number of failed checks:
![Flagger Slack Notifications](/img/screens/slack-canary-failed.png)
## A/B Testing
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity.
![Flagger A/B Testing Stages](/img/diagrams/flagger-abtest-steps.png)
Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
```yaml
analysis:
interval: 1m
threshold: 5
iterations: 10
match:
- headers:
x-canary:
exact: "insider"
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 1m -q 5 -c 5 -H 'X-Canary: insider' -host app.example.com http://envoy.projectcontour"
```
The above configuration will run an analysis for ten minutes targeting users that have an `X-Canary: insider` header.
You can also use an HTTP cookie. To target all users with a cookie set to `insider`, the match condition should be:
```yaml
match:
- headers:
cookie:
suffix: "insider"
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 1m -q 5 -c 5 -H 'Cookie: canary=insider' -host app.example.com http://envoy.projectcontour"
```
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
```
Flagger detects that the deployment revision changed and starts the A/B test:
```text
kubectl -n projectcontour logs deploy/flagger -f | jq .msg
New revision detected! Progressing canary analysis for podinfo.test
Advance podinfo.test canary iteration 1/10
Advance podinfo.test canary iteration 2/10
Advance podinfo.test canary iteration 3/10
Advance podinfo.test canary iteration 4/10
Advance podinfo.test canary iteration 5/10
Advance podinfo.test canary iteration 6/10
Advance podinfo.test canary iteration 7/10
Advance podinfo.test canary iteration 8/10
Advance podinfo.test canary iteration 9/10
Advance podinfo.test canary iteration 10/10
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Routing all traffic to primary
Promotion completed! Scaling down podinfo.test
```
The web browser user agent header allows user segmentation based on device or OS.
For example, if you want to route all mobile users to the canary instance:
```yaml
match:
- headers:
user-agent:
prefix: "Mobile"
```
Or if you want to target only Android users:
```yaml
match:
- headers:
user-agent:
prefix: "Android"
```
Or a specific browser version:
```yaml
match:
- headers:
user-agent:
suffix: "Firefox/71.0"
```
For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md).

View File

@ -0,0 +1,488 @@
---
weight: 11
title: Gateway API Canary Deployments
type: docs
---
This guide shows you how to use [Gateway API](https://gateway-api.sigs.k8s.io/) and Flagger to automate canary deployments and A/B testing.
![Flagger Canary Stages](/img/diagrams/flagger-gatewayapi-canary.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and any mesh/ingress that implements `v1alpha2` of the Gateway API. We'll be using Contour for the sake of this tutorial, but you can use any other implementation.
Install the GatewayAPI CRDs:
```bash
kubectl apply -k github.com/kubernetes-sigs/gateway-api/config/crd?ref=v0.4.1
```
Install a cluster-wide GatewayClass; a Gateway belonging to the GatewayClass and Contour components in the `projectcontour` namespace:
```bash
kubectl apply -f https://raw.githubusercontent.com/projectcontour/contour/release-1.20/examples/render/contour-gateway.yaml
```
Install Flagger in the `flagger-system` namespace:
```bash
kubectl apply -k github.com/fluxcd/flagger//kustomize/gatewayapi
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services, HTTPRoutes for the Gateway\). These objects expose the application inside the mesh and drive the canary analysis and promotion.
Create a test namespace:
```bash
kubectl create ns test
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the canary analysis:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create metric templates targeting the Prometheus server in the `flagger-system` namespace. The PromQL queries below are meant for `Envoy`, but you can [change them to match your ingress/mesh provider](/flagger/faq#metrics) accordingly.
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: latency
namespace: flagger-system
spec:
provider:
type: prometheus
address: http://flagger-prometheus:9090
query: |
histogram_quantile(0.99,
sum(
rate(
envoy_cluster_upstream_rq_time_bucket{
envoy_cluster_name=~"{{ namespace }}_{{ target }}-canary_[0-9a-zA-Z-]+",
}[{{ interval }}]
)
) by (le)
)/1000
---
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: error-rate
namespace: flagger-system
spec:
provider:
type: prometheus
address: http://flagger-prometheus:9090
query: |
100 - sum(
rate(
envoy_cluster_upstream_rq{
envoy_cluster_name=~"{{ namespace }}_{{ target }}-canary_[0-9a-zA-Z-]+",
envoy_response_code!~"5.*"
}[{{ interval }}]
)
)
/
sum(
rate(
envoy_cluster_upstream_rq{
envoy_cluster_name=~"{{ namespace }}_{{ target }}-canary_[0-9a-zA-Z-]+",
}[{{ interval }}]
)
)
* 100
```
Save the above resource as metric-templates.yaml and then apply it:
```bash
kubectl apply -f metric-templates.yaml
```
Create a canary custom resource \(replace "loaclproject.contour.io" with your own domain\):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
# service port number
port: 9898
# container port number or name (optional)
targetPort: 9898
# Gateway API HTTPRoute host names
hosts:
- localproject.contour.io
# Reference to the Gateway that the generated HTTPRoute would attach to.
gatewayRefs:
- name: contour
namespace: projectcontour
analysis:
# schedule interval (default 60s)
interval: 1m
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 10
metrics:
- name: error-rate
# max error rate (5xx responses)
# percentage (0-100)
templateRef:
name: error-rate
namespace: flagger-system
thresholdRange:
max: 1
interval: 1m
- name: latency
templateRef:
name: latency
namespace: flagger-system
# seconds
thresholdRange:
max: 0.5
interval: 30s
# testing (optional)
webhooks:
- name: smoke-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 15s
metadata:
type: bash
cmd: "curl -sd 'anon' http://podinfo-canary.test:9898/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
cmd: "hey -z 2m -q 10 -c 2 -host localproject.contour.io http://envoy.projectcontour/"
```
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every minute.
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
httproutes.gateway.networking.k8s.io/podinfo
```
## Expose the app outside the cluster
Find the external address of Contour's Envoy load balancer:
```bash
export ADDRESS="$(kubectl -n projectcontour get svc/envoy -ojson \
| jq -r ".status.loadBalancer.ingress[].hostname")"
echo $ADDRESS
```
Configure your DNS server with a CNAME record \(AWS\) or A record \(GKE/AKS/DOKS\) and point a domain e.g. `localproject.contour.io` to the LB address.
Now you can access the podinfo UI using your domain address.
Note that you should be using HTTPS when exposing production workloads on the internet. You can obtain free TLS certs from Let's Encrypt; read this [guide](https://github.com/stefanprodan/eks-contour-ingress) on how to configure cert-manager to secure Contour with TLS certificates.
If you're using a local cluster via kind/k3s you can port forward the Envoy LoadBalancer service:
```bash
kubectl port-forward -n projectcontour svc/envoy 8080:80
```
Now you can access podinfo via `curl -H "Host: localproject.contour.io" localhost:8080`.
## Automated canary promotion
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger New revision detected podinfo.test
Normal Synced 3m flagger Scaling up podinfo.test
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Normal Synced 3m flagger Advance podinfo.test canary weight 5
Normal Synced 3m flagger Advance podinfo.test canary weight 10
Normal Synced 3m flagger Advance podinfo.test canary weight 15
Normal Synced 2m flagger Advance podinfo.test canary weight 20
Normal Synced 2m flagger Advance podinfo.test canary weight 25
Normal Synced 1m flagger Advance podinfo.test canary weight 30
Normal Synced 1m flagger Advance podinfo.test canary weight 35
Normal Synced 55s flagger Advance podinfo.test canary weight 40
Normal Synced 45s flagger Advance podinfo.test canary weight 45
Normal Synced 35s flagger Advance podinfo.test canary weight 50
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
A canary deployment is triggered by changes in any of the following objects:
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
* ConfigMaps mounted as volumes or mapped to environment variables
* Secrets mounted as volumes or mapped to environment variables
You can monitor how Flagger progressively changes the weights of the HTTPRoute object that is attached to the Gateway with:
```bash
watch kubectl get httproute -n test podinfo -o=jsonpath='{.spec.rules}'
```
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2022-01-16T14:05:07Z
prod frontend Succeeded 0 2022-01-15T16:15:07Z
prod backend Failed 0 2022-01-14T17:05:07Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses the rollout.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:6.0.2
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it flagger-loadtester-xx-xx sh
```
Generate HTTP 500 errors:
```bash
watch curl http://podinfo-canary:9898/status/500
```
Generate latency:
```bash
watch curl http://podinfo-canary:9898/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 10
Phase: Failed
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger Starting canary deployment for podinfo.test
Normal Synced 3m flagger Advance podinfo.test canary weight 5
Normal Synced 3m flagger Advance podinfo.test canary weight 10
Normal Synced 3m flagger Advance podinfo.test canary weight 15
Normal Synced 3m flagger Halt podinfo.test advancement error rate 69.17% > 1%
Normal Synced 2m flagger Halt podinfo.test advancement error rate 61.39% > 1%
Normal Synced 2m flagger Halt podinfo.test advancement error rate 55.06% > 1%
Normal Synced 2m flagger Halt podinfo.test advancement error rate 47.00% > 1%
Normal Synced 2m flagger (combined from similar events): Halt podinfo.test advancement error rate 38.08% > 1%
Warning Synced 1m flagger Rolling back podinfo.test failed checks threshold reached 10
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
```
## A/B Testing
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity.
![Flagger A/B Testing Stages](/img/diagrams/flagger-abtest-steps.png)
Create a canary custom resource \(replace "loaclproject.contour.io" with your own domain\):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
# service port number
port: 9898
# container port number or name (optional)
targetPort: 9898
# Gateway API HTTPRoute host names
hosts:
- localproject.contour.io
# Reference to the Gateway that the generated HTTPRoute would attach to.
gatewayRefs:
- name: contour
namespace: projectcontour
analysis:
# schedule interval (default 60s)
interval: 1m
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 10
metrics:
- name: error-rate
# max error rate (5xx responses)
# percentage (0-100)
templateRef:
name: error-rate
namespace: flagger-system
thresholdRange:
max: 1
interval: 1m
- name: latency
templateRef:
name: latency
namespace: flagger-system
# seconds
thresholdRange:
max: 0.5
interval: 30s
# testing (optional)
webhooks:
- name: smoke-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 15s
metadata:
type: bash
cmd: "curl -sd 'anon' http://podinfo-canary.test:9898/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
cmd: "hey -z 2m -q 10 -c 2 -host localproject.contour.io -H 'X-Canary: insider' http://envoy.projectcontour/"
```
The above configuration will run an analysis for ten minutes targeting those users that have an insider cookie.
Save the above resource as podinfo-ab-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-ab-canary.yaml
```
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:6.0.3
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Failed Checks: 0
Phase: Succeeded
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger New revision detected podinfo.test
Normal Synced 3m flagger Scaling up podinfo.test
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
Normal Synced 2m flagger Advance podinfo.test canary iteration 4/10
Normal Synced 2m flagger Advance podinfo.test canary iteration 5/10
Normal Synced 1m flagger Advance podinfo.test canary iteration 6/10
Normal Synced 1m flagger Advance podinfo.test canary iteration 7/10
Normal Synced 55s flagger Advance podinfo.test canary iteration 8/10
Normal Synced 45s flagger Advance podinfo.test canary iteration 9/10
Normal Synced 35s flagger Advance podinfo.test canary iteration 10/10
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
```
The above procedures can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.

@ -0,0 +1,488 @@
---
weight: 5
title: Gloo Canary Deployments
type: docs
---
This guide shows you how to use the [Gloo Edge](https://gloo.solo.io/) ingress controller
and Flagger to automate canary releases and A/B testing.
![Flagger Gloo Ingress Controller](/img/diagrams/flagger-gloo-overview.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Gloo Edge ingress **1.6.0** or newer.
This guide was written for Flagger version **1.6.0** or higher. Prior versions of Flagger
used Gloo `UpstreamGroup`s to handle canaries, but newer versions of Flagger use Gloo
`RouteTable`s to handle canaries as well as A/B testing.
Install Gloo with Helm v3:
```bash
helm repo add gloo https://storage.googleapis.com/solo-public-helm
kubectl create ns gloo-system
helm upgrade -i gloo gloo/gloo \
--namespace gloo-system
```
Install Flagger and the Prometheus add-on in the same namespace as Gloo:
```bash
helm repo add flagger https://flagger.app
helm upgrade -i flagger flagger/flagger \
--namespace gloo-system \
--set prometheus.install=true \
--set meshProvider=gloo
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
then creates a series of objects (Kubernetes deployments, ClusterIP services, Gloo route tables and upstreams).
These objects expose the application outside the cluster and drive the canary analysis and promotion.
Create a test namespace:
```bash
kubectl create ns test
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl -n test apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the canary analysis:
```bash
kubectl -n test apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create a virtual service definition that references a route table that will be generated by Flagger
(replace `app.example.com` with your own domain):
```yaml
apiVersion: gateway.solo.io/v1
kind: VirtualService
metadata:
name: podinfo
namespace: test
spec:
virtualHost:
domains:
- 'app.example.com'
routes:
- matchers:
- prefix: /
delegateAction:
ref:
name: podinfo
namespace: test
```
Save the above resource as podinfo-virtualservice.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-virtualservice.yaml
```
Create a canary custom resource (replace `app.example.com` with your own domain):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# upstreamRef (optional)
# defines an upstream to copy the spec from when flagger generates new upstreams.
# necessary to copy over TLS config, circuit breakers, etc. (anything nonstandard)
# upstreamRef:
# apiVersion: gloo.solo.io/v1
# kind: Upstream
# name: podinfo-upstream
# namespace: gloo-system
provider: gloo
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
# ClusterIP port number
port: 9898
# container port number or name (optional)
targetPort: 9898
analysis:
# schedule interval (default 60s)
interval: 10s
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
# Gloo Prometheus checks
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
- name: request-duration
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
interval: 30s
# testing (optional)
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 10s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "hey -z 2m -q 5 -c 2 -host app.example.com http://gateway-proxy.gloo-system"
```
*Note: when using upstreamRef the following fields are copied over from the original upstream: `Labels, SslConfig, CircuitBreakers, ConnectionConfig, UseHttp2, InitialStreamWindowSize`*
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
virtualservices.gateway.solo.io/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
routetables.gateway.solo.io/podinfo
upstreams.gloo.solo.io/test-podinfo-canaryupstream-9898
upstreams.gloo.solo.io/test-podinfo-primaryupstream-9898
```
When the bootstrap finishes Flagger will set the canary status to initialized:
```bash
kubectl -n test get canary podinfo
NAME STATUS WEIGHT LASTTRANSITIONTIME
podinfo Initialized 0 2019-05-17T08:09:51Z
```
## Automated canary promotion
Flagger implements a control loop that gradually shifts traffic to the canary while measuring
key performance indicators like HTTP requests success rate, requests average duration and pod health.
Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
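Publishing the analysis result to Slack requires the Flagger release to be configured with a Slack webhook; one way is to extend the Helm install from the prerequisites (the webhook URL, channel and user below are placeholders):
```bash
helm upgrade -i flagger flagger/flagger \
  --reuse-values \
  --namespace gloo-system \
  --set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
  --set slack.channel=general \
  --set slack.user=flagger
```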
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger New revision detected podinfo.test
Normal Synced 3m flagger Scaling up podinfo.test
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Normal Synced 3m flagger Advance podinfo.test canary weight 5
Normal Synced 3m flagger Advance podinfo.test canary weight 10
Normal Synced 3m flagger Advance podinfo.test canary weight 15
Normal Synced 2m flagger Advance podinfo.test canary weight 20
Normal Synced 2m flagger Advance podinfo.test canary weight 25
Normal Synced 1m flagger Advance podinfo.test canary weight 30
Normal Synced 1m flagger Advance podinfo.test canary weight 35
Normal Synced 55s flagger Advance podinfo.test canary weight 40
Normal Synced 45s flagger Advance podinfo.test canary weight 45
Normal Synced 35s flagger Advance podinfo.test canary weight 50
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2019-05-17T14:05:07Z
prod frontend Succeeded 0 2019-05-17T16:15:07Z
prod backend Failed 0 2019-05-17T17:05:07Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors and high latency to test if
Flagger pauses and rolls back the faulted version.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
```
Generate HTTP 500 errors:
```bash
watch curl -H 'Host: app.example.com' http://gateway-proxy.gloo-system/status/500
```
Generate high latency:
```bash
watch curl -H 'Host: app.example.com' http://gateway-proxy.gloo-system/delay/2
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary,
the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 10
Phase: Failed
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger Starting canary deployment for podinfo.test
Normal Synced 3m flagger Advance podinfo.test canary weight 5
Normal Synced 3m flagger Advance podinfo.test canary weight 10
Normal Synced 3m flagger Advance podinfo.test canary weight 15
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 55.06% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 47.00% < 99%
Normal Synced 2m flagger (combined from similar events): Halt podinfo.test advancement success rate 38.08% < 99%
Warning Synced 1m flagger Rolling back podinfo.test failed checks threshold reached 10
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
```
## Custom metrics
The canary analysis can be extended with Prometheus queries.
The demo app is instrumented with Prometheus so you can create a custom check that will use the HTTP request
duration histogram to validate the canary.
Create a metric template and apply it on the cluster:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: not-found-percentage
namespace: test
spec:
provider:
type: prometheus
address: http://flagger-prometheus.gloo-system:9090
query: |
100 - sum(
rate(
http_request_duration_seconds_count{
kubernetes_namespace="{{ namespace }}",
              kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)",
              status!="404"
            }[{{ interval }}]
)
)
/
sum(
rate(
http_request_duration_seconds_count{
kubernetes_namespace="{{ namespace }}",
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
}[{{ interval }}]
)
) * 100
```
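For example, save the template under a file name of your choosing (the name below is illustrative) and apply it:
```bash
kubectl apply -f ./not-found-percentage.yaml
```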
Edit the canary analysis and add the following metric:
```yaml
analysis:
metrics:
- name: "404s percentage"
templateRef:
name: not-found-percentage
thresholdRange:
max: 5
interval: 1m
```
The above configuration validates the canary by checking if the HTTP 404 req/sec percentage
is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
```
Generate 404s:
```bash
watch curl -H 'Host: app.example.com' http://gateway-proxy.gloo-system/status/404
```
Watch Flagger logs:
```text
kubectl -n gloo-system logs deployment/flagger -f | jq .msg
Starting canary deployment for podinfo.test
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement 404s percentage 6.20 > 5
Halt podinfo.test advancement 404s percentage 6.45 > 5
Halt podinfo.test advancement 404s percentage 7.60 > 5
Halt podinfo.test advancement 404s percentage 8.69 > 5
Halt podinfo.test advancement 404s percentage 9.70 > 5
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
If you have [alerting](../usage/alerting.md) configured,
Flagger will send a notification with the reason why the canary failed.
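If you haven't configured alerting yet, a minimal sketch based on Flagger's `AlertProvider` custom resource looks like this (the provider name, channel and webhook URL are placeholders; see the alerting docs for the full API):
```yaml
apiVersion: flagger.app/v1beta1
kind: AlertProvider
metadata:
  name: on-call
  namespace: gloo-system
spec:
  type: slack
  channel: on-call-alerts
  username: flagger
  # Slack incoming webhook (placeholder)
  address: https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
```
The canary can then reference it from the analysis spec:
```yaml
  analysis:
    alerts:
      - name: "on-call Slack"
        severity: error
        providerRef:
          name: on-call
          namespace: gloo-system
```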
## A/B Testing
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions.
In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users.
This is particularly useful for frontend applications that require session affinity.
![Flagger A/B Testing Stages](/img/diagrams/flagger-abtest-steps.png)
Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
```yaml
analysis:
interval: 1m
threshold: 5
iterations: 10
match:
- headers:
x-canary:
exact: "insider"
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 1m -q 5 -c 5 -H 'X-Canary: insider' -host app.example.com http://gateway-proxy.gloo-system"
```
The above configuration will run an analysis for ten minutes targeting users that send an `X-Canary: insider` header.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.4
```
Flagger detects that the deployment revision changed and starts the A/B test:
```text
kubectl -n gloo-system logs deploy/flagger -f | jq .msg
New revision detected! Progressing canary analysis for podinfo.test
Advance podinfo.test canary iteration 1/10
Advance podinfo.test canary iteration 2/10
Advance podinfo.test canary iteration 3/10
Advance podinfo.test canary iteration 4/10
Advance podinfo.test canary iteration 5/10
Advance podinfo.test canary iteration 6/10
Advance podinfo.test canary iteration 7/10
Advance podinfo.test canary iteration 8/10
Advance podinfo.test canary iteration 9/10
Advance podinfo.test canary iteration 10/10
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Routing all traffic to primary
Promotion completed! Scaling down podinfo.test
```
The web browser user agent header allows user segmentation based on device or OS.
For example, if you want to route all mobile users to the canary instance:
```yaml
match:
- headers:
user-agent:
regex: ".*Mobile.*"
```
Or if you want to target only Android users:
```yaml
match:
- headers:
user-agent:
regex: ".*Android.*"
```
Or a specific browser version:
```yaml
match:
- headers:
user-agent:
regex: ".*Firefox.*"
```
For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md).

@ -0,0 +1,263 @@
---
type: docs
weight: 2
title: Istio A/B Testing
---
This guide shows you how to automate A/B testing with Istio and Flagger.
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity.
![Flagger A/B Testing Stages](/img/diagrams/flagger-abtest-steps.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Istio **v1.0** or newer.
Install Istio with telemetry support and Prometheus:
```bash
istioctl manifest install --set profile=default
kubectl apply -f https://raw.githubusercontent.com/istio/istio/release-1.8/samples/addons/prometheus.yaml
```
Install Flagger in the `istio-system` namespace:
```bash
kubectl apply -k github.com/fluxcd/flagger//kustomize/istio
```
Create an ingress gateway to expose the demo app outside of the mesh:
```yaml
apiVersion: networking.istio.io/v1alpha3
kind: Gateway
metadata:
name: public-gateway
namespace: istio-system
spec:
selector:
istio: ingressgateway
servers:
- port:
number: 80
name: http
protocol: HTTP
hosts:
- "*"
```
## Bootstrap
Create a test namespace with Istio sidecar injection enabled:
```bash
kubectl create ns test
kubectl label namespace test istio-injection=enabled
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the canary analysis:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create a canary custom resource \(replace example.com with your own domain\):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
# container port
port: 9898
# Istio gateways (optional)
gateways:
- public-gateway.istio-system.svc.cluster.local
# Istio virtual service host names (optional)
hosts:
- app.example.com
# Istio traffic policy (optional)
trafficPolicy:
tls:
# use ISTIO_MUTUAL when mTLS is enabled
mode: DISABLE
analysis:
# schedule interval (default 60s)
interval: 1m
# total number of iterations
iterations: 10
# max number of failed iterations before rollback
threshold: 2
# canary match condition
match:
- headers:
user-agent:
regex: ".*Firefox.*"
- headers:
cookie:
regex: "^(.*?;)?(type=insider)(;.*)?$"
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
- name: request-duration
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
interval: 30s
# generate traffic during analysis
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
timeout: 15s
metadata:
cmd: "hey -z 1m -q 10 -c 2 -H 'Cookie: type=insider' http://podinfo.test:9898/"
```
**Note** that when using Istio 1.5 you have to replace the `request-duration` with a [metric template](/flagger/dev/upgrade-guide#istio-telemetry-v2).
The above configuration will run an analysis for ten minutes targeting Firefox users and those that have an insider cookie.
Save the above resource as podinfo-abtest.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-abtest.yaml
```
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
destinationrule.networking.istio.io/podinfo-canary
destinationrule.networking.istio.io/podinfo-primary
virtualservice.networking.istio.io/podinfo
```
## Automated canary promotion
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Failed Checks: 0
Phase: Succeeded
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger New revision detected podinfo.test
Normal Synced 3m flagger Scaling up podinfo.test
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
Normal Synced 2m flagger Advance podinfo.test canary iteration 4/10
Normal Synced 2m flagger Advance podinfo.test canary iteration 5/10
Normal Synced 1m flagger Advance podinfo.test canary iteration 6/10
Normal Synced 1m flagger Advance podinfo.test canary iteration 7/10
Normal Synced 55s flagger Advance podinfo.test canary iteration 8/10
Normal Synced 45s flagger Advance podinfo.test canary iteration 9/10
Normal Synced 35s flagger Advance podinfo.test canary iteration 10/10
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 100 2019-03-16T14:05:07Z
prod frontend Succeeded 0 2019-03-15T16:15:07Z
prod backend Failed 0 2019-03-14T17:05:07Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors and high latency to test Flagger's rollback.
Generate HTTP 500 errors:
```bash
watch curl -b 'type=insider' http://app.example.com/status/500
```
Generate latency:
```bash
watch curl -b 'type=insider' http://app.example.com/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Failed Checks: 2
Phase: Failed
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger Starting canary deployment for podinfo.test
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
Warning Synced 2m flagger Rolling back podinfo.test failed checks threshold reached 2
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
```
The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.

@ -0,0 +1,374 @@
---
type: docs
weight: 1
title: Istio Canary Deployments
---
This guide shows you how to use Istio and Flagger to automate canary deployments.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Istio **v1.5** or newer.
Install Istio with telemetry support and Prometheus:
```bash
istioctl manifest install --set profile=default
# Note: change release-1.8 in the command below to match your Istio version.
kubectl apply -f https://raw.githubusercontent.com/istio/istio/release-1.8/samples/addons/prometheus.yaml
```
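If you are unsure which `release-1.8` branch matches your cluster, check the control plane version first:
```bash
istioctl version
```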
Install Flagger in the `istio-system` namespace:
```bash
kubectl apply -k github.com/fluxcd/flagger//kustomize/istio
```
Create an ingress gateway to expose the demo app outside of the mesh:
```yaml
apiVersion: networking.istio.io/v1alpha3
kind: Gateway
metadata:
name: public-gateway
namespace: istio-system
spec:
selector:
istio: ingressgateway
servers:
- port:
number: 80
name: http
protocol: HTTP
hosts:
- "*"
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services, Istio destination rules and virtual services\). These objects expose the application inside the mesh and drive the canary analysis and promotion.
Create a test namespace with Istio sidecar injection enabled:
```bash
kubectl create ns test
kubectl label namespace test istio-injection=enabled
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the canary analysis:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create a canary custom resource \(replace example.com with your own domain\):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
# service port number
port: 9898
# container port number or name (optional)
targetPort: 9898
# Istio gateways (optional)
gateways:
- public-gateway.istio-system.svc.cluster.local
# Istio virtual service host names (optional)
hosts:
- app.example.com
# Istio traffic policy (optional)
trafficPolicy:
tls:
# use ISTIO_MUTUAL when mTLS is enabled
mode: DISABLE
# Istio retry policy (optional)
retries:
attempts: 3
perTryTimeout: 1s
retryOn: "gateway-error,connect-failure,refused-stream"
analysis:
# schedule interval (default 60s)
interval: 1m
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 10
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
- name: request-duration
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
interval: 30s
# testing (optional)
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
```
**Note** that when using Istio 1.4 you have to replace the `request-duration` with a [metric template](/flagger/dev/upgrade-guide#istio-telemetry-v2).
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes (maxWeight 50 divided by stepWeight 10 gives five one-minute steps) while validating the HTTP metrics and rollout hooks every minute.
![Flagger Canary Process](/img/diagrams/flagger-canary-hpa.png)
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
destinationrule.networking.istio.io/podinfo-canary
destinationrule.networking.istio.io/podinfo-primary
virtualservice.networking.istio.io/podinfo
```
## Automated canary promotion
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger New revision detected podinfo.test
Normal Synced 3m flagger Scaling up podinfo.test
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Normal Synced 3m flagger Advance podinfo.test canary weight 5
Normal Synced 3m flagger Advance podinfo.test canary weight 10
Normal Synced 3m flagger Advance podinfo.test canary weight 15
Normal Synced 2m flagger Advance podinfo.test canary weight 20
Normal Synced 2m flagger Advance podinfo.test canary weight 25
Normal Synced 1m flagger Advance podinfo.test canary weight 30
Normal Synced 1m flagger Advance podinfo.test canary weight 35
Normal Synced 55s flagger Advance podinfo.test canary weight 40
Normal Synced 45s flagger Advance podinfo.test canary weight 45
Normal Synced 35s flagger Advance podinfo.test canary weight 50
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
A canary deployment is triggered by changes in any of the following objects:
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
* ConfigMaps mounted as volumes or mapped to environment variables
* Secrets mounted as volumes or mapped to environment variables
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2019-01-16T14:05:07Z
prod frontend Succeeded 0 2019-01-15T16:15:07Z
prod backend Failed 0 2019-01-14T17:05:07Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses the rollout.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it flagger-loadtester-xx-xx sh
```
Generate HTTP 500 errors:
```bash
watch curl http://podinfo-canary:9898/status/500
```
Generate latency:
```bash
watch curl http://podinfo-canary:9898/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 10
Phase: Failed
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger Starting canary deployment for podinfo.test
Normal Synced 3m flagger Advance podinfo.test canary weight 5
Normal Synced 3m flagger Advance podinfo.test canary weight 10
Normal Synced 3m flagger Advance podinfo.test canary weight 15
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 55.06% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 47.00% < 99%
Normal Synced 2m flagger (combined from similar events): Halt podinfo.test advancement success rate 38.08% < 99%
Warning Synced 1m flagger Rolling back podinfo.test failed checks threshold reached 10
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
```
## Traffic mirroring
![Flagger Canary Traffic Shadowing](/img/diagrams/flagger-canary-traffic-mirroring.png)
For applications that perform read operations, Flagger can be configured to drive canary releases with traffic mirroring. Istio traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service. The response from the primary is sent back to the user and the response from the canary is discarded. Metrics are collected on both requests so that the deployment will only proceed if the canary metrics are within the threshold values.
Note that mirroring should be used for requests that are **idempotent** or capable of being processed twice \(once by the primary and once by the canary\).
You can enable mirroring by replacing `stepWeight/maxWeight` with `iterations` and by setting `analysis.mirror` to `true`:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
analysis:
# schedule interval
interval: 1m
# max number of failed metric checks before rollback
threshold: 5
# total number of iterations
iterations: 10
# enable traffic shadowing
mirror: true
# weight of the traffic mirrored to your canary (defaults to 100%)
mirrorWeight: 100
metrics:
- name: request-success-rate
thresholdRange:
min: 99
interval: 1m
- name: request-duration
thresholdRange:
max: 500
interval: 1m
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
cmd: "hey -z 1m -q 10 -c 2 http://podinfo.test:9898/"
```
With the above configuration, Flagger will run a canary release with the following steps:
* detect new revision \(deployment spec, secrets or configmaps changes\)
* scale from zero the canary deployment
* wait for the HPA to set the canary minimum replicas
* check canary pods health
* run the acceptance tests
* abort the canary release if tests fail
* start the load tests
* mirror 100% of the traffic from primary to canary
* check request success rate and request duration every minute
* abort the canary release if the metrics check failure threshold is reached
* stop traffic mirroring after the number of iterations is reached
* route live traffic to the canary pods
* promote the canary \(update the primary secrets, configmaps and deployment spec\)
* wait for the primary deployment rollout to finish
* wait for the HPA to set the primary minimum replicas
* check primary pods health
* switch live traffic back to primary
* scale to zero the canary
* send notification with the canary analysis result
The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.

@ -0,0 +1,376 @@
---
weight: 12
title: Blue/Green Deployments
type: docs
---
This guide shows you how to automate Blue/Green deployments with Flagger and Kubernetes.
For applications that are not deployed on a service mesh, Flagger can orchestrate Blue/Green style deployments with Kubernetes L4 networking. When using a service mesh blue/green can be used as specified [here](../usage/deployment-strategies.md).
![Flagger Blue/Green Stages](/img/diagrams/flagger-bluegreen-steps.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer.
Install Flagger and the Prometheus add-on:
```bash
helm repo add flagger https://flagger.app
helm upgrade -i flagger flagger/flagger \
--namespace flagger \
--set prometheus.install=true \
--set meshProvider=kubernetes
```
If you already have a Prometheus instance running in your cluster, you can point Flagger to the ClusterIP service with:
```bash
helm upgrade -i flagger flagger/flagger \
--namespace flagger \
--set metricsServer=http://prometheus.monitoring:9090
```
Optionally you can enable Slack notifications:
```bash
helm upgrade -i flagger flagger/flagger \
--reuse-values \
--namespace flagger \
--set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
--set slack.channel=general \
--set slack.user=flagger
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployment and ClusterIP services\). These objects expose the application inside the cluster and drive the canary analysis and Blue/Green promotion.
Create a test namespace:
```bash
kubectl create ns test
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the analysis:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create a canary custom resource:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# service mesh provider can be: kubernetes, istio, appmesh, nginx, gloo
provider: kubernetes
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
# to make progress before rollback (default 600s)
progressDeadlineSeconds: 60
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
port: 9898
portDiscovery: true
analysis:
# schedule interval (default 60s)
interval: 30s
# max number of failed checks before rollback
threshold: 2
# number of checks to run before rollback
iterations: 10
# Prometheus checks based on
# http_request_duration_seconds histogram
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
- name: request-duration
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
interval: 30s
# acceptance/load testing hooks
webhooks:
- name: smoke-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 15s
metadata:
type: bash
cmd: "curl -sd 'anon' http://podinfo-canary.test:9898/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
```
The above configuration will run an analysis for five minutes (10 iterations at a 30s interval).
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
```
Blue/Green scenario:
* on bootstrap, Flagger will create three ClusterIP services \(`app-primary`,`app-canary`, `app`\)
and a shadow deployment named `app-primary` that represents the blue version
* when a new version is detected, Flagger would scale up the green version and run the conformance tests
\(the tests should target the `app-canary` ClusterIP service to reach the green version\)
* if the conformance tests are passing, Flagger would start the load tests and validate them with custom Prometheus queries
* if the load test analysis is successful, Flagger will promote the new version to `app-primary` and scale down the green version
## Automated Blue/Green promotion
Trigger a deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Events:
New revision detected podinfo.test
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Pre-rollout check acceptance-test passed
Advance podinfo.test canary iteration 1/10
Advance podinfo.test canary iteration 2/10
Advance podinfo.test canary iteration 3/10
Advance podinfo.test canary iteration 4/10
Advance podinfo.test canary iteration 5/10
Advance podinfo.test canary iteration 6/10
Advance podinfo.test canary iteration 7/10
Advance podinfo.test canary iteration 8/10
Advance podinfo.test canary iteration 9/10
Advance podinfo.test canary iteration 10/10
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 100 2019-06-16T14:05:07Z
prod frontend Succeeded 0 2019-06-15T16:15:07Z
prod backend Failed 0 2019-06-14T17:05:07Z
```
## Automated rollback
During the analysis you can generate HTTP 500 errors and high latency to test Flagger's rollback.
Exec into the load tester pod with:
```bash
kubectl -n test exec -it flagger-loadtester-xx-xx sh
```
Generate HTTP 500 errors:
```bash
watch curl http://podinfo-canary.test:9898/status/500
```
Generate latency:
```bash
watch curl http://podinfo-canary.test:9898/delay/1
```
When the number of failed checks reaches the analysis threshold, the green version is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Failed Checks: 2
Phase: Failed
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger New revision detected podinfo.test
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
Warning Synced 2m flagger Rolling back podinfo.test failed checks threshold reached 2
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
```
## Custom metrics
The analysis can be extended with Prometheus queries. The demo app is instrumented with Prometheus so you can create a custom check that will use the HTTP request duration histogram to validate the canary \(green version\).
Create a metric template and apply it on the cluster:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: not-found-percentage
namespace: test
spec:
provider:
type: prometheus
address: http://flagger-prometheus.flagger:9090
query: |
100 - sum(
rate(
http_request_duration_seconds_count{
kubernetes_namespace="{{ namespace }}",
              kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)",
              status!="404"
            }[{{ interval }}]
)
)
/
sum(
rate(
http_request_duration_seconds_count{
kubernetes_namespace="{{ namespace }}",
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
}[{{ interval }}]
)
) * 100
```
Edit the canary analysis and add the following metric:
```yaml
analysis:
metrics:
- name: "404s percentage"
templateRef:
name: not-found-percentage
thresholdRange:
max: 5
interval: 1m
```
The above configuration validates the canary \(green version\) by checking if the HTTP 404 req/sec percentage is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the rollout is rolled back.
Trigger a deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
```
Generate 404s:
```bash
watch curl http://podinfo-canary.test:9898/status/404
```
Watch Flagger logs:
```text
kubectl -n flagger logs deployment/flagger -f | jq .msg
New revision detected podinfo.test
Scaling up podinfo.test
Advance podinfo.test canary iteration 1/10
Halt podinfo.test advancement 404s percentage 6.20 > 5
Halt podinfo.test advancement 404s percentage 6.45 > 5
Rolling back podinfo.test failed checks threshold reached 2
Canary failed! Scaling down podinfo.test
```
If you have [alerting](../usage/alerting.md) configured, Flagger will send a notification with the reason why the canary failed.
## Conformance Testing with Helm
Flagger comes with a testing service that can run Helm tests when configured as a pre-rollout webhook.
Deploy the Helm test runner in the `kube-system` namespace using the `tiller` service account:
```bash
helm repo add flagger https://flagger.app
helm upgrade -i flagger-helmtester flagger/loadtester \
--namespace=kube-system \
--set serviceAccountName=tiller
```
When deployed, the Helm tester API will be available at `http://flagger-helmtester.kube-system/`.
Add a helm test pre-rollout hook to your chart:
```yaml
analysis:
webhooks:
- name: "conformance testing"
type: pre-rollout
url: http://flagger-helmtester.kube-system/
timeout: 3m
metadata:
type: "helm"
cmd: "test {{ .Release.Name }} --cleanup"
```
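If you are on Helm v3 (no Tiller), newer load tester versions also accept a `helmv3` command type; a sketch under that assumption (check Flagger's webhook docs for the exact syntax supported by your version):
```yaml
  analysis:
    webhooks:
      - name: "conformance testing"
        type: pre-rollout
        url: http://flagger-helmtester.kube-system/
        timeout: 3m
        metadata:
          type: "helmv3"
          cmd: "test {{ .Release.Name }} -n {{ .Release.Namespace }}"
```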
When the canary analysis starts, Flagger will call the pre-rollout webhooks. If the helm test fails, Flagger will retry until the analysis threshold is reached and the canary is rolled back.
For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md).

@ -0,0 +1,256 @@
---
weight: 10
title: Kuma Canary Deployments
type: docs
---
This guide shows you how to use Kuma and Flagger to automate canary deployments.
![Flagger Kuma Canary](/img/diagrams/flagger-kuma-canary.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Kuma **1.3** or newer.
Install Kuma and Prometheus (part of Kuma Metrics):
```bash
kumactl install control-plane | kubectl apply -f -
kumactl install metrics | kubectl apply -f -
```
Install Flagger in the `kuma-system` namespace:
```bash
kubectl apply -k github.com/fluxcd/flagger//kustomize/kuma
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
then creates a series of objects (Kubernetes deployments, ClusterIP services and Kuma `TrafficRoute`).
These objects expose the application inside the mesh and drive the canary analysis and promotion.
Create a test namespace and enable Kuma sidecar injection:
```bash
kubectl create ns test
kubectl annotate namespace test kuma.io/sidecar-injection=enabled
```
Install the load testing service to generate traffic during the canary analysis:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Create a canary custom resource for the `podinfo` deployment:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
annotations:
kuma.io/mesh: default
spec:
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
progressDeadlineSeconds: 60
service:
port: 9898
targetPort: 9898
apex:
annotations:
9898.service.kuma.io/protocol: "http"
canary:
annotations:
9898.service.kuma.io/protocol: "http"
primary:
annotations:
9898.service.kuma.io/protocol: "http"
analysis:
# schedule interval (default 60s)
interval: 30s
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
metrics:
- name: request-success-rate
threshold: 99
interval: 1m
- name: request-duration
threshold: 500
interval: 30s
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary.test:9898/token | grep token"
- name: load-test
type: rollout
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 2m -q 10 -c 2 http://podinfo-canary.test:9898/"
```
Save the above resource as `podinfo-canary.yaml` and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute.
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
ingresses.extensions/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
trafficroutes.kuma.io/podinfo
```
After the bootstrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target the canary pods directly.
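For example, you can reach the canary pods directly from inside the mesh by running curl from the load tester (newer kubectl versions accept a `deploy/<name>` target for `exec`; otherwise use the pod name as in the rollback section below):
```bash
kubectl -n test exec deploy/flagger-loadtester -- \
  curl -s http://podinfo-canary.test:9898/
```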
## Automated canary promotion
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
New revision detected! Scaling up podinfo.test
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Advance podinfo.test canary weight 25
Waiting for podinfo.test rollout to finish: 1 of 2 updated replicas are available
Advance podinfo.test canary weight 30
Advance podinfo.test canary weight 35
Advance podinfo.test canary weight 40
Advance podinfo.test canary weight 45
Advance podinfo.test canary weight 50
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
A canary deployment is triggered by changes in any of the following objects:
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
* ConfigMaps mounted as volumes or mapped to environment variables
* Secrets mounted as volumes or mapped to environment variables
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2019-06-30T14:05:07Z
prod frontend Succeeded 0 2019-06-30T16:15:07Z
prod backend Failed 0 2019-06-30T17:05:07Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses and rolls back the faulted version.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it flagger-loadtester-xx-xx sh
```
Generate HTTP 500 errors:
```bash
watch -n 1 curl http://podinfo-canary.test:9898/status/500
```
Generate latency:
```bash
watch -n 1 curl http://podinfo-canary.test:9898/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 10
Phase: Failed
Events:
Starting canary analysis for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement success rate 69.17% < 99%
Halt podinfo.test advancement success rate 61.39% < 99%
Halt podinfo.test advancement success rate 55.06% < 99%
Halt podinfo.test advancement request duration 1.20s > 0.5s
Halt podinfo.test advancement request duration 1.45s > 0.5s
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
The above procedures can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.

@ -0,0 +1,476 @@
---
type: docs
weight: 3
title: Linkerd Canary Deployments
---
This guide shows you how to use Linkerd and Flagger to automate canary deployments.
![Flagger Linkerd Traffic Split](/img/diagrams/flagger-linkerd-traffic-split.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Linkerd **2.10** or newer.
Install Linkerd and Prometheus (part of Linkerd Viz):
```bash
linkerd install | kubectl apply -f -
linkerd viz install | kubectl apply -f -
```
Install Flagger in the `linkerd` namespace:
```bash
kubectl apply -k github.com/fluxcd/flagger//kustomize/linkerd
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
then creates a series of objects (Kubernetes deployments, ClusterIP services and SMI traffic split).
These objects expose the application inside the mesh and drive the canary analysis and promotion.
Create a test namespace and enable Linkerd proxy injection:
```bash
kubectl create ns test
kubectl annotate namespace test linkerd.io/inject=enabled
```
Install the load testing service to generate traffic during the canary analysis:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Create a canary custom resource for the podinfo deployment:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
service:
# ClusterIP port number
port: 9898
# container port number or name (optional)
targetPort: 9898
analysis:
# schedule interval (default 60s)
interval: 30s
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
# Linkerd Prometheus checks
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
- name: request-duration
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
interval: 30s
# testing (optional)
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary.test:9898/token | grep token"
- name: load-test
type: rollout
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 2m -q 10 -c 2 http://podinfo-canary.test:9898/"
```
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute.
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
ingresses.extensions/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
trafficsplits.split.smi-spec.io/podinfo
```
After the bootstrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target the canary pods directly.
## Automated canary promotion
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
New revision detected! Scaling up podinfo.test
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Advance podinfo.test canary weight 25
Waiting for podinfo.test rollout to finish: 1 of 2 updated replicas are available
Advance podinfo.test canary weight 30
Advance podinfo.test canary weight 35
Advance podinfo.test canary weight 40
Advance podinfo.test canary weight 45
Advance podinfo.test canary weight 50
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
A canary deployment is triggered by changes in any of the following objects (see the example after the list):
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
* ConfigMaps mounted as volumes or mapped to environment variables
* Secrets mounted as volumes or mapped to environment variables
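For example, if the podinfo deployment referenced a ConfigMap as a volume or env source (a hypothetical `podinfo-config` below; the kustomize base may not include one), updating it would also start a new analysis:
```bash
# hypothetical: changing a ConfigMap referenced by the deployment triggers a new canary run
kubectl -n test create configmap podinfo-config \
  --from-literal=ui.color=blue \
  --dry-run=client -o yaml | kubectl -n test apply -f -
```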
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2019-06-30T14:05:07Z
prod frontend Succeeded 0 2019-06-30T16:15:07Z
prod backend Failed 0 2019-06-30T17:05:07Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses and rolls back the faulted version.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it flagger-loadtester-xx-xx sh
```
Generate HTTP 500 errors:
```bash
watch -n 1 curl http://podinfo-canary.test:9898/status/500
```
Generate latency:
```bash
watch -n 1 curl http://podinfo-canary.test:9898/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 10
Phase: Failed
Events:
Starting canary analysis for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement success rate 69.17% < 99%
Halt podinfo.test advancement success rate 61.39% < 99%
Halt podinfo.test advancement success rate 55.06% < 99%
Halt podinfo.test advancement request duration 1.20s > 0.5s
Halt podinfo.test advancement request duration 1.45s > 0.5s
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
## Custom metrics
The canary analysis can be extended with Prometheus queries.
Let's define a check for HTTP 404 (not found) errors. Edit the canary analysis and add the following metric:
```yaml
analysis:
metrics:
- name: "404s percentage"
threshold: 3
query: |
100 - sum(
rate(
response_total{
namespace="test",
deployment="podinfo",
status_code!="404",
direction="inbound"
}[1m]
)
)
/
sum(
rate(
response_total{
namespace="test",
deployment="podinfo",
direction="inbound"
}[1m]
)
)
* 100
```
The above configuration validates the canary version by checking if the HTTP 404 req/sec percentage is below three percent of the total traffic. If the 404s rate reaches the 3% threshold, then the analysis is aborted and the canary is marked as failed.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
```
Generate 404s:
```bash
watch -n 1 curl http://podinfo-canary:9898/status/404
```
Watch Flagger logs:
```text
kubectl -n linkerd logs deployment/flagger -f | jq .msg
Starting canary deployment for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Halt podinfo.test advancement 404s percentage 6.20 > 3
Halt podinfo.test advancement 404s percentage 6.45 > 3
Halt podinfo.test advancement 404s percentage 7.22 > 3
Halt podinfo.test advancement 404s percentage 6.50 > 3
Halt podinfo.test advancement 404s percentage 6.34 > 3
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
If you have Slack configured, Flagger will send a notification with the reason why the canary failed.
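If Flagger was installed with Helm, the Slack settings can be supplied as chart values; a minimal sketch (replace the hook URL and channel with your own, and point `--namespace` at the namespace where Flagger runs):
```bash
# sketch: enable Slack notifications on an existing Flagger Helm release
helm upgrade -i flagger flagger/flagger \
  --namespace linkerd \
  --reuse-values \
  --set slack.url=https://hooks.slack.com/services/YOUR/HOOK/URL \
  --set slack.channel=general \
  --set slack.user=flagger
```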
## Linkerd Ingress
There are two ingress controllers that are compatible with both Flagger and Linkerd: NGINX and Gloo.
Install NGINX:
```bash
helm upgrade -i nginx-ingress stable/nginx-ingress \
--namespace ingress-nginx
```
Create an ingress definition for podinfo that rewrites the incoming header to the internal service name \(required by Linkerd\):
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: podinfo
namespace: test
labels:
app: podinfo
annotations:
kubernetes.io/ingress.class: "nginx"
nginx.ingress.kubernetes.io/configuration-snippet: |
proxy_set_header l5d-dst-override $service_name.$namespace.svc.cluster.local:9898;
proxy_hide_header l5d-remote-ip;
proxy_hide_header l5d-server-id;
spec:
rules:
- host: app.example.com
http:
paths:
- backend:
serviceName: podinfo
servicePort: 9898
```
When using an ingress controller, the Linkerd traffic split does not apply to incoming traffic since NGINX is running outside of the mesh. In order to run a canary analysis for a frontend app, Flagger creates a shadow ingress and sets the NGINX-specific annotations.
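Once such an analysis starts, you can inspect the generated ingress to see the annotations Flagger manages (the `podinfo-canary` name assumes Flagger's `<ingress>-canary` naming, as shown in the NGINX guide output):
```bash
# the shadow ingress holds the canary weight annotations during the analysis
kubectl -n test get ingress podinfo-canary -o yaml
```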
## A/B Testing
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity.
![Flagger Linkerd Ingress](/img/diagrams/flagger-nginx-linkerd.png)
Edit podinfo canary analysis, set the provider to `nginx`, add the ingress reference, remove the max/step weight and add the match conditions and iterations:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
# ingress reference
provider: nginx
ingressRef:
apiVersion: extensions/v1beta1
kind: Ingress
name: podinfo
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
service:
# container port
port: 9898
analysis:
interval: 1m
threshold: 10
iterations: 10
match:
# curl -H 'X-Canary: always' http://app.example.com
- headers:
x-canary:
exact: "always"
# curl -b 'canary=always' http://app.example.com
- headers:
cookie:
exact: "canary"
# Linkerd Prometheus checks
metrics:
- name: request-success-rate
thresholdRange:
min: 99
interval: 1m
- name: request-duration
thresholdRange:
max: 500
interval: 30s
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
- name: load-test
type: rollout
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 2m -q 10 -c 2 -H 'Cookie: canary=always' http://app.example.com"
```
The above configuration will run an analysis for ten minutes targeting users that have a `canary` cookie set to `always` or those that call the service using the `X-Canary: always` header.
**Note** that the load test now targets the external address and uses the canary cookie.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.4
```
Flagger detects that the deployment revision changed and starts the A/B testing:
```text
kubectl -n test describe canary/podinfo
Events:
Starting canary deployment for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary iteration 1/10
Advance podinfo.test canary iteration 2/10
Advance podinfo.test canary iteration 3/10
Advance podinfo.test canary iteration 4/10
Advance podinfo.test canary iteration 5/10
Advance podinfo.test canary iteration 6/10
Advance podinfo.test canary iteration 7/10
Advance podinfo.test canary iteration 8/10
Advance podinfo.test canary iteration 9/10
Advance podinfo.test canary iteration 10/10
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Promotion completed! Scaling down podinfo.test
```
The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.
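As a sketch of manual gating: a `confirm-rollout` webhook pointing at the load tester's gate endpoint (for example `http://flagger-loadtester.test/gate/check`) makes the analysis wait until the gate is opened from inside the load tester pod:
```bash
# open the gate so a paused canary analysis can continue (assumes the confirm-rollout webhook above)
kubectl -n test exec deploy/flagger-loadtester -- \
  curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/open
# close it again to pause future rollouts at the confirmation step
kubectl -n test exec deploy/flagger-loadtester -- \
  curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/close
```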

View File

@ -0,0 +1,450 @@
---
weight: 6
title: NGINX Canary Deployments
type: docs
---
This guide shows you how to use the NGINX ingress controller and Flagger to automate canary deployments and A/B testing.
![Flagger NGINX Ingress Controller](/img/diagrams/flagger-nginx-overview.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.19** or newer and NGINX ingress **v1.0.2** or newer.
Install the NGINX ingress controller with Helm v3:
```bash
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
kubectl create ns ingress-nginx
helm upgrade -i ingress-nginx ingress-nginx/ingress-nginx \
--namespace ingress-nginx \
--set controller.metrics.enabled=true \
--set controller.podAnnotations."prometheus\.io/scrape"=true \
--set controller.podAnnotations."prometheus\.io/port"=10254
```
Install Flagger and the Prometheus add-on in the same namespace as the ingress controller:
```bash
helm repo add flagger https://flagger.app
helm upgrade -i flagger flagger/flagger \
--namespace ingress-nginx \
--set prometheus.install=true \
--set meshProvider=nginx
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
then creates a series of objects (Kubernetes deployments, ClusterIP services and canary ingress).
These objects expose the application outside the cluster and drive the canary analysis and promotion.
Create a test namespace:
```bash
kubectl create ns test
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the canary analysis:
```bash
helm upgrade -i flagger-loadtester flagger/loadtester \
--namespace=test
```
Create an ingress definition (replace `app.example.com` with your own domain):
```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: podinfo
namespace: test
labels:
app: podinfo
annotations:
kubernetes.io/ingress.class: "nginx"
spec:
rules:
- host: "app.example.com"
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: podinfo
port:
number: 80
```
Save the above resource as podinfo-ingress.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-ingress.yaml
```
Create a canary custom resource (replace `app.example.com` with your own domain):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
provider: nginx
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# ingress reference
ingressRef:
apiVersion: networking.k8s.io/v1
kind: Ingress
name: podinfo
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
service:
# ClusterIP port number
port: 80
# container port number or name
targetPort: 9898
analysis:
# schedule interval (default 60s)
interval: 10s
# max number of failed metric checks before rollback
threshold: 10
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
# NGINX Prometheus checks
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
# testing (optional)
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary/token | grep token"
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
cmd: "hey -z 1m -q 10 -c 2 http://app.example.com/"
```
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
ingresses.extensions/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
ingresses.extensions/podinfo-canary
```
## Automated canary promotion
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance
indicators like HTTP requests success rate, requests average duration and pod health.
Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack or MS Teams.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger New revision detected podinfo.test
Normal Synced 3m flagger Scaling up podinfo.test
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Normal Synced 3m flagger Advance podinfo.test canary weight 5
Normal Synced 3m flagger Advance podinfo.test canary weight 10
Normal Synced 3m flagger Advance podinfo.test canary weight 15
Normal Synced 2m flagger Advance podinfo.test canary weight 20
Normal Synced 2m flagger Advance podinfo.test canary weight 25
Normal Synced 1m flagger Advance podinfo.test canary weight 30
Normal Synced 1m flagger Advance podinfo.test canary weight 35
Normal Synced 55s flagger Advance podinfo.test canary weight 40
Normal Synced 45s flagger Advance podinfo.test canary weight 45
Normal Synced 35s flagger Advance podinfo.test canary weight 50
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2019-05-06T14:05:07Z
prod frontend Succeeded 0 2019-05-05T16:15:07Z
prod backend Failed 0 2019-05-04T17:05:07Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors to test if Flagger pauses and rolls back the faulted version.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
```
Generate HTTP 500 errors:
```bash
watch curl http://app.example.com/status/500
```
When the number of failed checks reaches the canary analysis threshold,
the traffic is routed back to the primary,
the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 10
Phase: Failed
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger Starting canary deployment for podinfo.test
Normal Synced 3m flagger Advance podinfo.test canary weight 5
Normal Synced 3m flagger Advance podinfo.test canary weight 10
Normal Synced 3m flagger Advance podinfo.test canary weight 15
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 55.06% < 99%
Normal Synced 2m flagger Halt podinfo.test advancement success rate 47.00% < 99%
Normal Synced 2m flagger (combined from similar events): Halt podinfo.test advancement success rate 38.08% < 99%
Warning Synced 1m flagger Rolling back podinfo.test failed checks threshold reached 10
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
```
## Custom metrics
The canary analysis can be extended with Prometheus queries.
The demo app is instrumented with Prometheus so you can create a custom check
that will use the HTTP request duration histogram to validate the canary.
Create a metric template and apply it on the cluster:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: latency
namespace: test
spec:
provider:
type: prometheus
address: http://flagger-prometheus.ingress-nginx:9090
query: |
histogram_quantile(0.99,
sum(
rate(
http_request_duration_seconds_bucket{
kubernetes_namespace="{{ namespace }}",
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
}[1m]
)
) by (le)
)
```
Edit the canary analysis and add the latency check:
```yaml
analysis:
metrics:
- name: "latency"
templateRef:
name: latency
thresholdRange:
max: 0.5
interval: 1m
```
The threshold is set to 500ms so if the average request duration in the last minute goes over
half a second then the analysis will fail and the canary will not be promoted.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
```
Generate high response latency:
```bash
watch curl http://app.example.com/delay/2
```
Watch Flagger logs:
```text
kubectl -n nginx-ingress logs deployment/flagger -f | jq .msg
Starting canary deployment for podinfo.test
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement latency 1.20 > 0.5
Halt podinfo.test advancement latency 1.45 > 0.5
Halt podinfo.test advancement latency 1.60 > 0.5
Halt podinfo.test advancement latency 1.69 > 0.5
Halt podinfo.test advancement latency 1.70 > 0.5
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
If you have alerting configured, Flagger will send a notification with the reason why the canary failed.
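Per-canary alerting can be configured with an `AlertProvider` resource referenced from the canary's `analysis.alerts` list; a minimal Slack sketch (replace the address with your own webhook URL):
```bash
# sketch: a Slack alert provider that canaries in the test namespace can reference via providerRef
kubectl apply -f - <<EOF
apiVersion: flagger.app/v1beta1
kind: AlertProvider
metadata:
  name: on-call
  namespace: test
spec:
  type: slack
  channel: general
  address: https://hooks.slack.com/services/YOUR/HOOK/URL
EOF
```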
## A/B Testing
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions.
In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users.
This is particularly useful for frontend applications that require session affinity.
![Flagger A/B Testing Stages](/img/diagrams/flagger-abtest-steps.png)
Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
```yaml
analysis:
interval: 1m
threshold: 10
iterations: 10
match:
# curl -H 'X-Canary: insider' http://app.example.com
- headers:
x-canary:
exact: "insider"
# curl -b 'canary=always' http://app.example.com
- headers:
cookie:
exact: "canary"
metrics:
- name: request-success-rate
thresholdRange:
min: 99
interval: 1m
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
cmd: "hey -z 1m -q 10 -c 2 -H 'Cookie: canary=always' http://app.example.com/"
```
The above configuration will run an analysis for ten minutes targeting users that have
a `canary` cookie set to `always` or those that call the service using the `X-Canary: insider` header.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.4
```
Flagger detects that the deployment revision changed and starts the A/B testing:
```text
kubectl -n test describe canary/podinfo
Status:
Failed Checks: 0
Phase: Succeeded
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Synced 3m flagger New revision detected podinfo.test
Normal Synced 3m flagger Scaling up podinfo.test
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
Normal Synced 2m flagger Advance podinfo.test canary iteration 4/10
Normal Synced 2m flagger Advance podinfo.test canary iteration 5/10
Normal Synced 1m flagger Advance podinfo.test canary iteration 6/10
Normal Synced 1m flagger Advance podinfo.test canary iteration 7/10
Normal Synced 55s flagger Advance podinfo.test canary iteration 8/10
Normal Synced 45s flagger Advance podinfo.test canary iteration 9/10
Normal Synced 35s flagger Advance podinfo.test canary iteration 10/10
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
```
The above procedure can be extended with
[custom metrics](../usage/metrics.md) checks,
[webhooks](../usage/webhooks.md),
[manual promotion](../usage/webhooks.md#manual-gating) approval and
[Slack or MS Teams](../usage/alerting.md) notifications.

View File

@ -0,0 +1,367 @@
---
weight: 9
title: Open Service Mesh Canary Deployments
type: docs
---
This guide shows you how to use Open Service Mesh (OSM) and Flagger to automate canary deployments.
![Flagger OSM Traffic Split](/img/diagrams/flagger-osm-traffic-split.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Open Service Mesh **0.9.1** or newer.
OSM must have permissive traffic policy enabled and have an instance of Prometheus for metrics.
- If the OSM CLI is being used for installation, install OSM using the following command:
```bash
osm install \
--set=OpenServiceMesh.deployPrometheus=true \
--set=OpenServiceMesh.enablePermissiveTrafficPolicy=true
```
- If a managed instance of OSM is being used:
- [Bring your own instance](https://docs.openservicemesh.io/docs/guides/observability/metrics/#byo-prometheus) of Prometheus,
setting the namespace to match the managed OSM controller namespace
- Enable permissive traffic policy after installation by updating the OSM MeshConfig resource:
```bash
# Replace <osm-namespace> with OSM controller's namespace
kubectl patch meshconfig osm-mesh-config -n <osm-namespace> -p '{"spec":{"traffic":{"enablePermissiveTrafficPolicyMode":true}}}' --type=merge
```
To install Flagger in the default `osm-system` namespace, use:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/osm?ref=main
```
Alternatively, if a non-default namespace or managed instance of OSM is in use, install Flagger with Helm, replacing the `<osm-namespace>`
values as appropriate. If a custom instance of Prometheus is being used, replace `osm-prometheus` with the relevant Prometheus service name.
```bash
helm upgrade -i flagger flagger/flagger \
--namespace=<osm-namespace> \
--set meshProvider=osm \
--set metricsServer=http://osm-prometheus.<osm-namespace>.svc:7070
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
then creates a series of objects (Kubernetes deployments, ClusterIP services and SMI traffic split).
These objects expose the application inside the mesh and drive the canary analysis and promotion.
Create a `test` namespace, add it to the OSM mesh and enable metrics scraping for it:
```bash
kubectl create namespace test
osm namespace add test
osm metrics enable --namespace test
```
Create a `podinfo` deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Install the load testing service to generate traffic during the canary analysis:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Create a canary custom resource for the `podinfo` deployment.
The following `podinfo` canary custom resource instructs Flagger to:
1. monitor any changes to the `podinfo` deployment created earlier,
2. detect `podinfo` deployment revision changes, and
3. start a Flagger canary analysis, rollout, and promotion if there were deployment revision changes.
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
provider: osm
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
# the maximum time in seconds for the canary deployment
# to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
service:
# ClusterIP port number
port: 9898
# container port number or name (optional)
targetPort: 9898
analysis:
# schedule interval (default 60s)
interval: 30s
# max number of failed metric checks before rollback
threshold: 5
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
# OSM Prometheus checks
metrics:
- name: request-success-rate
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
interval: 1m
- name: request-duration
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
interval: 30s
# testing (optional)
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 30s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary.test:9898/token | grep token"
- name: load-test
type: rollout
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
cmd: "hey -z 2m -q 10 -c 2 http://podinfo-canary.test:9898/"
```
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute.
After a couple of seconds Flagger will create the canary objects.
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
ingresses.extensions/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
trafficsplits.split.smi-spec.io/podinfo
```
After the bootstrap, the `podinfo` deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods.
During the canary analysis, the `podinfo-canary.test` address can be used to target the canary pods directly.
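You can confirm this routing by inspecting the SMI TrafficSplit that Flagger generates and updates as the analysis progresses:
```bash
# the canary weight is reflected in the TrafficSplit backend weights
kubectl -n test get trafficsplit podinfo -o yaml
```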
## Automated Canary Promotion
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health.
Based on analysis of the KPIs a canary is promoted or aborted.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
```
Flagger detects that the deployment revision changed and starts a new rollout.
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
New revision detected! Scaling up podinfo.test
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Advance podinfo.test canary weight 25
Waiting for podinfo.test rollout to finish: 1 of 2 updated replicas are available
Advance podinfo.test canary weight 30
Advance podinfo.test canary weight 35
Advance podinfo.test canary weight 40
Advance podinfo.test canary weight 45
Advance podinfo.test canary weight 50
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply any new changes to the `podinfo` deployment during the canary analysis, Flagger will restart the analysis.
A canary deployment is triggered by changes in any of the following objects:
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
* ConfigMaps mounted as volumes or mapped to environment variables
* Secrets mounted as volumes or mapped to environment variables
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2019-06-30T14:05:07Z
prod frontend Succeeded 0 2019-06-30T16:15:07Z
prod backend Failed 0 2019-06-30T17:05:07Z
```
## Automated Rollback
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses and rolls back the faulted version.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it flagger-loadtester-xx-xx sh
```
Repeatedly generate HTTP 500 errors until the `kubectl describe` output below shows canary rollout failure:
```bash
watch -n 0.1 curl http://podinfo-canary.test:9898/status/500
```
Repeatedly generate latency until canary rollout fails:
```bash
watch -n 0.1 curl http://podinfo-canary.test:9898/delay/1
```
When the number of failed checks reaches the canary analysis thresholds defined in the `podinfo` canary custom resource earlier, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 10
Phase: Failed
Events:
Starting canary analysis for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement success rate 69.17% < 99%
Halt podinfo.test advancement success rate 61.39% < 99%
Halt podinfo.test advancement success rate 55.06% < 99%
Halt podinfo.test advancement request duration 1.20s > 0.5s
Halt podinfo.test advancement request duration 1.45s > 0.5s
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
## Custom Metrics
The canary analysis can be extended with Prometheus queries.
Let's define a check for 404 not found errors.
Edit the canary analysis (`podinfo-canary.yaml` file) and add the following metric.
For more information on creating additional custom metrics using OSM metrics, please check the [metrics available in OSM](https://docs.openservicemesh.io/docs/guides/observability/metrics/#available-metrics).
```yaml
analysis:
metrics:
- name: "404s percentage"
threshold: 3
query: |
100 - (
sum(
rate(
osm_request_total{
destination_namespace="test",
destination_kind="Deployment",
destination_name="podinfo",
response_code!="404"
}[1m]
)
)
/
sum(
rate(
osm_request_total{
destination_namespace="test",
destination_kind="Deployment",
destination_name="podinfo"
}[1m]
)
) * 100
)
```
The above configuration validates the canary version by checking if the HTTP 404 req/sec percentage is below three percent of the total traffic.
If the 404s rate reaches the 3% threshold, then the analysis is aborted and the canary is marked as failed.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it flagger-loadtester-xx-xx sh
```
Repeatedly generate 404s until canary rollout fails:
```bash
watch -n 0.1 curl http://podinfo-canary.test:9898/status/404
```
Watch Flagger logs to confirm successful canary rollback.
```text
kubectl -n osm-system logs deployment/flagger -f | jq .msg
Starting canary deployment for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Halt podinfo.test advancement 404s percentage 6.20 > 3
Halt podinfo.test advancement 404s percentage 6.45 > 3
Halt podinfo.test advancement 404s percentage 7.22 > 3
Halt podinfo.test advancement 404s percentage 6.50 > 3
Halt podinfo.test advancement 404s percentage 6.34 > 3
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```

View File

@ -0,0 +1,203 @@
---
weight: 13
title: Canary analysis with Prometheus Operator
type: docs
---
This guide shows you how to use
[Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) for canary analysis.
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Prometheus Operator **v0.40** or newer.
Install Prometheus Operator with Helm v3:
```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
kubectl create ns monitoring
helm upgrade -i prometheus prometheus-community/kube-prometheus-stack \
--namespace monitoring \
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
--set fullnameOverride=prometheus
```
The `prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false`
option allows Prometheus Operator to watch serviceMonitors outside of its namespace.
Install Flagger by setting the metrics server to Prometheus:
```bash
helm repo add flagger https://flagger.app
kubectl create ns flagger-system
helm upgrade -i flagger flagger/flagger \
--namespace flagger-system \
--set metricsServer=http://prometheus-prometheus.monitoring:9090 \
--set meshProvider=kubernetes
```
Install Flagger's tester:
```bash
helm upgrade -i loadtester flagger/loadtester \
--namespace flagger-system
```
Install [podinfo](https://github.com/stefanprodan/podinfo) demo app:
```bash
helm repo add podinfo https://stefanprodan.github.io/podinfo
kubectl create ns test
helm upgrade -i podinfo podinfo/podinfo \
--namespace test \
--set service.enabled=false
```
## Service monitors
The demo app is instrumented with Prometheus,
so you can create `ServiceMonitor` objects to scrape podinfo's metrics endpoint:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: podinfo-canary
namespace: test
spec:
endpoints:
- path: /metrics
port: http
interval: 5s
selector:
matchLabels:
app: podinfo-canary
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: podinfo-primary
namespace: test
spec:
endpoints:
- path: /metrics
port: http
interval: 5s
selector:
matchLabels:
app: podinfo
```
We are setting `interval: 5s` to scrape more aggressively.
If you do not define it, you should use a longer interval in the Canary object.
## Metric templates
Create a metric template to measure the HTTP requests error rate:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: error-rate
namespace: test
spec:
provider:
address: http://prometheus-prometheus.monitoring:9090
type: prometheus
query: |
100 - rate(
http_requests_total{
namespace="{{ namespace }}",
job="{{ target }}-canary",
status!~"5.*"
}[{{ interval }}])
/
rate(
http_requests_total{
namespace="{{ namespace }}",
job="{{ target }}-canary"
}[{{ interval }}]
) * 100
```
Create a metric template to measure the HTTP requests average duration:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: latency
namespace: test
spec:
provider:
address: http://prometheus-prometheus.monitoring:9090
type: prometheus
query: |
histogram_quantile(0.99,
sum(
rate(
http_request_duration_seconds_bucket{
namespace="{{ namespace }}",
job="{{ target }}-canary"
}[{{ interval }}]
)
) by (le)
)
```
## Canary analysis
Using the metrics template you can configure the canary analysis with HTTP error rate and latency checks:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
provider: kubernetes
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
progressDeadlineSeconds: 60
service:
port: 80
targetPort: http
name: podinfo
analysis:
interval: 30s
iterations: 10
threshold: 2
metrics:
- name: error-rate
templateRef:
name: error-rate
thresholdRange:
max: 1
interval: 30s
- name: latency
templateRef:
name: latency
thresholdRange:
max: 0.5
interval: 30s
webhooks:
- name: load-test
type: rollout
url: "http://loadtester.flagger-system/"
timeout: 5s
metadata:
type: cmd
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test/"
```
Based on the above specification, Flagger creates the primary and canary Kubernetes ClusterIP services.
During the canary analysis, Prometheus will scrape the canary service and Flagger will use the HTTP error rate
and latency queries to determine if the release should be promoted or rolled back.
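To exercise the analysis, trigger a rollout by changing the container image and watch the canary progress (a sketch; the `*=` wildcard avoids assuming the container name used by the podinfo Helm chart, and the tag is only illustrative):
```bash
# bump the image of every container in the deployment to start a canary run
kubectl -n test set image deployment/podinfo '*=ghcr.io/stefanprodan/podinfo:6.0.1'
# follow the analysis
watch kubectl -n test get canary/podinfo
```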

View File

@ -0,0 +1,383 @@
---
weight: 7
title: Skipper Canary Deployments
type: docs
---
This guide shows you how to use the [Skipper ingress controller](https://opensource.zalando.com/skipper/kubernetes/ingress-controller/) and Flagger to automate canary deployments.
![Flagger Skipper Ingress Controller](/img/diagrams/flagger-skipper-overview.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.19** or newer and Skipper ingress **v0.13** or newer.
Install the Skipper ingress controller using the [upstream definition](https://opensource.zalando.com/skipper/kubernetes/ingress-controller/#install-skipper-as-ingress-controller).
The following arguments are relevant:
```yaml
- -enable-connection-metrics
- -histogram-metric-buckets=.01,1,10,100
- -kubernetes
- -kubernetes-in-cluster
- -kubernetes-path-mode=path-prefix
- -metrics-exp-decay-sample
- -metrics-flavour=prometheus
- -route-backend-metrics
- -route-backend-error-counters
- -route-response-metrics
- -serve-host-metrics
- -serve-route-metrics
- -whitelisted-healthcheck-cidr=0.0.0.0/0 # permit Kind source health checks
```
Install Flagger using kustomize:
```bash
kustomize build https://github.com/fluxcd/flagger/kustomize/kubernetes | kubectl apply -f -
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
then creates a series of objects (Kubernetes deployments, ClusterIP services and canary ingress).
These objects expose the application outside the cluster and drive the canary analysis and promotion.
Create a test namespace:
```bash
kubectl create ns test
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the canary analysis:
```bash
helm upgrade -i flagger-loadtester flagger/loadtester \
--namespace=test
```
Create an ingress definition \(replace `app.example.com` with your own domain\):
```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: podinfo
namespace: test
labels:
app: podinfo
annotations:
kubernetes.io/ingress.class: "skipper"
spec:
rules:
- host: "app.example.com"
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: podinfo
port:
number: 80
```
Save the above resource as podinfo-ingress.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-ingress.yaml
```
Create a canary custom resource (replace `app.example.com` with your own domain):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
provider: skipper
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# ingress reference
ingressRef:
apiVersion: networking.k8s.io/v1
kind: Ingress
name: podinfo
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
service:
# ClusterIP port number
port: 80
# container port number or name
targetPort: 9898
analysis:
# schedule interval (default 60s)
interval: 10s
# max number of failed metric checks before rollback
threshold: 10
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
# Skipper Prometheus checks
metrics:
- name: request-success-rate
interval: 1m
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
- name: request-duration
interval: 1m
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
webhooks:
- name: gate
type: confirm-rollout
url: http://flagger-loadtester.test/gate/approve
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 10s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary/token | grep token"
- name: load-test
type: rollout
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "hey -z 10m -q 10 -c 2 -host app.example.com http://skipper-ingress.kube-system"
logCmdOutput: "true"
```
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
ingress.networking.k8s.io/podinfo-ingress
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
ingress.networking.k8s.io/podinfo-canary
```
## Automated canary promotion
Flagger implements a control loop that gradually shifts traffic to the canary while measuring
key performance indicators like HTTP requests success rate, requests average duration and pod health.
Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack or MS Teams.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:4.0.6
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
New revision detected! Scaling up podinfo.test
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Advance podinfo.test canary weight 25
Advance podinfo.test canary weight 30
Advance podinfo.test canary weight 35
Advance podinfo.test canary weight 40
Advance podinfo.test canary weight 45
Advance podinfo.test canary weight 50
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Routing all traffic to primary
Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo-2 Progressing 30 2020-08-14T12:32:12Z
test podinfo Succeeded 0 2020-08-14T11:23:88Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors to test if Flagger pauses and rolls back the faulted version.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:4.0.6
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it deploy/flagger-loadtester bash
```
Generate HTTP 500 errors:
```bash
hey -z 1m -c 5 -q 5 http://app.example.com/status/500
```
Generate latency:
```bash
watch -n 1 curl http://app.example.com/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary,
the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n flagger-system logs deploy/flagger -f | jq .msg
New revision detected! Scaling up podinfo.test
Canary deployment podinfo.test not ready: waiting for rollout to finish: 0 of 1 updated replicas are available
Starting canary analysis for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Halt podinfo.test advancement success rate 53.42% < 99%
Halt podinfo.test advancement success rate 53.19% < 99%
Halt podinfo.test advancement success rate 48.05% < 99%
Rolling back podinfo.test failed checks threshold reached 3
Canary failed! Scaling down podinfo.test
```
## Custom metrics
The canary analysis can be extended with Prometheus queries.
Create a metric template and apply it on the cluster:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: latency
namespace: test
spec:
provider:
type: prometheus
address: http://flagger-prometheus.flagger-system:9090
query: |
histogram_quantile(0.99,
sum(
rate(
skipper_serve_route_duration_seconds_bucket{
route=~"{{ printf "kube(ew)?_%s__%s_canary__.*__%s_canary(_[0-9]+)?" namespace ingress service }}",
le="+Inf"
}[1m]
)
) by (le)
)
```
Edit the canary analysis and add the latency check:
```yaml
analysis:
metrics:
- name: "latency"
templateRef:
name: latency
thresholdRange:
max: 0.5
interval: 1m
```
The threshold is set to 500ms so if the average request duration in the last minute goes over half a second
then the analysis will fail and the canary will not be promoted.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:4.0.6
```
Generate high response latency:
```bash
watch curl http://app.example.com/delay/2
```
Watch Flagger logs:
```text
kubectl -n flagger-system logs deployment/flagger -f | jq .msg
Starting canary deployment for podinfo.test
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement latency 1.20 > 0.5
Halt podinfo.test advancement latency 1.45 > 0.5
Halt podinfo.test advancement latency 1.60 > 0.5
Halt podinfo.test advancement latency 1.69 > 0.5
Halt podinfo.test advancement latency 1.70 > 0.5
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
If you have alerting configured, Flagger will send a notification with the reason why the canary failed.

View File

@ -0,0 +1,374 @@
---
weight: 8
title: Traefik Canary Deployments
type: docs
---
This guide shows you how to use [Traefik](https://doc.traefik.io/traefik/) and Flagger to automate canary deployments.
![Flagger Traefik Overview](/img/diagrams/flagger-traefik-overview.png)
## Prerequisites
Flagger requires a Kubernetes cluster **v1.16** or newer and Traefik **v2.3** or newer.
Install Traefik with Helm v3:
```bash
helm repo add traefik https://helm.traefik.io/traefik
kubectl create ns traefik
cat <<EOF | helm upgrade -i traefik traefik/traefik --namespace traefik -f -
deployment:
podAnnotations:
prometheus.io/port: "9100"
prometheus.io/scrape: "true"
prometheus.io/path: "/metrics"
metrics:
prometheus:
entryPoint: metrics
EOF
```
Install Flagger and the Prometheus add-on in the same namespace as Traefik:
```bash
helm repo add flagger https://flagger.app
helm upgrade -i flagger flagger/flagger \
--namespace traefik \
--set prometheus.install=true \
--set meshProvider=traefik
```
## Bootstrap
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services and TraefikService\). These objects expose the application outside the cluster and drive the canary analysis and promotion.
Create a test namespace:
```bash
kubectl create ns test
```
Create a deployment and a horizontal pod autoscaler:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
```
Deploy the load testing service to generate traffic during the canary analysis:
```bash
helm upgrade -i flagger-loadtester flagger/loadtester \
--namespace=test
```
Create Traefik IngressRoute that references TraefikService generated by Flagger \(replace `app.example.com` with your own domain\):
```yaml
apiVersion: traefik.containo.us/v1alpha1
kind: IngressRoute
metadata:
name: podinfo
namespace: test
spec:
entryPoints:
- web
routes:
- match: Host(`app.example.com`)
kind: Rule
services:
- name: podinfo
kind: TraefikService
port: 80
```
Save the above resource as podinfo-ingressroute.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-ingressroute.yaml
```
Create a canary custom resource \(replace `app.example.com` with your own domain\):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
provider: traefik
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
# the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
service:
# ClusterIP port number
port: 80
# container port number or name
targetPort: 9898
analysis:
# schedule interval (default 60s)
interval: 10s
# max number of failed metric checks before rollback
threshold: 10
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 5
# Traefik Prometheus checks
metrics:
- name: request-success-rate
interval: 1m
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
- name: request-duration
interval: 1m
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
webhooks:
- name: acceptance-test
type: pre-rollout
url: http://flagger-loadtester.test/
timeout: 10s
metadata:
type: bash
cmd: "curl -sd 'test' http://podinfo-canary.test/token | grep token"
- name: load-test
type: rollout
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "hey -z 10m -q 10 -c 2 -host app.example.com http://traefik.traefik"
logCmdOutput: "true"
```
Save the above resource as podinfo-canary.yaml and then apply it:
```bash
kubectl apply -f ./podinfo-canary.yaml
```
After a couple of seconds Flagger will create the canary objects:
```bash
# applied
deployment.apps/podinfo
horizontalpodautoscaler.autoscaling/podinfo
canary.flagger.app/podinfo
# generated
deployment.apps/podinfo-primary
horizontalpodautoscaler.autoscaling/podinfo-primary
service/podinfo
service/podinfo-canary
service/podinfo-primary
traefikservice.traefik.containo.us/podinfo
```
## Automated canary promotion
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack or MS Teams.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:4.0.6
```
Flagger detects that the deployment revision changed and starts a new rollout:
```text
kubectl -n test describe canary/podinfo
Status:
Canary Weight: 0
Failed Checks: 0
Phase: Succeeded
Events:
New revision detected! Scaling up podinfo.test
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Advance podinfo.test canary weight 25
Advance podinfo.test canary weight 30
Advance podinfo.test canary weight 35
Advance podinfo.test canary weight 40
Advance podinfo.test canary weight 45
Advance podinfo.test canary weight 50
Copying podinfo.test template spec to podinfo-primary.test
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
Routing all traffic to primary
Promotion completed! Scaling down podinfo.test
```
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
You can monitor all canaries with:
```bash
watch kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo-2 Progressing 30 2020-08-14T12:32:12Z
test podinfo Succeeded 0 2020-08-14T11:23:88Z
```
## Automated rollback
During the canary analysis you can generate HTTP 500 errors to test if Flagger pauses and rolls back the faulted version.
Trigger another canary deployment:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:4.0.6
```
Exec into the load tester pod with:
```bash
kubectl -n test exec -it deploy/flagger-loadtester bash
```
Generate HTTP 500 errors:
```bash
hey -z 1m -c 5 -q 5 http://app.example.com/status/500
```
Generate latency:
```bash
watch -n 1 curl http://app.example.com/delay/1
```
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
```text
kubectl -n traefik logs deploy/flagger -f | jq .msg
New revision detected! Scaling up podinfo.test
Canary deployment podinfo.test not ready: waiting for rollout to finish: 0 of 1 updated replicas are available
Starting canary analysis for podinfo.test
Pre-rollout check acceptance-test passed
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Halt podinfo.test advancement success rate 53.42% < 99%
Halt podinfo.test advancement success rate 53.19% < 99%
Halt podinfo.test advancement success rate 48.05% < 99%
Rolling back podinfo.test failed checks threshold reached 3
Canary failed! Scaling down podinfo.test
```
## Custom metrics
The canary analysis can be extended with Prometheus queries.
Create a metric template and apply it on the cluster:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: not-found-percentage
namespace: test
spec:
provider:
type: prometheus
address: http://flagger-prometheus.traefik:9090
query: |
sum(
rate(
traefik_service_request_duration_seconds_bucket{
service=~"{{ namespace }}-{{ target }}-canary-[0-9a-zA-Z-]+@kubernetescrd",
code!="404",
}[{{ interval }}]
)
)
/
sum(
rate(
traefik_service_request_duration_seconds_bucket{
service=~"{{ namespace }}-{{ target }}-canary-[0-9a-zA-Z-]+@kubernetescrd",
}[{{ interval }}]
)
) * 100
```
Edit the canary analysis and add the not found error rate check:
```yaml
analysis:
metrics:
- name: "404s percentage"
templateRef:
name: not-found-percentage
thresholdRange:
max: 5
interval: 1m
```
The above configuration validates the canary by checking if the HTTP 404 req/sec percentage is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails.
Trigger a canary deployment by updating the container image:
```bash
kubectl -n test set image deployment/podinfo \
podinfod=stefanprodan/podinfo:4.0.6
```
Generate 404s:
```bash
watch curl http://app.example.com/status/404
```
Watch Flagger logs:
```text
kubectl -n traefik logs deployment/flagger -f | jq .msg
Starting canary deployment for podinfo.test
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Halt podinfo.test advancement 404s percentage 6.20 > 5
Halt podinfo.test advancement 404s percentage 6.45 > 5
Halt podinfo.test advancement 404s percentage 7.60 > 5
Halt podinfo.test advancement 404s percentage 8.69 > 5
Halt podinfo.test advancement 404s percentage 9.70 > 5
Rolling back podinfo.test failed checks threshold reached 5
Canary failed! Scaling down podinfo.test
```
If you have [alerting](../usage/alerting.md) configured, Flagger will send a notification with the reason why the canary failed.
For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md).

View File

@ -0,0 +1,189 @@
---
weight: 14
title: Zero downtime deployments
type: docs
---
This is a list of things you should consider when dealing with a high traffic production environment if you want to minimise the impact of rolling updates and downscaling.
## Deployment strategy
Limit the number of unavailable pods during a rolling update:
```yaml
apiVersion: apps/v1
kind: Deployment
spec:
progressDeadlineSeconds: 120
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 0
```
The default progress deadline for a deployment is ten minutes. You should consider adjusting this value to make the deployment process fail faster.
## Liveness health check
Your application should expose an HTTP endpoint that Kubernetes can call to determine whether your app has transitioned to a broken state from which it can't recover and needs to be restarted.
```yaml
livenessProbe:
exec:
command:
- wget
- --quiet
- --tries=1
- --timeout=4
- --spider
- http://localhost:8080/healthz
timeoutSeconds: 5
initialDelaySeconds: 5
```
If you've enabled mTLS, you'll have to use `exec` for liveness and readiness checks since kubelet is not part of the service mesh and doesn't have access to the TLS cert.
## Readiness health check
Your application should expose an HTTP endpoint that Kubernetes can call to determine if your app is ready to receive traffic.
```yaml
readinessProbe:
exec:
command:
- wget
- --quiet
- --tries=1
- --timeout=4
- --spider
- http://localhost:8080/readyz
timeoutSeconds: 5
initialDelaySeconds: 5
periodSeconds: 5
```
If your app depends on external services, you should check if those services are available before allowing Kubernetes to route traffic to an app instance. Keep in mind that the Envoy sidecar can have a slower startup than your app. This means that on application start you should retry any external connection for at least a couple of seconds.
## Graceful shutdown
Before a pod gets terminated, Kubernetes sends a `SIGTERM` signal to every container and waits for a period of time \(30s by default\) for all containers to exit gracefully. If your app doesn't handle the `SIGTERM` signal or if it doesn't exit within the grace period, Kubernetes will kill the container and any inflight requests that your app is processing will fail.
```yaml
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
terminationGracePeriodSeconds: 60
containers:
- name: app
lifecycle:
preStop:
exec:
command:
- sleep
- "10"
```
Your app container should have a `preStop` hook that delays the container shutdown. This will allow the service mesh to drain the traffic and remove this pod from all other Envoy sidecars before your app becomes unavailable.
## Delay Envoy shutdown
Even if your app reacts to `SIGTERM` and tries to complete the inflight requests before shutdown, that doesn't mean that the response will make it back to the caller. If the Envoy sidecar shuts down before your app, then the caller will receive a 503 error.
To mitigate this issue you can add a `preStop` hook to the Istio proxy and wait for the main app to exit before Envoy exits.
```bash
#!/bin/bash
set -e
if ! pidof envoy &>/dev/null; then
exit 0
fi
if ! pidof pilot-agent &>/dev/null; then
exit 0
fi
while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l | xargs) -ne 0 ]; do
sleep 1;
done
exit 0
```
You'll have to build your own Envoy docker image with the above script and modify the Istio injection webhook with the `preStop` directive.
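As an illustration only, the injected `istio-proxy` container would then carry a `preStop` hook that runs the wait script; the image name and script path below are assumptions, not official Istio settings.
```yaml
containers:
  - name: istio-proxy
    # custom proxy image that ships the wait script above (assumed name)
    image: your-registry/istio-proxyv2:custom
    lifecycle:
      preStop:
        exec:
          command:
            - /bin/bash
            - -c
            - /usr/local/bin/wait-for-app-exit.sh
```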
Thanks to Stono for his excellent [tips](https://github.com/istio/istio/issues/12183) on minimising 503s.
## Resource requests and limits
Setting CPU and memory requests/limits for all workloads is a mandatory step if you're running a production system. Without limits your nodes could run out of memory or become unresponsive due to CPU exhaustion. Without CPU and memory requests, the Kubernetes scheduler will not be able to make informed decisions about which nodes to place pods on.
```yaml
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
containers:
- name: app
resources:
limits:
cpu: 1000m
memory: 1Gi
requests:
cpu: 100m
memory: 128Mi
```
Note that without resource requests the horizontal pod autoscaler can't determine when to scale your app.
## Autoscaling
A production environment should be able to handle traffic bursts without impacting the quality of service. This can be achieved with Kubernetes autoscaling capabilities. Autoscaling in Kubernetes has two dimensions: the Cluster Autoscaler that deals with node scaling operations and the Horizontal Pod Autoscaler that automatically scales the number of pods in a deployment.
```yaml
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: app
minReplicas: 2
maxReplicas: 4
metrics:
- type: Resource
resource:
name: cpu
targetAverageValue: 900m
- type: Resource
resource:
name: memory
targetAverageValue: 768Mi
```
The above HPA ensures your app will be scaled up before the pods reach the CPU or memory limits.
## Ingress retries
To minimise the impact of downscaling operations you can make use of Envoy retry capabilities.
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
spec:
service:
port: 9898
gateways:
- public-gateway.istio-system.svc.cluster.local
hosts:
- app.example.com
retries:
attempts: 10
perTryTimeout: 5s
retryOn: "gateway-error,connect-failure,refused-stream"
```
When the HPA scales down your app, your users could run into 503 errors. The above configuration will make Envoy retry the HTTP requests that failed due to gateway errors.

View File

@ -0,0 +1,5 @@
---
type: docs
weight: 3
title: Usage
---

View File

@ -0,0 +1,151 @@
---
type: docs
weight: 4
title: Alerting
---
Flagger can be configured to send alerts to various chat platforms.
You can define a global alert provider at install time or configure alerts on a per canary basis.
## Global configuration
### Slack
#### Slack Configuration
Flagger requires a custom webhook integration from Slack, instead of the new Slack app system.
The webhook can be generated by following the [legacy Slack documentation](https://api.slack.com/legacy/custom-integrations/messaging/webhooks).
#### Flagger configuration
Once the webhook has been generated, Flagger can be configured to send Slack notifications:
```bash
helm upgrade -i flagger flagger/flagger \
--set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
--set slack.proxy=my-http-proxy.com \ # optional http/s proxy
--set slack.channel=general \
--set slack.user=flagger \
--set clusterName=my-cluster
```
Once configured with a Slack incoming **webhook**,
Flagger will post messages when a canary deployment has been initialised,
when a new revision has been detected and if the canary analysis failed or succeeded.
![Slack Notifications](/img/screens/slack-canary-notifications.png)
A canary deployment will be rolled back if the progress deadline is exceeded
or if the analysis reaches the maximum number of failed checks:
![Slack Notifications](/img/screens/slack-canary-failed.png)
### Microsoft Teams
Flagger can be configured to send notifications to Microsoft Teams:
```bash
helm upgrade -i flagger flagger/flagger \
--set msteams.url=https://outlook.office.com/webhook/YOUR/TEAMS/WEBHOOK \
--set msteams.proxy-url=my-http-proxy.com # optional http/s proxy
```
Similar to Slack, Flagger alerts on canary analysis events:
![MS Teams Notifications](/img/screens/flagger-ms-teams-notifications.png)
![MS Teams Notifications](/img/screens/flagger-ms-teams-failed.png)
## Canary configuration
Configuring alerting globally has several limitations as it's not possible to specify different channels
or configure the verbosity on a per canary basis. To make the alerting more flexible,
the canary analysis can be extended with a list of alerts that reference an alert provider.
For each alert, users can configure the severity level. The alerts section overrides the global setting.
Slack example:
```yaml
apiVersion: flagger.app/v1beta1
kind: AlertProvider
metadata:
name: on-call
namespace: flagger
spec:
type: slack
channel: on-call-alerts
username: flagger
# webhook address (ignored if secretRef is specified)
address: https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
# optional http/s proxy
proxy: http://my-http-proxy.com
# secret containing the webhook address (optional)
secretRef:
name: on-call-url
---
apiVersion: v1
kind: Secret
metadata:
name: on-call-url
namespace: flagger
data:
address: <encoded-url>
```
The alert provider **type** can be: `slack`, `msteams`, `rocket` or `discord`. When set to `discord`,
Flagger will use [Slack formatting](https://birdie0.github.io/discord-webhooks-guide/other/slack_formatting.html)
and will append `/slack` to the Discord address.
When not specified, **channel** defaults to `general` and **username** defaults to `flagger`.
When **secretRef** is specified, the Kubernetes secret must contain a data field named `address`,
the address in the secret will take precedence over the **address** field in the provider spec.
The canary analysis can have a list of alerts, each alert referencing an alert provider:
```yaml
analysis:
alerts:
- name: "on-call Slack"
severity: error
providerRef:
name: on-call
namespace: flagger
- name: "qa Discord"
severity: warn
providerRef:
name: qa-discord
- name: "dev MS Teams"
severity: info
providerRef:
name: dev-msteams
```
Alert fields:
* **name** \(required\)
* **severity** levels: `info`, `warn`, `error` (default info)
* **providerRef.name** alert provider name (required)
* **providerRef.namespace** alert provider namespace (defaults to the canary namespace)
When the severity is set to `warn`, Flagger will alert when waiting on manual confirmation or if the analysis fails.
When the severity is set to `error`, Flagger will alert only if the canary analysis fails.
To differentiate alerts based on the cluster name, you can configure Flagger with the `-cluster-name=my-cluster`
command flag, or with Helm `--set clusterName=my-cluster`.
## Prometheus Alert Manager
You can use Alertmanager to trigger alerts when a canary deployment fails:
```yaml
- alert: canary_rollback
expr: flagger_canary_status > 1
for: 1m
labels:
severity: warning
annotations:
summary: "Canary failed"
description: "Workload {{ $labels.name }} namespace {{ $labels.namespace }}"
```

View File

@ -0,0 +1,399 @@
---
type: docs
weight: 2
title: Deployment Strategies
---
Flagger can run automated application analysis, promotion and rollback for the following deployment strategies:
* **Canary Release** \(progressive traffic shifting\)
* Istio, Linkerd, App Mesh, NGINX, Skipper, Contour, Gloo Edge, Traefik, Open Service Mesh, Kuma, Gateway API
* **A/B Testing** \(HTTP headers and cookies traffic routing\)
* Istio, App Mesh, NGINX, Contour, Gloo Edge, Gateway API
* **Blue/Green** \(traffic switching\)
* Kubernetes CNI, Istio, Linkerd, App Mesh, NGINX, Contour, Gloo Edge, Open Service Mesh, Gateway API
* **Blue/Green Mirroring** \(traffic shadowing\)
* Istio
For Canary releases and A/B testing you'll need a Layer 7 traffic management solution like
a service mesh or an ingress controller. For Blue/Green deployments no service mesh or ingress controller is required.
A canary analysis is triggered by changes in any of the following objects:
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
* ConfigMaps mounted as volumes or mapped to environment variables
* Secrets mounted as volumes or mapped to environment variables
## Canary Release
Flagger implements a control loop that gradually shifts traffic to the canary while measuring
key performance indicators like HTTP requests success rate, requests average duration and pod health.
Based on analysis of the KPIs a canary is promoted or aborted.
![Flagger Canary Stages](/img/diagrams/flagger-canary-steps.png)
The canary analysis runs periodically until it reaches the maximum traffic weight or the failed checks threshold.
Spec:
```yaml
analysis:
# schedule interval (default 60s)
interval: 1m
# max number of failed metric checks before rollback
threshold: 10
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 50
# canary increment step
# percentage (0-100)
stepWeight: 2
# promotion increment step (default 100)
# percentage (0-100)
stepWeightPromotion: 100
# deploy straight to production without
# the metrics and webhook checks
skipAnalysis: false
```
The above analysis, if it succeeds, will run for 25 minutes while validating the HTTP metrics and webhooks every minute.
You can determine the minimum time it takes to validate and promote a canary deployment using this formula:
```text
interval * (maxWeight / stepWeight)
```
And the time it takes for a canary to be rolled back when the metrics or webhook checks are failing:
```text
interval * threshold
```
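Plugging in the values from the spec above (interval 1m, maxWeight 50, stepWeight 2, threshold 10):
```text
promotion: 1m * (50 / 2) = 25 minutes
rollback:  1m * 10       = 10 minutes
```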
When `stepWeightPromotion` is specified, the promotion phase happens in stages: the traffic is routed back
to the primary pods in a progressive manner and the primary weight is increased until it reaches 100%.
In emergency cases, you may want to skip the analysis phase and ship changes directly to production.
At any time you can set the `spec.skipAnalysis: true`. When skip analysis is enabled,
Flagger checks if the canary deployment is healthy and promotes it without analysing it.
If an analysis is underway, Flagger cancels it and runs the promotion.
Gated canary promotion stages:
* scan for canary deployments
* check primary and canary deployment status
* halt advancement if a rolling update is underway
* halt advancement if pods are unhealthy
* call confirm-rollout webhooks and check results
* halt advancement if any hook returns a non HTTP 2xx result
* call pre-rollout webhooks and check results
* halt advancement if any hook returns a non HTTP 2xx result
* increment the failed checks counter
* increase canary traffic weight percentage from 0% to 2% \(step weight\)
* call rollout webhooks and check results
* check canary HTTP request success rate and latency
* halt advancement if any metric is under the specified threshold
* increment the failed checks counter
* check if the number of failed checks reached the threshold
* route all traffic to primary
* scale to zero the canary deployment and mark it as failed
* call post-rollout webhooks
* post the analysis result to Slack
* wait for the canary deployment to be updated and start over
* increase canary traffic weight by 2% \(step weight\) till it reaches 50% \(max weight\)
* halt advancement if any webhook call fails
* halt advancement while canary request success rate is under the threshold
* halt advancement while canary request duration P99 is over the threshold
* halt advancement while any custom metric check fails
* halt advancement if the primary or canary deployment becomes unhealthy
* halt advancement while canary deployment is being scaled up/down by HPA
* call confirm-promotion webhooks and check results
* halt advancement if any hook returns a non HTTP 2xx result
* promote canary to primary
* copy ConfigMaps and Secrets from canary to primary
* copy canary deployment spec template over primary
* wait for primary rolling update to finish
* halt advancement if pods are unhealthy
* route all traffic to primary
* scale to zero the canary deployment
* mark rollout as finished
* call post-rollout webhooks
* send notification with the canary analysis result
* wait for the canary deployment to be updated and start over
### Rollout Weights
By default Flagger uses linear weight values for the promotion, with the start value,
the step and the maximum weight value in 0 to 100 range.
Example:
```yaml
canary:
analysis:
promotion:
maxWeight: 50
stepWeight: 20
```
This configuration performs analysis starting from 20, increasing by 20 until weight goes above 50.
We would have steps (canary weight : primary weight):
* 20 (20 : 80)
* 40 (40 : 60)
* 60 (60 : 40)
* promotion
In order to enable non-linear promotion a new parameter was introduced:
* `stepWeights` - determines the ordered array of weights, which shall be used during canary promotion.
Example:
```yaml
canary:
analysis:
promotion:
stepWeights: [1, 2, 10, 80]
```
This configuration performs analysis starting from 1, going through `stepWeights` values till 80.
We would have steps (canary weight : primary weight):
* 1 (1 : 99)
* 2 (2 : 98)
* 10 (10 : 90)
* 80 (80 : 20)
* promotion
## A/B Testing
For frontend applications that require session affinity you should use
HTTP headers or cookies match conditions to ensure a set of users
will stay on the same version for the whole duration of the canary analysis.
![Flagger A/B Testing Stages](/img/diagrams/flagger-abtest-steps.png)
You can enable A/B testing by specifying the HTTP match conditions and the number of iterations.
If Flagger finds a HTTP match condition, it will ignore the `maxWeight` and `stepWeight` settings.
Istio example:
```yaml
analysis:
# schedule interval (default 60s)
interval: 1m
# total number of iterations
iterations: 10
# max number of failed iterations before rollback
threshold: 2
# canary match condition
match:
- headers:
x-canary:
regex: ".*insider.*"
- headers:
cookie:
regex: "^(.*?;)?(canary=always)(;.*)?$"
```
The above configuration will run an analysis for ten minutes targeting users whose `x-canary` header matches `insider` or who carry the `canary=always` cookie.
You can determine the minimum time that it takes to validate and promote a canary deployment using this formula:
```text
interval * iterations
```
And the time it takes for a canary to be rolled back when the metrics or webhook checks are failing:
```text
interval * threshold
```
Istio example:
```yaml
analysis:
interval: 1m
threshold: 10
iterations: 2
match:
- headers:
x-canary:
exact: "insider"
- headers:
cookie:
regex: "^(.*?;)?(canary=always)(;.*)?$"
- sourceLabels:
app.kubernetes.io/name: "scheduler"
```
The header keys must be lowercase and use hyphen as the separator.
Header values are case-sensitive and formatted as follows:
* `exact: "value"` for exact string match
* `prefix: "value"` for prefix-based match
* `suffix: "value"` for suffix-based match
* `regex: "value"` for [RE2](https://github.com/google/re2/wiki/Syntax) style regex-based match
Note that the `sourceLabels` match conditions are applicable only when
the `mesh` gateway is included in the `canary.service.gateways` list.
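For example, a service section that includes the reserved `mesh` gateway (which targets in-mesh sidecar traffic) alongside a public gateway would make the `sourceLabels` conditions take effect; the gateway and host names are illustrative.
```yaml
service:
  port: 9898
  gateways:
    - mesh
    - public-gateway.istio-system.svc.cluster.local
  hosts:
    - app.example.com
```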
App Mesh example:
```yaml
analysis:
interval: 1m
threshold: 10
iterations: 2
match:
- headers:
user-agent:
regex: ".*Chrome.*"
```
Note that App Mesh supports a single condition.
Contour example:
```yaml
analysis:
interval: 1m
threshold: 10
iterations: 2
match:
- headers:
user-agent:
prefix: "Chrome"
```
Note that Contour does not support regex; you can use prefix, suffix or exact matching.
NGINX example:
```yaml
analysis:
interval: 1m
threshold: 10
iterations: 2
match:
- headers:
x-canary:
exact: "insider"
- headers:
cookie:
exact: "canary"
```
Note that the NGINX ingress controller supports only exact matching for
cookie names, and the value must be set to `always`.
Starting with NGINX ingress v0.31, regex matching is supported for header values.
The above configurations will route users with the x-canary header
or canary cookie to the canary instance during analysis:
```bash
curl -H 'X-Canary: insider' http://app.example.com
curl -b 'canary=always' http://app.example.com
```
## Blue/Green Deployments
For applications that are not deployed on a service mesh,
Flagger can orchestrate blue/green style deployments with Kubernetes L4 networking.
When using Istio you have the option to mirror traffic between blue and green.
![Flagger Blue/Green Stages](/img/diagrams/flagger-bluegreen-steps.png)
You can use the blue/green deployment strategy by replacing
`stepWeight/maxWeight` with `iterations` in the `analysis` spec:
```yaml
analysis:
# schedule interval (default 60s)
interval: 1m
# total number of iterations
iterations: 10
# max number of failed iterations before rollback
threshold: 2
```
With the above configuration Flagger will run conformance and load tests on the canary pods for ten minutes.
If the metrics analysis succeeds, live traffic will be switched from
the old version to the new one when the canary is promoted.
The blue/green deployment strategy is supported for all service mesh providers.
Blue/Green rollout steps for service mesh:
* detect new revision (deployment spec, secrets or configmaps changes)
* scale up the canary (green)
* run conformance tests for the canary pods
* run load tests and metric checks for the canary pods every minute
* abort the canary release if the failure threshold is reached
* route traffic to canary (This doesn't happen when using the kubernetes provider)
* promote canary spec over primary (blue)
* wait for primary rollout
* route traffic to primary
* scale down canary
After the analysis finishes, the traffic is routed to the canary (green) before
triggering the primary (blue) rolling update.
This ensures a smooth transition to the new version and avoids dropping
in-flight requests during the Kubernetes deployment rollout.
## Blue/Green with Traffic Mirroring
Traffic Mirroring is a pre-stage in a Canary (progressive traffic shifting) or Blue/Green deployment strategy.
Traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service.
The response from the primary is sent back to the user. The response from the canary is discarded.
Metrics are collected on both requests so that the deployment will only proceed if the canary metrics are healthy.
Mirroring should be used for requests that are **idempotent** or capable of being processed
twice (once by the primary and once by the canary).
Reads are idempotent. Before using mirroring on requests that may be writes,
you should consider what will happen if a write is duplicated and handled by the primary and canary.
To use mirroring, set `spec.analysis.mirror` to `true`.
Istio example:
```yaml
analysis:
# schedule interval (default 60s)
interval: 1m
# total number of iterations
iterations: 10
# max number of failed iterations before rollback
threshold: 2
# Traffic shadowing (compatible with Istio only)
mirror: true
# Weight of the traffic mirrored to your canary (defaults to 100%)
mirrorWeight: 100
```
Mirroring rollout steps for service mesh:
* detect new revision (deployment spec, secrets or configmaps changes)
* scale from zero the canary deployment
* wait for the HPA to set the canary minimum replicas
* check canary pods health
* run the acceptance tests
* abort the canary release if tests fail
* start the load tests
* mirror 100% of the traffic from primary to canary
* check request success rate and request duration every minute
* abort the canary release if the failure threshold is reached
* stop traffic mirroring after the number of iterations is reached
* route live traffic to the canary pods
* promote the canary \(update the primary secrets, configmaps and deployment spec\)
* wait for the primary deployment rollout to finish
* wait for the HPA to set the primary minimum replicas
* check primary pods health
* switch live traffic back to primary
* scale to zero the canary
* send notification with the canary analysis result
After the analysis finishes, the traffic is routed to the canary (green) before
triggering the primary (blue) rolling update. This ensures a smooth transition
to the new version and avoids dropping in-flight requests during the Kubernetes deployment rollout.

View File

@ -0,0 +1,373 @@
---
type: docs
weight: 1
title: How it works
---
[Flagger](https://github.com/fluxcd/flagger) can be configured to automate the release process for
Kubernetes workloads with a custom resource named canary.
## Canary resource
The canary custom resource defines the release process of an application running on Kubernetes and is
portable across clusters, service meshes and ingress providers.
For a deployment named _podinfo_, a canary release with progressive traffic shifting can be defined as:
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
spec:
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
service:
port: 9898
analysis:
interval: 1m
threshold: 10
maxWeight: 50
stepWeight: 5
metrics:
- name: request-success-rate
thresholdRange:
min: 99
interval: 1m
- name: request-duration
thresholdRange:
max: 500
interval: 1m
webhooks:
- name: load-test
url: http://flagger-loadtester.test/
metadata:
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
```
When you deploy a new version of an app, Flagger gradually shifts traffic to the canary, and at the same time,
measures the request success rate as well as the average response duration.
You can extend the canary analysis with custom metrics,
acceptance and load testing to harden the validation process of your app release process.
If you are running multiple service meshes or ingress controllers in the same cluster,
you can override the global provider for a specific canary with `spec.provider`.
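For example, assuming Linkerd is also installed in the cluster, a canary could pin itself to it regardless of the global setting (a minimal sketch, analysis omitted for brevity):
```yaml
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: podinfo
  namespace: test
spec:
  # overrides the mesh/ingress provider Flagger was installed with
  provider: linkerd
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: podinfo
  service:
    port: 9898
  # analysis: ...
```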
## Canary target
A canary resource can target a Kubernetes Deployment or DaemonSet.
Kubernetes Deployment example:
```yaml
spec:
progressDeadlineSeconds: 60
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
autoscalerRef:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
name: podinfo
```
Based on the above configuration, Flagger generates the following Kubernetes objects:
* `deployment/<targetRef.name>-primary`
* `hpa/<autoscalerRef.name>-primary`
The primary deployment is considered the stable release of your app,
by default all traffic is routed to this version and the target deployment is scaled to zero.
Flagger will detect changes to the target deployment (including secrets and configmaps)
and will perform a canary analysis before promoting the new version as primary.
**Note** that the target deployment must have a single label selector in the format `app: <DEPLOYMENT-NAME>`:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: podinfo
spec:
selector:
matchLabels:
app: podinfo
template:
metadata:
labels:
app: podinfo
```
In addition to `app`, Flagger supports `name` and `app.kubernetes.io/name` selectors.
If you use a different convention you can specify your label with the `-selector-labels=my-app-label`
command flag in the Flagger deployment manifest under containers args
or by setting `--set selectorLabels=my-app-label` when installing Flagger with Helm.
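For instance, if your workloads use the `my-app-label` convention from the flag example above, the target deployment would be labelled like this sketch:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: podinfo
spec:
  selector:
    matchLabels:
      # matched because Flagger runs with -selector-labels=my-app-label
      my-app-label: podinfo
  template:
    metadata:
      labels:
        my-app-label: podinfo
```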
If the target deployment uses secrets and/or configmaps,
Flagger will create a copy of each object using the `-primary` suffix
and will reference these objects in the primary deployment.
If you annotate your ConfigMap or Secret with `flagger.app/config-tracking: disabled`,
Flagger will use the same object for the primary deployment instead of making a primary copy.
You can disable the secrets/configmaps tracking globally with the `-enable-config-tracking=false`
command flag in the Flagger deployment manifest under containers args
or by setting `--set configTracking.enabled=false` when installing Flagger with Helm,
but disabling config-tracking using the per Secret/ConfigMap annotation may fit your use-case better.
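For example, a ConfigMap excluded from config tracking would carry the annotation like this (the name and data are illustrative):
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: podinfo-config
  annotations:
    # Flagger will not create a -primary copy of this object
    flagger.app/config-tracking: disabled
data:
  color: blue
```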
The autoscaler reference is optional, when specified,
Flagger will pause the traffic increase while the target and primary deployments are scaled up or down.
HPA can help reduce the resource usage during the canary analysis.
When the autoscaler reference is specified, any changes made to the autoscaler are only made active
in the primary autoscaler when a rollout for the deployment starts and completes successfully.
Optionally, you can create two HPAs, one for canary and one for the primary to update the HPA without
doing a new rollout. As the canary deployment will be scaled to 0, the HPA on the canary will be inactive.
The progress deadline represents the maximum time in seconds for the canary deployment to
make progress before it is rolled back; it defaults to ten minutes.
## Canary service
A canary resource dictates how the target workload is exposed inside the cluster.
The canary target should expose a TCP port that will be used by Flagger to create the ClusterIP Services.
```yaml
spec:
service:
name: podinfo
port: 9898
portName: http
targetPort: 9898
portDiscovery: true
```
The container port from the target workload should match the `service.port` or `service.targetPort`.
The `service.name` is optional, defaults to `spec.targetRef.name`.
The `service.targetPort` can be a container port number or name.
The `service.portName` is optional (defaults to `http`), if your workload uses gRPC then set the port name to `grpc`.
If port discovery is enabled, Flagger scans the target workload and extracts the container ports
excluding the port specified in the canary service and service mesh sidecar ports.
These ports will be used when generating the ClusterIP services.
Based on the canary spec service, Flagger creates the following Kubernetes ClusterIP service:
* `<service.name>.<namespace>.svc.cluster.local`
selector `app=<name>-primary`
* `<service.name>-primary.<namespace>.svc.cluster.local`
selector `app=<name>-primary`
* `<service.name>-canary.<namespace>.svc.cluster.local`
selector `app=<name>`
This ensures that traffic to `podinfo.test:9898` will be routed to the latest stable release of your app.
The `podinfo-canary.test:9898` address is available only during the canary analysis
and can be used for conformance testing or load testing.
You can configure Flagger to set annotations and labels for the generated services with:
```yaml
spec:
service:
port: 9898
apex:
annotations:
test: "test"
labels:
test: "test"
canary:
annotations:
test: "test"
labels:
test: "test"
primary:
annotations:
test: "test"
labels:
test: "test"
```
Note that the `apex` annotations are added to both the generated Kubernetes Service and the
generated service mesh/ingress object. This allows using external-dns with Istio `VirtualServices`
and `TraefikServices`. Beware of configuration conflicts [here](../faq.md#ExternalDNS).
Besides port mapping and metadata, the service specification can
contain URI match and rewrite rules, timeout and retry polices:
```yaml
spec:
service:
port: 9898
match:
- uri:
prefix: /
rewrite:
uri: /
retries:
attempts: 3
perTryTimeout: 1s
timeout: 5s
```
When using **Istio** as the mesh provider, you can also specify HTTP header operations,
CORS and traffic policies, Istio gateways and hosts.
The Istio routing configuration can be found [here](../faq.md#istio-routing).
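As a rough sketch of what those Istio-specific options can look like on the canary service (the values below are illustrative, see the routing FAQ for the full reference):
```yaml
spec:
  service:
    port: 9898
    gateways:
      - public-gateway.istio-system.svc.cluster.local
    hosts:
      - app.example.com
    trafficPolicy:
      tls:
        # enable mTLS between the mesh and the workload
        mode: ISTIO_MUTUAL
    headers:
      request:
        add:
          x-envoy-upstream-rq-timeout-ms: "15000"
```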
## Canary status
You can use kubectl to get the current status of canary deployments cluster wide:
```bash
kubectl get canaries --all-namespaces
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
test podinfo Progressing 15 2019-06-30T14:05:07Z
prod frontend Succeeded 0 2019-06-30T16:15:07Z
prod backend Failed 0 2019-06-30T17:05:07Z
```
The status condition reflects the last known state of the canary analysis:
```bash
kubectl -n test get canary/podinfo -oyaml | awk '/status/,0'
```
A successful rollout status:
```yaml
status:
canaryWeight: 0
failedChecks: 0
iterations: 0
lastAppliedSpec: "14788816656920327485"
lastPromotedSpec: "14788816656920327485"
conditions:
- lastTransitionTime: "2019-07-10T08:23:18Z"
lastUpdateTime: "2019-07-10T08:23:18Z"
message: Canary analysis completed successfully, promotion finished.
reason: Succeeded
status: "True"
type: Promoted
```
The `Promoted` status condition can have one of the following reasons:
Initialized, Waiting, Progressing, WaitingPromotion, Promoting, Finalising, Succeeded or Failed.
A failed canary will have the promoted status set to `false`,
the reason set to `Failed` and the last applied spec will differ from the last promoted one.
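A failed rollout status might look roughly like this; the field values below are illustrative, not actual Flagger output:
```yaml
status:
  canaryWeight: 0
  failedChecks: 10
  lastAppliedSpec: "7b6cd9d958"             # differs from the promoted spec
  lastPromotedSpec: "14788816656920327485"
  conditions:
    - message: Canary analysis failed, canary scaled down to zero.
      reason: Failed
      status: "False"
      type: Promoted
```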
Wait for a successful rollout:
```bash
kubectl wait canary/podinfo --for=condition=promoted
```
CI example:
```bash
# update the container image
kubectl set image deployment/podinfo podinfod=stefanprodan/podinfo:3.0.1
# wait for Flagger to detect the change
ok=false
until ${ok}; do
kubectl get canary/podinfo | grep 'Progressing' && ok=true || ok=false
sleep 5
done
# wait for the canary analysis to finish
kubectl wait canary/podinfo --for=condition=promoted --timeout=5m
# check if the deployment was successful
kubectl get canary/podinfo | grep Succeeded
```
## Canary finalizers
The default behavior of Flagger on canary deletion is to leave resources that aren't owned
by the controller in their current state.
This simplifies the deletion action and avoids possible deadlocks during resource finalization.
In the event the canary was introduced with existing resource(s) (e.g. a service, virtual service, etc.),
they would be mutated during the initialization phase and no longer reflect their initial state.
If the desired functionality upon deletion is to revert the resources to their initial state,
the `revertOnDeletion` attribute can be enabled.
```yaml
spec:
revertOnDeletion: true
```
When a deletion action is submitted to the cluster, Flagger will attempt to revert the following resources:
* [Canary target](how-it-works.md#canary-target) replicas will be updated to the primary replica count
* [Canary service](how-it-works.md#canary-service) selector will be reverted
* Mesh/Ingress traffic routed to the target
The recommended approach to disable canary analysis is to use the `skipAnalysis` attribute,
which limits the need for resource reconciliation.
The `revertOnDeletion` attribute should only be enabled when
you no longer plan to rely on Flagger for deployment management.
**Note** that when this feature is enabled, expect a delay in the delete action due to the reconciliation.
## Canary analysis
The canary analysis defines:
* the type of [deployment strategy](deployment-strategies.md)
* the [metrics](metrics.md) used to validate the canary version
* the [webhooks](webhooks.md) used for conformance testing, load testing and manual gating
* the [alerting settings](alerting.md)
Spec:
```yaml
analysis:
# schedule interval (default 60s)
interval:
# max number of failed metric checks before rollback
threshold:
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight:
# canary increment step
# percentage (0-100)
stepWeight:
# promotion increment step
# percentage (0-100)
stepWeightPromotion:
# total number of iterations
# used for A/B Testing and Blue/Green
iterations:
# threshold of primary pods that need to be available to consider it ready
# before starting rollout. this is optional and the default is 100
# percentage (0-100)
primaryReadyThreshold: 100
# threshold of canary pods that need to be available to consider it ready
# before starting rollout. this is optional and the default is 100
# percentage (0-100)
canaryReadyThreshold: 100
# canary match conditions
# used for A/B Testing
match:
- # HTTP header
# key performance indicators
metrics:
- # metric check
# alerting
alerts:
- # alert provider
# external checks
webhooks:
- # hook
```
The canary analysis runs periodically until it reaches the maximum traffic weight or the number of iterations.
On each run, Flagger calls the webhooks, checks the metrics and if the failed checks threshold is reached,
stops the analysis and rolls back the canary.
If alerting is configured, Flagger will post the analysis result using the alert providers.

View File

@ -0,0 +1,617 @@
---
type: docs
weight: 3
title: Metrics Analysis
---
As part of the analysis process, Flagger can validate service level objectives
(SLOs) like availability, error rate percentage, average response time and any other objective
based on app specific metrics.
If a drop in performance is noticed during the SLOs analysis,
the release will be automatically rolled back with minimum impact to end-users.
## Builtin metrics
Flagger comes with two builtin metric checks: HTTP request success rate and duration.
```yaml
analysis:
metrics:
- name: request-success-rate
interval: 1m
# minimum req success rate (non 5xx responses)
# percentage (0-100)
thresholdRange:
min: 99
- name: request-duration
interval: 1m
# maximum req duration P99
# milliseconds
thresholdRange:
max: 500
```
For each metric you can specify a range of accepted values with `thresholdRange` and
the window size or the time series with `interval`.
The builtin checks are available for every service mesh / ingress controller
and are implemented with [Prometheus queries](../faq.md#metrics).
## Custom metrics
The canary analysis can be extended with custom metric checks.
Using a `MetricTemplate` custom resource,
you configure Flagger to connect to a metric provider and run a query that returns a `float64` value.
The query result is used to validate the canary based on the specified threshold range.
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: my-metric
spec:
provider:
type: # can be prometheus, datadog, etc
address: # API URL
insecureSkipVerify: # if set to true, disables the TLS cert validation
secretRef:
name: # name of the secret containing the API credentials
query: # metric query
```
The following variables are available in query templates:
* `name` (canary.metadata.name)
* `namespace` (canary.metadata.namespace)
* `target` (canary.spec.targetRef.name)
* `service` (canary.spec.service.name)
* `ingress` (canary.spec.ingressRef.name)
* `interval` (canary.spec.analysis.metrics[].interval)
A canary analysis metric can reference a template with `templateRef`:
```yaml
analysis:
metrics:
- name: "my metric"
templateRef:
name: my-metric
# namespace is optional
# when not specified, the canary namespace will be used
namespace: flagger
# accepted values
thresholdRange:
min: 10
max: 1000
# metric query time window
interval: 1m
```
## Prometheus
You can create custom metric checks targeting a Prometheus server by
setting the provider type to `prometheus` and writing the query in PromQL.
Prometheus template example:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: not-found-percentage
namespace: istio-system
spec:
provider:
type: prometheus
address: http://prometheus.istio-system:9090
query: |
100 - sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="{{ namespace }}",
destination_workload="{{ target }}",
response_code!="404"
}[{{ interval }}]
)
)
/
sum(
rate(
istio_requests_total{
reporter="destination",
destination_workload_namespace="{{ namespace }}",
destination_workload="{{ target }}"
}[{{ interval }}]
)
) * 100
```
Reference the template in the canary analysis:
```yaml
analysis:
metrics:
- name: "404s percentage"
templateRef:
name: not-found-percentage
namespace: istio-system
thresholdRange:
max: 5
interval: 1m
```
The above configuration validates the canary by checking if the HTTP 404 req/sec percentage
is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails.
Prometheus gRPC error rate example:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: grpc-error-rate-percentage
namespace: flagger
spec:
provider:
type: prometheus
address: http://flagger-prometheus.flagger-system:9090
query: |
100 - sum(
rate(
grpc_server_handled_total{
grpc_code!="OK",
kubernetes_namespace="{{ namespace }}",
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
}[{{ interval }}]
)
)
/
sum(
rate(
grpc_server_started_total{
kubernetes_namespace="{{ namespace }}",
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
}[{{ interval }}]
)
) * 100
```
The above template is for gRPC services instrumented with
[go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus).
## Prometheus authentication
If your Prometheus API requires basic authentication, you can create a secret in the same namespace
as the `MetricTemplate` with the basic-auth credentials:
```yaml
apiVersion: v1
kind: Secret
metadata:
name: prom-basic-auth
namespace: flagger
data:
username: your-user
password: your-password
```
Then reference the secret in the `MetricTemplate`:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: my-metric
namespace: flagger
spec:
provider:
type: prometheus
address: http://prometheus.monitoring:9090
secretRef:
name: prom-basic-auth
```
## Datadog
You can create custom metric checks using the Datadog provider.
Create a secret with your Datadog API credentials:
```yaml
apiVersion: v1
kind: Secret
metadata:
name: datadog
namespace: istio-system
data:
datadog_api_key: your-datadog-api-key
datadog_application_key: your-datadog-application-key
```
Datadog template example:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: not-found-percentage
namespace: istio-system
spec:
provider:
type: datadog
address: https://api.datadoghq.com
secretRef:
name: datadog
query: |
100 - (
sum:istio.mesh.request.count{
reporter:destination,
destination_workload_namespace:{{ namespace }},
destination_workload:{{ target }},
!response_code:404
}.as_count()
/
sum:istio.mesh.request.count{
reporter:destination,
destination_workload_namespace:{{ namespace }},
destination_workload:{{ target }}
}.as_count()
) * 100
```
Reference the template in the canary analysis:
```yaml
analysis:
metrics:
- name: "404s percentage"
templateRef:
name: not-found-percentage
namespace: istio-system
thresholdRange:
max: 5
interval: 1m
```
## Amazon CloudWatch
You can create custom metric checks using the CloudWatch metrics provider.
CloudWatch template example:
```yaml
apiVersion: flagger.app/v1alpha1
kind: MetricTemplate
metadata:
name: cloudwatch-error-rate
spec:
provider:
type: cloudwatch
region: ap-northeast-1 # specify the region of your metrics
query: |
[
{
"Id": "e1",
"Expression": "m1 / m2",
"Label": "ErrorRate"
},
{
"Id": "m1",
"MetricStat": {
"Metric": {
"Namespace": "MyKubernetesCluster",
"MetricName": "ErrorCount",
"Dimensions": [
{
"Name": "appName",
"Value": "{{ name }}.{{ namespace }}"
}
]
},
"Period": 60,
"Stat": "Sum",
"Unit": "Count"
},
"ReturnData": false
},
{
"Id": "m2",
"MetricStat": {
"Metric": {
"Namespace": "MyKubernetesCluster",
"MetricName": "RequestCount",
"Dimensions": [
{
"Name": "appName",
"Value": "{{ name }}.{{ namespace }}"
}
]
},
"Period": 60,
"Stat": "Sum",
"Unit": "Count"
},
"ReturnData": false
}
]
```
The query format documentation can be found
[here](https://aws.amazon.com/premiumsupport/knowledge-center/cloudwatch-getmetricdata-api/).
Reference the template in the canary analysis:
```yaml
analysis:
metrics:
- name: "app error rate"
templateRef:
name: cloudwatch-error-rate
thresholdRange:
max: 0.1
interval: 1m
```
**Note** that Flagger needs the AWS IAM permission `cloudwatch:GetMetricData` to use this provider.
## New Relic
You can create custom metric checks using the New Relic provider.
Create a secret with your New Relic Insights credentials:
```yaml
apiVersion: v1
kind: Secret
metadata:
name: newrelic
namespace: istio-system
data:
newrelic_account_id: your-account-id
newrelic_query_key: your-insights-query-key
```
New Relic template example:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: newrelic-error-rate
namespace: ingress-nginx
spec:
provider:
type: newrelic
secretRef:
name: newrelic
query: |
SELECT
filter(sum(nginx_ingress_controller_requests), WHERE status >= '500') /
sum(nginx_ingress_controller_requests) * 100
FROM Metric
WHERE metricName = 'nginx_ingress_controller_requests'
AND ingress = '{{ ingress }}' AND namespace = '{{ namespace }}'
```
Reference the template in the canary analysis:
```yaml
analysis:
metrics:
- name: "error rate"
templateRef:
name: newrelic-error-rate
namespace: ingress-nginx
thresholdRange:
max: 5
interval: 1m
```
## Graphite
You can create custom metric checks using the Graphite provider.
Graphite template example:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: graphite-request-success-rate
spec:
provider:
type: graphite
address: http://graphite.monitoring
query: |
target=summarize(
asPercent(
sumSeries(
stats.timers.httpServerRequests.app.{{target}}.exception.*.method.*.outcome.{CLIENT_ERROR,INFORMATIONAL,REDIRECTION,SUCCESS}.status.*.uri.*.count
),
sumSeries(
stats.timers.httpServerRequests.app.{{target}}.exception.*.method.*.outcome.*.status.*.uri.*.count
)
),
{{interval}},
'avg'
)
```
Reference the template in the canary analysis:
```yaml
analysis:
metrics:
- name: "success rate"
templateRef:
name: graphite-request-success-rate
thresholdRange:
min: 90
interval: 1min
```
## Graphite authentication
If your Graphite API requires basic authentication, you can create a secret in the same namespace
as the `MetricTemplate` with the basic-auth credentials:
```yaml
apiVersion: v1
kind: Secret
metadata:
name: graphite-basic-auth
namespace: flagger
data:
username: your-user
password: your-password
```
Then, reference the secret in the `MetricTemplate`:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: my-metric
namespace: flagger
spec:
provider:
type: graphite
address: http://graphite.monitoring
secretRef:
name: graphite-basic-auth
```
## Google Cloud Monitoring (Stackdriver)
Enable Workload Identity on your cluster, create a service account key that has read access to the
Cloud Monitoring API and then create an IAM policy binding between the GCP service account and the Flagger
service account on Kubernetes. You can take a look at this [guide](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity)
Annotate the Flagger service account:
```bash
kubectl annotate serviceaccount flagger \
--namespace <namespace> \
iam.gke.io/gcp-service-account=<gcp-serviceaccount-name>@<project-id>.iam.gserviceaccount.com
```
Alternatively, you can download the JSON key and add it to your secret with the key `serviceAccountKey` (this method is not recommended).
Create a secret that contains your project-id (and, if workload identity is not enabled on your cluster,
your [service account json](https://cloud.google.com/docs/authentication/production#create_service_account)).
```
kubectl create secret generic gcloud-sa --from-literal=project=<project-id>
```
Then reference the secret in the metric template.
Note: The particular MQL query used here works if [Istio is installed on GKE](https://cloud.google.com/istio/docs/istio-on-gke/installing).
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: bytes-sent
namespace: test
spec:
provider:
type: stackdriver
secretRef:
name: gcloud-sa
query: |
fetch k8s_container
| metric 'istio.io/service/server/response_latencies'
| filter
(metric.destination_service_name == '{{ service }}-canary'
&& metric.destination_service_namespace == '{{ namespace }}')
| align delta(1m)
| every 1m
| group_by [],
[value_response_latencies_percentile:
percentile(value.response_latencies, 99)]
```
The reference for the query language can be found [here](https://cloud.google.com/monitoring/mql/reference)
## InfluxDB
The InfluxDB provider uses the [flux](https://docs.influxdata.com/influxdb/v2.0/query-data/get-started/) query language.
Create a secret that contains your authentication token that can be found in the InfluxDB UI.
```
kubectl create secret generic influx-token --from-literal=token=<token>
```
Then reference the secret in the metric template.
Note: The particular MQL query used here works if [Istio is installed on GKE](https://cloud.google.com/istio/docs/istio-on-gke/installing).
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: not-found
namespace: test
spec:
provider:
type: influxdb
secretRef:
name: influx-token
query: |
from(bucket: "default")
|> range(start: -2h)
|> filter(fn: (r) => r["_measurement"] == "istio_requests_total")
|> filter(fn: (r) => r[" destination_workload_namespace"] == "{{ namespace }}")
|> filter(fn: (r) => r["destination_workload"] == "{{ target }}")
|> filter(fn: (r) => r["response_code"] == "500")
|> count()
|> yield(name: "count")
```
## Dynatrace
You can create custom metric checks using the Dynatrace provider.
Create a secret with your Dynatrace token:
```yaml
apiVersion: v1
kind: Secret
metadata:
name: dynatrace
namespace: istio-system
data:
dynatrace_token: ZHQwYz...
```
Dynatrace metric template example:
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: response-time-95pct
namespace: istio-system
spec:
provider:
type: dynatrace
address: https://xxxxxxxx.live.dynatrace.com
secretRef:
name: dynatrace
query: |
builtin:service.response.time:filter(eq(dt.entity.service,SERVICE-ABCDEFG0123456789)):percentile(95)
```
Reference the template in the canary analysis:
```yaml
analysis:
metrics:
- name: "response-time-95pct"
templateRef:
name: response-time-95pct
namespace: istio-system
thresholdRange:
max: 1000
interval: 1m
```

View File

@ -0,0 +1,128 @@
---
type: docs
weight: 5
title: Monitoring
---
## Grafana
Flagger comes with a Grafana dashboard made for canary analysis. Install Grafana with Helm:
```bash
helm upgrade -i flagger-grafana flagger/grafana \
--set url=http://prometheus:9090
```
The dashboard shows the RED and USE metrics for the primary and canary workloads:
![Canary Dashboard](/img/screens/grafana-canary-analysis.png)
## Logging
The canary errors and latency spikes are recorded as Kubernetes events and logged by Flagger in JSON format:
```text
kubectl -n istio-system logs deployment/flagger --tail=100 | jq .msg
Starting canary deployment for podinfo.test
Advance podinfo.test canary weight 5
Advance podinfo.test canary weight 10
Advance podinfo.test canary weight 15
Advance podinfo.test canary weight 20
Advance podinfo.test canary weight 25
Advance podinfo.test canary weight 30
Advance podinfo.test canary weight 35
Halt podinfo.test advancement success rate 98.69% < 99%
Advance podinfo.test canary weight 40
Halt podinfo.test advancement request duration 1.515s > 500ms
Advance podinfo.test canary weight 45
Advance podinfo.test canary weight 50
Copying podinfo.test template spec to podinfo-primary.test
Halt podinfo-primary.test advancement waiting for rollout to finish: 1 old replicas are pending termination
Scaling down podinfo.test
Promotion completed! podinfo.test
```
## Event Webhook
Flagger can be configured to send event payloads to a specified webhook:
```bash
helm upgrade -i flagger flagger/flagger \
--set eventWebhook=https://example.com/flagger-canary-event-webhook
```
The environment variable _EVENT\_WEBHOOK\_URL_ can be used to activate the event webhook, too.
This is handy when using a secret to store a sensitive value such as an API key.
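As a sketch, the variable could be populated from a Kubernetes secret in the Flagger deployment; the secret name and key below are assumptions.
```yaml
containers:
  - name: flagger
    env:
      - name: EVENT_WEBHOOK_URL
        valueFrom:
          secretKeyRef:
            name: event-webhook
            key: url
```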
When configured, every action that Flagger takes during a canary deployment
will be sent as JSON via an HTTP POST request. The JSON payload has the following schema:
```javascript
{
"name": "string (canary name)",
"namespace": "string (canary namespace)",
"phase": "string (canary phase)",
"metadata": {
"eventMessage": "string (canary event message)",
"eventType": "string (canary event type)",
"timestamp": "string (unix timestamp ms)"
}
}
```
Example:
```javascript
{
"name": "podinfo",
"namespace": "default",
"phase": "Progressing",
"metadata": {
"eventMessage": "New revision detected! Scaling up podinfo.default",
"eventType": "Normal",
"timestamp": "1578607635167"
}
}
```
The event webhook can be overwritten at canary level with:
```yaml
analysis:
webhooks:
- name: "send to Slack"
type: event
url: http://event-recevier.notifications/slack
```
## Metrics
Flagger exposes Prometheus metrics that can be used to determine
the canary analysis status and the destination weight values:
```bash
# Flagger version and mesh provider gauge
flagger_info{version="0.10.0", mesh_provider="istio"} 1
# Canaries total gauge
flagger_canary_total{namespace="test"} 1
# Canary promotion last known status gauge
# 0 - running, 1 - successful, 2 - failed
flagger_canary_status{name="podinfo",namespace="test"} 1
# Canary traffic weight gauge
flagger_canary_weight{workload="podinfo-primary",namespace="test"} 95
flagger_canary_weight{workload="podinfo",namespace="test"} 5
# Seconds spent performing canary analysis histogram
flagger_canary_duration_seconds_bucket{name="podinfo",namespace="test",le="10"} 6
flagger_canary_duration_seconds_bucket{name="podinfo",namespace="test",le="+Inf"} 6
flagger_canary_duration_seconds_sum{name="podinfo",namespace="test"} 17.3561329
flagger_canary_duration_seconds_count{name="podinfo",namespace="test"} 6
# Last canary metric analysis result per different metrics
flagger_canary_metric_analysis{metric="podinfo-http-successful-rate",name="podinfo",namespace="test"} 1
flagger_canary_metric_analysis{metric="podinfo-custom-metric",name="podinfo",namespace="test"} 0.918223108974359
```

View File

@ -0,0 +1,560 @@
---
type: docs
weight: 3
title: Webhooks
---
The canary analysis can be extended with webhooks.
Flagger will call each webhook URL and determine from the response status code
(HTTP 2xx) if the canary is failing or not.
There are several types of hooks:
* **confirm-rollout** hooks are executed before scaling up the canary deployment and can be used for manual approval.
The rollout is paused until the hook returns a successful HTTP status code.
* **pre-rollout** hooks are executed before routing traffic to canary.
The canary advancement is paused if a pre-rollout hook fails, and if the number of failures reaches the
threshold the canary will be rolled back.
* **rollout** hooks are executed during the analysis on each iteration before the metric checks.
If a rollout hook call fails, the canary advancement is paused and eventually rolled back.
* **confirm-traffic-increase** hooks are executed right before the weight on the canary is increased. The canary
advancement is paused until this hook returns HTTP 200.
* **confirm-promotion** hooks are executed before the promotion step.
The canary promotion is paused until the hooks return HTTP 200.
While the promotion is paused, Flagger will continue to run the metrics checks and rollout hooks.
* **post-rollout** hooks are executed after the canary has been promoted or rolled back.
If a post rollout hook fails the error is logged.
* **rollback** hooks are executed while a canary deployment is in either Progressing or Waiting status.
This provides the ability to roll back during analysis or while waiting for a confirmation. If a rollback hook
returns a successful HTTP status code, Flagger will stop the analysis and mark the canary release as failed.
* **event** hooks are executed every time Flagger emits a Kubernetes event. When configured,
every action that Flagger takes during a canary deployment will be sent as JSON via an HTTP POST request.
Spec:
```yaml
analysis:
webhooks:
- name: "start gate"
type: confirm-rollout
url: http://flagger-loadtester.test/gate/approve
- name: "helm test"
type: pre-rollout
url: http://flagger-helmtester.flagger/
timeout: 3m
metadata:
type: "helmv3"
cmd: "test podinfo -n test"
- name: "load test"
type: rollout
url: http://flagger-loadtester.test/
timeout: 15s
metadata:
cmd: "hey -z 1m -q 5 -c 2 http://podinfo-canary.test:9898/"
- name: "traffic increase gate"
type: confirm-traffic-increase
url: http://flagger-loadtester.test/gate/approve
- name: "promotion gate"
type: confirm-promotion
url: http://flagger-loadtester.test/gate/approve
- name: "notify"
type: post-rollout
url: http://telegram.bot:8080/
timeout: 5s
metadata:
some: "message"
- name: "rollback gate"
type: rollback
url: http://flagger-loadtester.test/rollback/check
- name: "send to Slack"
type: event
url: http://event-recevier.notifications/slack
metadata:
environment: "test"
cluster: "flagger-test"
```
> **Note** that the sum of all rollout webhooks timeouts should be lower than the analysis interval.
Webhook payload (HTTP POST):
```javascript
{
"name": "podinfo",
"namespace": "test",
"phase": "Progressing",
"metadata": {
"test": "all",
"token": "16688eb5e9f289f1991c"
}
}
```
Response status codes:
* 200-202 - advance canary by increasing the traffic weight
* timeout or non-2xx - halt advancement and increment failed checks
On a non-2xx response Flagger will include the response body (if any) in the failed checks log and Kubernetes events.
Event payload (HTTP POST):
```javascript
{
"name": "string (canary name)",
"namespace": "string (canary namespace)",
"phase": "string (canary phase)",
"metadata": {
"eventMessage": "string (canary event message)",
"eventType": "string (canary event type)",
"timestamp": "string (unix timestamp ms)"
}
}
```
The event receiver can create alerts based on the received phase
(possible values: `Initialized`, `Waiting`, `Progressing`, `Promoting`, `Finalising`, `Succeeded` or `Failed`).
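As a sketch of such a receiver (the handler path and alerting endpoint are assumptions, not part of Flagger), the Go handler below decodes the event payload and forwards only `Failed` phases to an alerting service:
```go
// event-receiver.go: minimal sketch of an event webhook receiver (hypothetical;
// the /slack path and the alerting URL below are assumptions, not Flagger APIs).
package main

import (
	"bytes"
	"encoding/json"
	"log"
	"net/http"
)

// event mirrors the event payload documented above.
type event struct {
	Name      string            `json:"name"`
	Namespace string            `json:"namespace"`
	Phase     string            `json:"phase"`
	Metadata  map[string]string `json:"metadata"`
}

func main() {
	http.HandleFunc("/slack", func(w http.ResponseWriter, r *http.Request) {
		var e event
		if err := json.NewDecoder(r.Body).Decode(&e); err != nil {
			http.Error(w, "bad payload", http.StatusBadRequest)
			return
		}
		// Only alert on failed canaries; other phases are ignored.
		if e.Phase == "Failed" {
			msg, _ := json.Marshal(map[string]string{
				"text": e.Namespace + "/" + e.Name + ": " + e.Metadata["eventMessage"],
			})
			// Assumed alerting endpoint; replace with your chat or incident tool.
			if resp, err := http.Post("http://alerts.example/api/alert", "application/json", bytes.NewReader(msg)); err == nil {
				resp.Body.Close()
			}
		}
		w.WriteHeader(http.StatusAccepted)
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```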
## Load Testing
For workloads that are not receiving constant traffic, Flagger can be configured with a webhook
that, when called, will start a load test for the target workload.
If the target workload doesn't receive any traffic during the canary analysis,
Flagger metric checks will fail with "no values found for metric request-success-rate".
Flagger comes with a load testing service based on [rakyll/hey](https://github.com/rakyll/hey)
that generates traffic during analysis when configured as a webhook.
![Flagger Load Testing Webhook](/img/diagrams/flagger-load-testing.png)
First you need to deploy the load test runner in a namespace with sidecar injection enabled:
```bash
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
```
Or by using Helm:
```bash
helm repo add flagger https://flagger.app
helm upgrade -i flagger-loadtester flagger/loadtester \
--namespace=test \
--set cmd.timeout=1h \
--set cmd.namespaceRegexp=''
```
When deployed, the load tester API will be available at `http://flagger-loadtester.test/`.
Now you can add webhooks to the canary analysis spec:
```yaml
webhooks:
- name: load-test-get
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
- name: load-test-post
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "hey -z 1m -q 10 -c 2 -m POST -d '{test: 2}' http://podinfo-canary.test:9898/echo"
```
When the canary analysis starts, Flagger will call the webhooks and the load tester will
run the `hey` commands in the background, if they are not already running.
This will ensure that during the analysis, the `podinfo-canary.test`
service will receive a steady stream of GET and POST requests.
If your workload is exposed outside the mesh, you can point `hey` to the public URL and use HTTP2.
```yaml
webhooks:
- name: load-test-get
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "hey -z 1m -q 10 -c 2 -h2 https://podinfo.example.com/"
```
For gRPC services you can use [bojand/ghz](https://github.com/bojand/ghz) which is a similar tool to Hey but for gRPC:
```yaml
webhooks:
- name: grpc-load-test
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "ghz -z 1m -q 10 -c 2 --insecure podinfo.test:9898"
```
`ghz` uses reflection to identify which gRPC method to call.
If you do not wish to enable reflection for your gRPC service, you can implement a standardized
health check from the [grpc-proto](https://github.com/grpc/grpc-proto) library.
To use this [health check schema](https://github.com/grpc/grpc-proto/blob/master/grpc/health/v1/health.proto)
without reflection, you can pass a parameter to `ghz` like this:
```yaml
webhooks:
- name: grpc-load-test-no-reflection
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
type: cmd
cmd: "ghz --insecure --proto=/tmp/ghz/health.proto --call=grpc.health.v1.Health/Check podinfo.test:9898"
```
The load tester can run arbitrary commands as long as the binary is present in the container image.
For example, if you want to replace `hey` with another CLI, you can create your own Docker image:
```text
FROM weaveworks/flagger-loadtester:<VER>
RUN curl -Lo /usr/local/bin/my-cli https://github.com/user/repo/releases/download/ver/my-cli \
&& chmod +x /usr/local/bin/my-cli
```
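The webhook metadata can then invoke the new binary instead of `hey`; a sketch, using the hypothetical `my-cli` from the Dockerfile above (its flags are placeholders too):
```yaml
webhooks:
  - name: custom-load-test
    url: http://flagger-loadtester.test/
    timeout: 5s
    metadata:
      type: cmd
      # my-cli and its flags are placeholders for your own tool
      cmd: "my-cli --duration 1m --target http://podinfo-canary.test:9898/"
```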
## Load Testing Delegation
The load tester can also forward testing tasks to external tools;
currently [nGrinder](https://github.com/naver/ngrinder) is supported.
To use this feature, add a load test task of type `ngrinder` to the canary analysis spec:
```yaml
webhooks:
- name: load-test-post
url: http://flagger-loadtester.test/
timeout: 5s
metadata:
# type of this load test task, cmd or ngrinder
type: ngrinder
# base url of your nGrinder controller server
server: http://ngrinder-server:port
# id of the test to clone from, the test must have been defined.
clone: 100
# user name and base64 encoded password to authenticate against the nGrinder server
username: admin
passwd: YWRtaW4=
# the interval between nGrinder test status polls, defaults to 1s
pollInterval: 5s
```
When the canary analysis starts, the load tester will initiate a
[clone_and_start request](https://github.com/naver/ngrinder/wiki/REST-API-PerfTest)
to the nGrinder server and start a new performance test. The load tester will periodically
poll the nGrinder server for the status of the test,
and prevent duplicate requests from being sent in subsequent analysis loops.
### K6 Load Tester
You can also delegate load testing to a third-party webhook. An example of this is the [`k6 webhook`](https://github.com/grafana/flagger-k6-webhook). This webhook uses [`k6`](https://k6.io/), a feature-rich load-testing tool, to run load or smoke tests on canaries. For all available features, see the source repository.
Here's an example integrating this webhook as a `pre-rollout` step, to load test a service before any traffic is sent to it:
```yaml
webhooks:
- name: k6-load-test
timeout: 5m
type: pre-rollout
url: http://k6-loadtester.flagger/launch-test
metadata:
script: |
import http from 'k6/http';
import { sleep } from 'k6';
export const options = {
vus: 2,
duration: '30s',
thresholds: {
http_req_duration: ['p(95)<50']
},
ext: {
loadimpact: {
name: '<cluster>/<your_service>',
projectID: <project id>,
},
},
};
export default function () {
http.get('http://<your_service>-canary.<namespace>:80/');
sleep(0.10);
}
```
## Integration Testing
Flagger comes with a testing service that can run Helm tests, Bats tests or Concord tests when configured as a webhook.
Deploy the Helm test runner in the `kube-system` namespace using the `tiller` service account:
```bash
helm repo add flagger https://flagger.app
helm upgrade -i flagger-helmtester flagger/loadtester \
--namespace=kube-system \
--set serviceAccountName=tiller
```
When deployed the Helm tester API will be available at `http://flagger-helmtester.kube-system/`.
Now you can add pre-rollout webhooks to the canary analysis spec:
```yaml
analysis:
webhooks:
- name: "smoke test"
type: pre-rollout
url: http://flagger-helmtester.kube-system/
timeout: 3m
metadata:
type: "helm"
cmd: "test {{ .Release.Name }} --cleanup"
```
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
If the Helm test fails, Flagger will retry until the analysis threshold is reached and the canary is rolled back.
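The retry behaviour is governed by the `interval` and `threshold` fields of the canary analysis; with the illustrative values below, the failed pre-rollout check is retried roughly every minute and the canary is rolled back after five failures:
```yaml
analysis:
  # run checks (and retry failed pre-rollout hooks) every minute
  interval: 1m
  # roll back after five failed checks
  threshold: 5
  webhooks:
    - name: "smoke test"
      type: pre-rollout
      url: http://flagger-helmtester.kube-system/
      timeout: 3m
      metadata:
        type: "helm"
        cmd: "test {{ .Release.Name }} --cleanup"
```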
If you are using Helm v3,
you'll have to create a dedicated service account and add the release namespace to the test command:
```yaml
analysis:
webhooks:
- name: "smoke test"
type: pre-rollout
url: http://flagger-helmtester.kube-system/
timeout: 3m
metadata:
type: "helmv3"
cmd: "test {{ .Release.Name }} --timeout 3m -n {{ .Release.Namespace }}"
```
If the test hangs or logs error messages hinting at insufficient permissions, the cause may be RBAC;
check the [Troubleshooting](webhooks.md#Troubleshooting) section for an example configuration.
As an alternative to Helm you can use the
[Bash Automated Testing System](https://github.com/bats-core/bats-core) to run your tests.
```yaml
analysis:
webhooks:
- name: "acceptance tests"
type: pre-rollout
url: http://flagger-batstester.default/
timeout: 5m
metadata:
type: "bash"
cmd: "bats /tests/acceptance.bats"
```
Note that you should create a ConfigMap with your Bats tests and mount it inside the tester container.
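A sketch of one way to do this, assuming the tests live in a local `acceptance.bats` file, the tester was deployed as `flagger-batstester` in the `default` namespace, and its container is named `loadtester` (adjust names to your install):
```bash
# create a ConfigMap from the local Bats file
kubectl -n default create configmap bats-tests --from-file=acceptance.bats

# mount it at /tests (the path used by the cmd above) via a strategic merge patch
kubectl -n default patch deployment flagger-batstester --patch '
spec:
  template:
    spec:
      containers:
        - name: loadtester
          volumeMounts:
            - name: bats-tests
              mountPath: /tests
      volumes:
        - name: bats-tests
          configMap:
            name: bats-tests
'
```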
You can also configure the test runner to start a [Concord](https://concord.walmartlabs.com/) process.
```yaml
analysis:
webhooks:
- name: "concord integration test"
type: pre-rollout
url: http://flagger-concordtester.default/
timeout: 60s
metadata:
type: "concord"
org: "your-concord-org"
project: "your-concord-project"
repo: "your-concord-repo"
entrypoint: "your-concord-entrypoint"
apiKeyPath: "/tmp/concord-api-key"
endpoint: "https://canary-endpoint/"
pollInterval: "5"
pollTimeout: "60"
```
`org`, `project`, `repo` and `entrypoint` represent where your test process runs in Concord.
In order to authenticate to Concord, you need to set `apiKeyPath`
to a path of a file containing a valid Concord API key on the `flagger-helmtester` container.
This can be done via mounting a Kubernetes secret in the tester's Deployment.
`pollInterval` is the interval in seconds at which the webhook will call Concord
to check whether the process has finished (default is 5s). `pollTimeout` is the time in seconds
the webhook will keep trying to call Concord before timing out (default is 30s).
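A minimal sketch of the relevant pod spec fragment for the tester Deployment, assuming a Secret named `concord-api-key` with a `concord-api-key` entry and a container named `loadtester` (all names illustrative):
```yaml
# Fragment of the tester Deployment's pod spec; the subPath mount makes the
# key available as a single file at the apiKeyPath used above.
containers:
  - name: loadtester
    volumeMounts:
      - name: concord-api-key
        mountPath: /tmp/concord-api-key
        subPath: concord-api-key
volumes:
  - name: concord-api-key
    secret:
      secretName: concord-api-key
```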
## Manual Gating
For manual approval of a canary deployment you can use the `confirm-rollout` and `confirm-promotion` webhooks.
The `confirm-rollout` hooks are executed before the pre-rollout hooks. For manually approving a traffic weight increase,
you can use the `confirm-traffic-increase` webhook.
Flagger will halt the canary traffic shifting and analysis until the confirm webhook returns HTTP status 200.
For manual rollback of a canary deployment you can use the `rollback` webhook.
The rollback hook will be called during the analysis and confirmation states.
If a rollback webhook returns a successful HTTP status code,
Flagger will shift all traffic back to the primary instance and fail the canary.
Manual gating with Flagger's tester:
```yaml
analysis:
webhooks:
- name: "gate"
type: confirm-rollout
url: http://flagger-loadtester.test/gate/halt
```
The `/gate/halt` endpoint returns HTTP 403, thus blocking the rollout.
If you have notifications enabled, Flagger will post a message to
Slack or MS Teams if a canary rollout is waiting for approval.
The notifications can be disabled with:
```yaml
analysis:
webhooks:
- name: "gate"
type: confirm-rollout
url: http://flagger-loadtester.test/gate/halt
muteAlert: true
```
Change the URL to `/gate/approve` to start the canary analysis:
```yaml
analysis:
webhooks:
- name: "gate"
type: confirm-rollout
url: http://flagger-loadtester.test/gate/approve
```
Manual gating can be driven with Flagger's tester API. Set the confirmation URL to `/gate/check`:
```yaml
analysis:
webhooks:
- name: "ask for confirmation"
type: confirm-rollout
url: http://flagger-loadtester.test/gate/check
```
By default the gate is closed; you can start or resume the canary rollout with:
```bash
kubectl -n test exec -it flagger-loadtester-xxxx-xxxx sh
curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/open
```
You can pause the rollout at any time with:
```bash
curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/close
```
If a canary analysis is paused, the status will change to `Waiting`:
```bash
kubectl get canary/podinfo
NAME STATUS WEIGHT
podinfo Waiting 0
```
The `confirm-promotion` hook type can be used to manually approve the canary promotion.
While the promotion is paused, Flagger will continue to run the metrics checks and load tests.
```yaml
analysis:
webhooks:
- name: "promotion gate"
type: confirm-promotion
url: http://flagger-loadtester.test/gate/halt
```
The `rollback` hook type can be used to manually roll back the canary promotion.
As with gating, rollbacks can be driven with Flagger's tester API by setting the rollback URL to `/rollback/check`:
```yaml
analysis:
webhooks:
- name: "rollback"
type: rollback
url: http://flagger-loadtester.test/rollback/check
```
By default, the rollback gate is closed; you can roll back a canary with:
```bash
kubectl -n test exec -it flagger-loadtester-xxxx-xxxx sh
curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/rollback/open
```
You can close the rollback with:
```bash
curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/rollback/close
```
If you have notifications enabled, Flagger will post a message to Slack or MS Teams if a canary has been rolled back.
## Troubleshooting
### Manually check if helm test is running
To debug any issues with Helm tests in depth, you can execute commands on the flagger-loadtester pod.
```bash
kubectl exec -it deploy/flagger-loadtester -- bash
helmv3 test <release> -n <namespace> --debug
```
### Helm tests hang during canary deployment
If test execution hangs or reports insufficient permissions, check your RBAC settings.
```yaml
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: helm-smoke-tester
rules:
- apiGroups: [""]
resources: ["secrets"]
verbs: ["get", "watch", "list", "update"]
# choose the permission based on helm test type (Pod or Job)
- apiGroups: [""]
resources: ["pods", "pods/log"]
verbs: ["create", "list", "delete", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "jobs/log"]
verbs: ["create", "list", "delete", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: helm-smoke-tester
# Don't forget to update accordingly
namespace: namespace-of-the-tested-release
subjects:
- kind: User
name: system:serviceaccount:linkerd:default
apiGroup: rbac.authorization.k8s.io
roleRef:
kind: ClusterRole
name: helm-smoke-tester
apiGroup: rbac.authorization.k8s.io
```

View File

@ -133,7 +133,8 @@ class Repo():
shutil.copyfile(
os.path.join(self.dest, entry[0]),
out_file)
docs = entry[1].startswith('docs/') or entry[1].startswith('legacy/')
docs = entry[1].startswith('docs/') or entry[1].startswith('legacy/') or \
entry[1].startswith('flagger/')
title = None
weight = None
if len(entry) == 4:

View File

@ -1,13 +1,30 @@
{{ $cover := and (.HasShortcode "blocks/cover") (not .Site.Params.ui.navbar_translucent_over_cover_disable) }}
<nav class="js-navbar-scroll navbar-expand-lg navbar navbar-dark {{ if $cover}} td-navbar-cover {{ end }}flex-row td-navbar">
<a class="navbar-brand" href="{{ .Site.Home.RelPermalink }}">
{{ if in .Permalink "/flagger" }}
<span class="navbar-logo">{{ if .Site.Params.ui.navbar_logo }}{{ with resources.Get "icons/flagger.svg" }}{{ ( . | minify).Content | safeHTML }}{{ end }}{{ end }}</span><span class="font-weight-bold">Flagger</span>
{{ else }}
<span class="navbar-logo">{{ if .Site.Params.ui.navbar_logo }}{{ with resources.Get "icons/logo.svg" }}{{ ( . | minify).Content | safeHTML }}{{ end }}{{ end }}</span><span class="font-weight-bold">{{ .Site.Title }}</span>
{{ end }}
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarContent" aria-controls="navbarContent" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse justify-content-end" id="navbarContent">
<ul class="navbar-nav mt-2 p-3 p-lg-0 mt-lg-0">
{{ if in .Permalink "/flagger" }}
<li class="nav-item mr-4 mb-2 mb-lg-0">
<a class="nav-link active" href="/flagger"><span class="active">Docs</span></a>
</li>
{{ else if in .Permalink "/legacy" }}
<li class="nav-item mr-4 mb-2 mb-lg-0">
<a class="nav-link active" href="/legacy"><span class="active">Docs</span></a>
</li>
{{ else }}
<li class="nav-item mr-4 mb-2 mb-lg-0">
<a class="nav-link active" href="/docs"><span class="active">Docs</span></a>
</li>
{{ end }}
{{ $p := . }}
{{ range .Site.Menus.main }}
{{ $active := or ($p.IsMenuCurrent "main" .) ($p.HasMenuCurrent "main" .) }}

View File

@ -25,7 +25,9 @@
</div>
<div class="row">
<a class="btn btn-lg btn-{{ $b1_color }}" href="{{ $b1_url }}" role="button">{{ $b1_caption }}</a>
{{ if $b2_url }}
<a class="btn btn-lg btn-{{ $b2_color }}" href="{{ $b2_url }}" role="button">{{ $b2_caption }}</a>
{{ end }}
</div>
</div>
</section>

View File

@ -1,13 +1,30 @@
/docs/components/helm/controller /docs/components/helm 301!
/docs/components/image/controller /docs/components/image 301!
/docs/components/kustomize/controller /docs/components/kustomize 301!
/docs/components/notification/controller /docs/components/notification 301!
/docs/components/source/controller /docs/components/source 301!
/docs/guides/installation /docs/installation 301!
/contributing /docs/contributing 301!
/docs/contributing/flux2 /docs/contributing/flux 301!
/legacy/helm-operator/guides/upgrade-to-ga /legacy/helm-operator/how-to/upgrade-to-ga 301!
/legacy/helm-operator/guides/upgrade-to-beta /legacy/helm-operator/how-to/upgrade-to-beta 301!
/docs/roadmap /roadmap 301!
/integrations /ecosystem 301!
/docs/contributing/docs/some-background https://github.com/fluxcd/website#readme 301!
/docs/components/helm/controller /docs/components/helm 301!
/docs/components/image/controller /docs/components/image 301!
/docs/components/kustomize/controller /docs/components/kustomize 301!
/docs/components/notification/controller /docs/components/notification 301!
/docs/components/source/controller /docs/components/source 301!
/docs/guides/installation /docs/installation 301!
/contributing /docs/contributing 301!
/docs/contributing/flux2 /docs/contributing/flux 301!
/legacy/helm-operator/guides/upgrade-to-ga /legacy/helm-operator/how-to/upgrade-to-ga 301!
/legacy/helm-operator/guides/upgrade-to-beta /legacy/helm-operator/how-to/upgrade-to-beta 301!
/docs/roadmap /roadmap 301!
/integrations /ecosystem 301!
/docs/contributing/docs/some-background https://github.com/fluxcd/website#readme 301!
/flagger/how-it-works /flagger/usage/how-it-works 301!
/flagger/usage/progressive-delivery /flagger/tutorials/istio-progressive-delivery 301!
/flagger/usage/ab-testing /flagger/tutorials/istio-ab-testing 301!
/flagger/usage/blue-green /flagger/tutorials/kubernetes-blue-green 301!
/flagger/usage/appmesh-progressive-delivery /flagger/tutorials/appmesh-progressive-delivery 301!
/flagger/usage/linkerd-progressive-delivery /flagger/tutorials/linkerd-progressive-delivery 301!
/flagger/usage/contour-progressive-delivery /flagger/tutorials/contour-progressive-delivery 301!
/flagger/usage/gloo-progressive-delivery /flagger/tutorials/gloo-progressive-delivery 301!
/flagger/usage/nginx-progressive-delivery /flagger/tutorials/nginx-progressive-delivery 301!
/flagger/usage/skipper-progressive-delivery /flagger/tutorials/skipper-progressive-delivery 301!
/flagger/usage/crossover-progressive-delivery /flagger/tutorials/crossover-progressive-delivery 301!
/flagger/usage/traefik-progressive-delivery /flagger/tutorials/traefik-progressive-delivery 301!
/flagger/usage/osm-progressive-delivery /flagger/tutorials/osm-progressive-delivery 301!
/flagger/usage/kuma-progressive-delivery /flagger/tutorials/kuma-progressive-delivery 301!
/flagger/usage/gatewayapi-progressive-delivery /flagger/tutorials/gatewayapi-progressive-delivery 301!

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 196 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 51 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 45 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 205 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 523 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 349 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 497 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 131 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 133 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 523 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 440 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 78 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 113 KiB