Import rev 95381e1892a43a167b0894f583894da2a7067123 of fluxcd/flagger docs
- Add nav entries (frontmatter) - Import diagrams and screens - use local images (not hotlink GitHub) - Import redirects too Addresses: #894 Signed-off-by: Daniel Holbach <daniel@weave.works>
|
@ -9,7 +9,7 @@ This repo houses the assets used to build the Flux project's landing page at <ht
|
|||
> Project | Docs Site | Github Source
|
||||
> ---------------- | ------------------------------------------| -------------
|
||||
> Flux | <https://fluxcd.io/docs> | <https://github.com/fluxcd/website>
|
||||
> Flagger | <https://docs.flagger.app> | <https://github.com/fluxcd/flagger>
|
||||
> Flagger | <https://fluxcd.io/flagger> | <https://github.com/fluxcd/website>
|
||||
> Flux (legacy) | <https://fluxcd.io/legacy/flux> | <https://github.com/fluxcd/website>
|
||||
> Helm Operator | <https://fluxcd.io/legacy/helm-operator/> | <https://github.com/fluxcd/website>
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
adopters:
|
||||
project: Flagger
|
||||
description: >
|
||||
Check out everything about Flagger on <https://flagger.app>.
|
||||
Check out everything about Flagger on [its website](/flagger).
|
||||
companies:
|
||||
- name: Capra Consulting
|
||||
url: https://www.capraconsulting.no
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg width="64px" height="64px" viewBox="0 0 64 64" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||
<title>flagger-icon</title>
|
||||
<g id="flagger-icon" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
|
||||
<path d="M11.7870673,17.7791346 C10.7376442,17.0966346 10.7376442,15.5602885 11.7870673,14.8783654 L31.1659135,2.27951923 C31.4527085,2.09313128 31.7809164,2 32.109115,2 C32.1144833,2 32.1144833,30.6577081 32.1144833,30.6577081 C31.7844966,30.658555 31.4542722,30.565385 31.1659135,30.3779808 L11.7870673,17.7791346 Z" id="Fill-1" fill="#7BB09F"></path>
|
||||
<path d="M1.78706731,24.2791346 C0.737644231,23.5966346 0.737644231,22.0602885 1.78706731,21.3783654 L21.1659135,8.77951923 C21.4527085,8.59313128 21.7809164,8.5 22.109147,8.5 C22.1144833,8.5 16.7504591,14.9302441 16.4621003,14.7428399 L1.78706731,24.2791346 Z" id="Fill-1-Copy" fill="#7BB09F"></path>
|
||||
<path d="M32.1005414,32.7999243 C32.4314477,32.7984255 32.7627112,32.8915919 33.051875,33.0795192 L52.4312981,45.6783654 C53.4807212,46.3602885 53.4807212,47.8966346 52.4312981,48.5791346 L33.051875,61.1779808 C32.7723427,61.3596486 32.4534685,61.452763 32.1336066,61.4575 C32.1336066,61.4575 32.1005414,32.7999243 32.1005414,32.7999243 Z" id="Fill-1" fill="#7BB09F"></path>
|
||||
<path d="M42.6594534,51.6821881 L62.4312981,39.1784411 C63.4807212,39.8603641 63.4807212,41.3967103 62.4312981,42.0792103 L43.051875,54.6780564 C42.7723427,54.8597243 42.4534685,54.9528387 42.1336066,54.9573995 C42.1336066,54.9573995 42.3702896,51.4942608 42.6594534,51.6821881 Z" id="Fill-1-Copy-2" fill="#7BB09F"></path>
|
||||
<g id="Group" transform="translate(11.000000, 32.000000)" fill="#7BB09F">
|
||||
<path d="M23.1648462,1.79890385 C22.7177486,1.57560653 21.9079818,0.775616463 21.12525,0.8 C20.4471662,0.8 20.1819808,1.07951923 19.08525,1.79890385 C20.6106346,2.45198077 26.2588846,4.40082692 27.8390769,4.83755769 C25.3252832,3.17403545 23.7672062,2.16115083 23.1648462,1.79890385 Z" id="Fill-5"></path>
|
||||
<path d="M17.1629423,3.04898077 L15.4269808,4.17744231 C16.9371113,5.2881491 18.4473527,6.15609036 19.9577051,6.7812661 C23.0272183,8.05182074 26.9292047,8.86995809 27.8191154,9.08936538 C32.8706538,10.3349423 37.6418077,11.5107115 41.4783462,15.3478269 C41.6733462,15.54225 41.8562308,15.7407115 42.0373846,15.93975 C42.4308462,15.1880192 42.2335385,14.1957115 41.4466154,13.6845577 L33.8560385,8.74898077 C32.0133462,8.14090385 30.1360385,7.67590385 28.2806538,7.21898077 C26.5308462,6.78744231 23.5727919,6.02059898 21.9181765,5.49156052 C20.8150996,5.13886821 19.2300215,4.32467496 17.1629423,3.04898077 Z" id="Fill-7"></path>
|
||||
<path d="M16.8750849,8.20483098 C15.6193488,7.62821266 14.2831279,6.83594208 12.8664221,5.82801923 L11.2119808,6.91782692 C12.8335577,8.16072333 14.2385577,9.07106402 15.4269808,9.648849 C17.2096154,10.5155265 24.0114808,12.3722308 24.4326346,12.4760769 C29.4841731,13.7210769 34.2553269,14.8968462 38.0924423,18.7339615 C38.0987885,18.7408846 38.1045577,18.7472308 38.1114808,18.7541538 L39.75225,17.6868462 C39.6524423,17.5824231 39.5584038,17.4751154 39.4545577,17.3718462 C35.2384038,13.1551154 29.9791731,11.8587692 24.8941731,10.6051154 C24.3137885,10.4620385 18.7586891,9.06975845 16.8750849,8.20483098 Z" id="Fill-11"></path>
|
||||
<path d="M23.1650769,16.3935 C27.1175769,17.4140769 30.8341154,18.6342692 33.9823846,21.4381154 L35.6439231,20.3581154 C32.0465509,16.9762696 28.005918,15.3433009 23.4551919,14.4789663 C19.3757148,13.7041376 16.6984434,12.7436057 13.6520967,11.5324737 C11.621199,10.7250524 9.97648083,9.72718297 8.71794231,8.53886538 L7.04717308,9.62521154 C9.06194661,11.3700006 10.8411363,12.5664934 12.3847422,13.2146899 C14.4228927,14.0705575 18.0163376,15.1301608 23.1650769,16.3935 Z" id="Fill-17"></path>
|
||||
<path d="M4.57875,11.2299231 L2.92990385,12.3018462 C2.98759615,12.3612692 3.04009615,12.423 3.09951923,12.4818462 C7.31625,16.6985769 12.5743269,17.9949231 17.6599038,19.2485769 C22.0641346,20.3337692 26.2543269,21.3687692 29.7989423,24.1581923 L31.4893269,23.0591538 C27.4958654,19.6968462 22.7385577,18.5158846 18.1214423,17.3781923 C13.1206731,16.1453077 8.39567308,14.9758846 4.57875,11.2299231" id="Fill-19"></path>
|
||||
<path d="M1.07555769,14.5060962 C0.883442308,14.3139808 0.702865385,14.1184038 0.524019231,13.9216731 C-0.227711538,14.6745577 -0.139442308,15.9726346 0.80325,16.5853269 L6.50959615,20.2955192 C9.03536538,21.3409038 11.6765192,21.9945577 14.2738269,22.6349423 C18.3284423,23.6341731 22.2019038,24.5924423 25.5578654,26.9157115 L27.2834423,25.7930192 C23.4676731,22.9245577 19.0403654,21.8255192 14.7347885,20.7639808 C9.68382692,19.5189808 4.91267308,18.3432115 1.07555769,14.5060962" id="Fill-21"></path>
|
||||
<path d="M19.6441154,28.8342692 C20.0243077,29.0188846 20.3998846,29.2133077 20.7691154,29.4221538 C21.2093077,29.5150385 21.6771923,29.4383077 22.0683462,29.1838846 L23.0260385,28.5613846 C19.9493077,26.5035 16.5287308,25.461 13.1196923,24.5927308 L19.6441154,28.8342692 Z" id="Fill-23"></path>
|
||||
</g>
|
||||
<g id="Group-Copy" transform="translate(11.000000, 1.200000)" fill="#7BB09F">
|
||||
<path d="M23.1648462,1.79890385 C22.7177486,1.57560653 21.9079818,0.775616463 21.12525,0.8 C20.4471662,0.8 20.1819808,1.07951923 19.08525,1.79890385 C20.6106346,2.45198077 26.2588846,4.40082692 27.8390769,4.83755769 C25.3252832,3.17403545 23.7672062,2.16115083 23.1648462,1.79890385 Z" id="Fill-5"></path>
|
||||
<path d="M17.1629423,3.04898077 L15.4269808,4.17744231 C16.9371113,5.2881491 18.4473527,6.15609036 19.9577051,6.7812661 C23.0272183,8.05182074 26.9292047,8.86995809 27.8191154,9.08936538 C32.8706538,10.3349423 37.6418077,11.5107115 41.4783462,15.3478269 C41.6733462,15.54225 41.8562308,15.7407115 42.0373846,15.93975 C42.4308462,15.1880192 42.2335385,14.1957115 41.4466154,13.6845577 L33.8560385,8.74898077 C32.0133462,8.14090385 30.1360385,7.67590385 28.2806538,7.21898077 C26.5308462,6.78744231 23.5727919,6.02059898 21.9181765,5.49156052 C20.8150996,5.13886821 19.2300215,4.32467496 17.1629423,3.04898077 Z" id="Fill-7"></path>
|
||||
<path d="M16.8750849,8.20483098 C15.6193488,7.62821266 14.2831279,6.83594208 12.8664221,5.82801923 L11.2119808,6.91782692 C12.8335577,8.16072333 14.2385577,9.07106402 15.4269808,9.648849 C17.2096154,10.5155265 24.0114808,12.3722308 24.4326346,12.4760769 C29.4841731,13.7210769 34.2553269,14.8968462 38.0924423,18.7339615 C38.0987885,18.7408846 38.1045577,18.7472308 38.1114808,18.7541538 L39.75225,17.6868462 C39.6524423,17.5824231 39.5584038,17.4751154 39.4545577,17.3718462 C35.2384038,13.1551154 29.9791731,11.8587692 24.8941731,10.6051154 C24.3137885,10.4620385 18.7586891,9.06975845 16.8750849,8.20483098 Z" id="Fill-11"></path>
|
||||
<path d="M23.1650769,16.3935 C27.1175769,17.4140769 30.8341154,18.6342692 33.9823846,21.4381154 L35.6439231,20.3581154 C32.0465509,16.9762696 28.005918,15.3433009 23.4551919,14.4789663 C19.3757148,13.7041376 16.6984434,12.7436057 13.6520967,11.5324737 C11.621199,10.7250524 9.97648083,9.72718297 8.71794231,8.53886538 L7.04717308,9.62521154 C9.06194661,11.3700006 10.8411363,12.5664934 12.3847422,13.2146899 C14.4228927,14.0705575 18.0163376,15.1301608 23.1650769,16.3935 Z" id="Fill-17"></path>
|
||||
<path d="M4.57875,11.2299231 L2.92990385,12.3018462 C2.98759615,12.3612692 3.04009615,12.423 3.09951923,12.4818462 C7.31625,16.6985769 12.5743269,17.9949231 17.6599038,19.2485769 C22.0641346,20.3337692 26.2543269,21.3687692 29.7989423,24.1581923 L31.4893269,23.0591538 C27.4958654,19.6968462 22.7385577,18.5158846 18.1214423,17.3781923 C13.1206731,16.1453077 8.39567308,14.9758846 4.57875,11.2299231" id="Fill-19"></path>
|
||||
<path d="M1.07555769,14.5060962 C0.883442308,14.3139808 0.702865385,14.1184038 0.524019231,13.9216731 C-0.227711538,14.6745577 -0.139442308,15.9726346 0.80325,16.5853269 L6.50959615,20.2955192 C9.03536538,21.3409038 11.6765192,21.9945577 14.2738269,22.6349423 C18.3284423,23.6341731 22.2019038,24.5924423 25.5578654,26.9157115 L27.2834423,25.7930192 C23.4676731,22.9245577 19.0403654,21.8255192 14.7347885,20.7639808 C9.68382692,19.5189808 4.91267308,18.3432115 1.07555769,14.5060962" id="Fill-21"></path>
|
||||
<path d="M19.6441154,28.8342692 C20.0243077,29.0188846 20.3998846,29.2133077 20.7691154,29.4221538 C21.2093077,29.5150385 21.6771923,29.4383077 22.0683462,29.1838846 L23.0260385,28.5613846 C19.9493077,26.5035 16.5287308,25.461 13.1196923,24.5927308 L19.6441154,28.8342692 Z" id="Fill-23"></path>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 8.4 KiB |
|
@ -153,14 +153,6 @@ enable = true
|
|||
icon = "fab fa-youtube"
|
||||
desc = "Recordings of past Flux dev meetings"
|
||||
|
||||
|
||||
|
||||
[[menus.main]]
|
||||
url = "/docs"
|
||||
name = "Docs"
|
||||
weight = 1
|
||||
|
||||
|
||||
[[menus.main]]
|
||||
identifier = "Project"
|
||||
name = "Project"
|
||||
|
|
|
@ -255,6 +255,15 @@ What we achieved up until today is only possible because of <a href="/community"
|
|||
|
||||
{{% blocks/section color="white" %}}
|
||||
|
||||
<p id="flagger"></p>
|
||||
{{% blocks/project title="Flagger"
|
||||
image="/img/flagger-gitops.png" image-align="right" bg-color="green"
|
||||
button1-url="/flagger" %}}
|
||||
Kubernetes Operator for the automation of promoting canary deployments using Istio, Linkerd, App Mesh, NGINX, Skipper, Contour, Gloo or Traefik routing for traffic shifting, and Prometheus metrics for canary analysis.
|
||||
|
||||
The canary analysis can be extended with webhooks for running system integration/acceptance tests, load tests, or any other custom validation.
|
||||
{{% /blocks/project %}}
|
||||
|
||||
<p id="gitops-toolkit"></p>
|
||||
{{% blocks/project title="GitOps Toolkit"
|
||||
image="/img/building-blocks.svg" image-align="left" bg-color="blue"
|
||||
|
@ -263,15 +272,6 @@ What we achieved up until today is only possible because of <a href="/community"
|
|||
The set of APIs and controllers that make up the runtime for Flux. You can use the GitOps Toolkit to extend Flux, and to build your own systems for continuous delivery.
|
||||
{{% /blocks/project %}}
|
||||
|
||||
{{% blocks/project title="Flagger"
|
||||
image="/img/flagger-gitops.png" image-align="right" bg-color="green"
|
||||
button1-url="https://docs.flagger.app/"
|
||||
button2-url="https://flagger.app/" %}}
|
||||
Kubernetes Operator for the automation of promoting canary deployments using Istio, Linkerd, App Mesh, NGINX, Skipper, Contour, Gloo or Traefik routing for traffic shifting, and Prometheus metrics for canary analysis.
|
||||
|
||||
The canary analysis can be extended with webhooks for running system integration/acceptance tests, load tests, or any other custom validation.
|
||||
{{% /blocks/project %}}
|
||||
|
||||
{{% blocks/project title="Flux v1 and Helm Operator"
|
||||
image="/img/logos/flux-horizontal-color.png" image-align="right" bg-color="white"
|
||||
button1-url="/legacy/flux" button1-caption="Flux v1 Documentation" button1-color="blue"
|
||||
|
|
|
@ -121,7 +121,7 @@ We are very happy to announce the v1.6 release of Flagger. This release
|
|||
includes:
|
||||
|
||||
- Support for A/B testing using [Gloo
|
||||
Edge](https://docs.flagger.app/tutorials/gloo-progressive-delivery)
|
||||
Edge](/flagger/tutorials/gloo-progressive-delivery)
|
||||
HTTP headers based routing.
|
||||
- Extended support for Istio\'s `HTTPMatchRequest` and `VirtualService`
|
||||
delegation.
|
||||
|
|
|
@ -119,7 +119,7 @@ history, the move itself and what it means:
|
|||
|
||||
## Flagger v1.7.0 is out
|
||||
|
||||
This release comes with support for manually approving the traffic weight increase. Starting with this version, Flagger can be used with Linkerd v2.10 and its new Viz addon, please see [the updated guide](https://docs.flagger.app/tutorials/linkerd-progressive-delivery). Thanks to the Linkerd team for contributing to Flagger.
|
||||
This release comes with support for manually approving the traffic weight increase. Starting with this version, Flagger can be used with Linkerd v2.10 and its new Viz addon, please see [the updated guide](/flagger/tutorials/linkerd-progressive-delivery). Thanks to the Linkerd team for contributing to Flagger.
|
||||
|
||||
## Upcoming events
|
||||
|
||||
|
|
|
@ -106,7 +106,7 @@ where they can be reviewed and approved before going to production.
|
|||
|
||||
## Flagger v1.8.0
|
||||
|
||||
Until now [Flagger](https://flagger.app) was compatible with Linkerd
|
||||
Until now [Flagger](/flagger) was compatible with Linkerd
|
||||
which implements the [Service Mesh Interface](https://smi-spec.io) (SMI) `v1alpha1`.
|
||||
Starting with v1.8.0, Flagger extends the SMI support for the
|
||||
`v1alpha2` and `v1alpha3` APIs.
|
||||
|
|
|
@ -96,7 +96,7 @@ Kubernetes 1.19 is supported.
|
|||
This release comes with support for [Kuma Service
|
||||
Mesh](https://kuma.io/). For more details see the [Kuma
|
||||
Progressive Delivery
|
||||
tutorial](https://docs.flagger.app/tutorials/kuma-progressive-delivery).
|
||||
tutorial](/flagger/tutorials/kuma-progressive-delivery).
|
||||
|
||||
{{< imgproc flagger-kuma-canary Resize 600x >}}
|
||||
Kuma Progressive Delivery with Flagger
|
||||
|
|
|
@ -118,7 +118,7 @@ services are adjusted accordingly.
|
|||

|
||||
|
||||
If you want to get started right away, have a look at [our
|
||||
tutorial](https://docs.flagger.app/tutorials/gatewayapi-progressive-delivery),
|
||||
tutorial](/flagger/tutorials/gatewayapi-progressive-delivery),
|
||||
which shows you how to use Contour's Gateway API implementation and Flagger to automate canary
|
||||
deployments. It won't take long to follow, but will convey how powerful
|
||||
this integration is.
|
||||
|
|
|
@ -77,7 +77,7 @@ project within Kubernetes. Be sure to check out the blog post to find
|
|||
out how to integrate this into your setups.
|
||||
|
||||
The Flux community is happy and proud that
|
||||
[Flagger](https://docs.flagger.app/) is part of our effort
|
||||
[Flagger](/flagger/) is part of our effort
|
||||
to bring GitOps solutions to the world.
|
||||
|
||||
### Flux "Maintainers' Focus" Project Board
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
---
|
||||
title: Flagger
|
||||
manualLink: https://docs.flagger.app
|
||||
manualLinkTarget: _blank
|
||||
manualLink: /flagger
|
||||
weight: 150
|
||||
---
|
||||
|
|
|
@ -52,7 +52,7 @@ New app release can be automatically delivered to staging using Flux's [image up
|
|||
For production, you may choose to manually approve app version bumps by configuring Flux
|
||||
to push the changes to a new branch from which you can create a pull request.
|
||||
You can limit the impact of an issue that escaped staging testing by using Flagger's
|
||||
[canary releases](https://docs.flagger.app/).
|
||||
[canary releases](/flagger/).
|
||||
|
||||
Changes to infrastructure can be disruptive and could cause cluster-wide outages.
|
||||
These config changes can be made in steps, first you merge a change to staging,
|
||||
|
|
|
@ -1269,7 +1269,7 @@ If you are on GitHub, and are struggling to get started using GitHub Actions, or
|
|||
[kubecfg yaml parser]: https://github.com/bitnami/kubecfg/blob/master/lib/kubecfg.libsonnet#L25
|
||||
[bitnami-labs/kube-libsonnet]: https://github.com/bitnami-labs/kube-libsonnet
|
||||
[example 10.3 jsonnet]: https://github.com/kingdonb/any_old_app/blob/release/0.10.3/manifests/example.jsonnet
|
||||
[Manual Gating]: https://docs.flagger.app/usage/webhooks#manual-gating
|
||||
[Manual Gating]: /flagger/usage/webhooks#manual-gating
|
||||
[flux create tenant]: /cmd/flux_create_tenant
|
||||
[Flux 2 Multi-Tenancy Guide]: https://github.com/fluxcd/flux2-multi-tenancy
|
||||
[Mozilla SOPS]: /guides/mozilla-sops/
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
# Table of contents
|
||||
|
||||
* [Introduction](README.md)
|
||||
* [FAQ](faq.md)
|
||||
|
||||
## Install
|
||||
|
||||
* [Flagger Install on Kubernetes](install/flagger-install-on-kubernetes.md)
|
||||
* [Flagger Install on GKE Istio](install/flagger-install-on-google-cloud.md)
|
||||
* [Flagger Install on EKS App Mesh](install/flagger-install-on-eks-appmesh.md)
|
||||
* [Flagger Install on Alibaba ServiceMesh](install/flagger-install-on-alibaba-servicemesh.md)
|
||||
|
||||
## Usage
|
||||
|
||||
* [How it works](usage/how-it-works.md)
|
||||
* [Deployment Strategies](usage/deployment-strategies.md)
|
||||
* [Metrics Analysis](usage/metrics.md)
|
||||
* [Webhooks](usage/webhooks.md)
|
||||
* [Alerting](usage/alerting.md)
|
||||
* [Monitoring](usage/monitoring.md)
|
||||
|
||||
## Tutorials
|
||||
|
||||
* [Istio Canary Deployments](tutorials/istio-progressive-delivery.md)
|
||||
* [Istio A/B Testing](tutorials/istio-ab-testing.md)
|
||||
* [Linkerd Canary Deployments](tutorials/linkerd-progressive-delivery.md)
|
||||
* [App Mesh Canary Deployments](tutorials/appmesh-progressive-delivery.md)
|
||||
* [Contour Canary Deployments](tutorials/contour-progressive-delivery.md)
|
||||
* [Gloo Canary Deployments](tutorials/gloo-progressive-delivery.md)
|
||||
* [NGINX Canary Deployments](tutorials/nginx-progressive-delivery.md)
|
||||
* [Skipper Canary Deployments](tutorials/skipper-progressive-delivery.md)
|
||||
* [Traefik Canary Deployments](tutorials/traefik-progressive-delivery.md)
|
||||
* [Open Service Mesh Deployments](tutorials/osm-progressive-delivery.md)
|
||||
* [Kuma Canary Deployments](tutorials/kuma-progressive-delivery.md)
|
||||
* [Gateway API Canary Deployments](tutorials/gatewayapi-progressive-delivery.md)
|
||||
* [Blue/Green Deployments](tutorials/kubernetes-blue-green.md)
|
||||
* [Canary analysis with Prometheus Operator](tutorials/prometheus-operator.md)
|
||||
* [Zero downtime deployments](tutorials/zero-downtime-deployments.md)
|
||||
|
||||
## Dev
|
||||
|
||||
* [Development Guide](dev/dev-guide.md)
|
||||
* [Release Guide](dev/release-guide.md)
|
||||
* [Upgrade Guide](dev/upgrade-guide.md)
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
---
|
||||
description: Flagger is a progressive delivery Kubernetes operator
|
||||
type: docs
|
||||
weight: 1
|
||||
title: Flagger
|
||||
---
|
||||
|
||||
[Flagger](https://github.com/fluxcd/flagger) is a progressive delivery tool that automates the release
|
||||
process for applications running on Kubernetes. It reduces the risk of introducing a new software
|
||||
version in production by gradually shifting traffic to the new version while measuring metrics
|
||||
and running conformance tests.
|
||||
|
||||
Flagger implements several deployment strategies (Canary releases, A/B testing, Blue/Green mirroring)
|
||||
using a service mesh (App Mesh, Istio, Linkerd, Kuma, Open Service Mesh)
|
||||
or an ingress controller (Contour, Gloo, NGINX, Skipper, Traefik) for traffic routing.
|
||||
For release analysis, Flagger can query Prometheus, InfluxDB, Datadog, New Relic, CloudWatch, Stackdriver
|
||||
or Graphite and for alerting it uses Slack, MS Teams, Discord and Rocket.
|
||||
|
||||

|
||||
|
||||
Flagger can be configured with Kubernetes custom resources and is compatible with
|
||||
any CI/CD solutions made for Kubernetes. Since Flagger is declarative and reacts to Kubernetes events,
|
||||
it can be used in **GitOps** pipelines together with tools like Flux, JenkinsX, Carvel, Argo, etc.
|
||||
|
||||
Flagger is a [Cloud Native Computing Foundation](https://cncf.io/) project
|
||||
and part of [Flux](/) family of GitOps tools.
|
||||
|
||||
## Getting started
|
||||
|
||||
To get started with Flagger, choose one of the supported routing providers and
|
||||
[install](install/flagger-install-on-kubernetes.md) Flagger with Helm or Kustomize.
|
||||
|
||||
After installing Flagger, you can follow one of these tutorials to get started:
|
||||
|
||||
**Service mesh tutorials**
|
||||
|
||||
* [Istio](tutorials/istio-progressive-delivery.md)
|
||||
* [Linkerd](tutorials/linkerd-progressive-delivery.md)
|
||||
* [AWS App Mesh](tutorials/appmesh-progressive-delivery.md)
|
||||
* [Open Service Mesh](tutorials/osm-progressive-delivery.md)
|
||||
* [Kuma](tutorials/kuma-progressive-delivery.md)
|
||||
|
||||
**Ingress controller tutorials**
|
||||
|
||||
* [Contour](tutorials/contour-progressive-delivery.md)
|
||||
* [Gloo](tutorials/gloo-progressive-delivery.md)
|
||||
* [NGINX Ingress](tutorials/nginx-progressive-delivery.md)
|
||||
* [Skipper Ingress](tutorials/skipper-progressive-delivery.md)
|
||||
* [Traefik](tutorials/traefik-progressive-delivery.md)
|
||||
|
||||
**Hands-on GitOps workshops**
|
||||
|
||||
* [Istio](https://github.com/stefanprodan/gitops-istio)
|
||||
* [Linkerd](https://helm.workshop.flagger.dev)
|
||||
* [AWS App Mesh](https://eks.handson.flagger.dev)
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
type: docs
|
||||
title: Dev
|
||||
weight: 5
|
||||
---
|
|
@ -0,0 +1,212 @@
|
|||
---
|
||||
type: docs
|
||||
title: Development Guide
|
||||
weight: 1
|
||||
---
|
||||
|
||||
This document describes how to build, test and run Flagger from source.
|
||||
|
||||
## Setup dev environment
|
||||
|
||||
Flagger is written in Go and uses Go modules for dependency management.
|
||||
|
||||
On your dev machine install the following tools:
|
||||
|
||||
* go >= 1.17
|
||||
* git >;= 2.20
|
||||
* bash >= 5.0
|
||||
* make >= 3.81
|
||||
* kubectl >= 1.22
|
||||
* kustomize >= 4.4
|
||||
* helm >= 3.0
|
||||
* docker >= 19.03
|
||||
|
||||
You'll also need a Kubernetes cluster for testing Flagger.
|
||||
You can use Minikube, Kind, Docker desktop or any remote cluster (AKS/EKS/GKE/etc) Kubernetes version 1.22 or newer.
|
||||
|
||||
To start contributing to Flagger, fork the [repository](https://github.com/fluxcd/flagger) on GitHub.
|
||||
|
||||
Create a dir inside your `GOPATH`:
|
||||
|
||||
```bash
|
||||
mkdir -p $GOPATH/src/github.com/fluxcd
|
||||
```
|
||||
|
||||
Clone your fork:
|
||||
|
||||
```bash
|
||||
cd $GOPATH/src/github.com/fluxcd
|
||||
git clone https://github.com/YOUR_USERNAME/flagger
|
||||
cd flagger
|
||||
```
|
||||
|
||||
Set Flagger repository as upstream:
|
||||
|
||||
```bash
|
||||
git remote add upstream https://github.com/fluxcd/flagger.git
|
||||
```
|
||||
|
||||
Sync your fork regularly to keep it up-to-date with upstream:
|
||||
|
||||
```bash
|
||||
git fetch upstream
|
||||
git checkout main
|
||||
git merge upstream/main
|
||||
```
|
||||
|
||||
## Build
|
||||
|
||||
Download Go modules:
|
||||
|
||||
```bash
|
||||
go mod download
|
||||
```
|
||||
|
||||
Build Flagger binary:
|
||||
|
||||
```bash
|
||||
make build
|
||||
```
|
||||
|
||||
Build load tester binary:
|
||||
|
||||
```bash
|
||||
make loadtester-build
|
||||
```
|
||||
|
||||
## Code changes
|
||||
|
||||
We require all commits to be signed. By signing off with your signature, you
|
||||
certify that you wrote the patch or otherwise have the right to contribute the
|
||||
material by the rules of the [DCO](https://raw.githubusercontent.com/fluxcd/flagger/main/DCO).
|
||||
|
||||
If your `user.name` and `user.email` are configured in your Git config,
|
||||
you can sign your commit automatically with:
|
||||
|
||||
```bash
|
||||
git commit -s
|
||||
```
|
||||
|
||||
Before submitting a PR, make sure your changes are covered by unit tests.
|
||||
|
||||
If you made changes to `go.mod` run:
|
||||
|
||||
```bash
|
||||
go mod tidy
|
||||
```
|
||||
|
||||
If you made changes to `pkg/apis` regenerate Kubernetes client sets with:
|
||||
|
||||
```bash
|
||||
make codegen
|
||||
```
|
||||
|
||||
Run code formatters:
|
||||
|
||||
```bash
|
||||
go install golang.org/x/tools/cmd/goimports@latest
|
||||
|
||||
make fmt
|
||||
```
|
||||
|
||||
Run unit tests:
|
||||
|
||||
```bash
|
||||
make test
|
||||
```
|
||||
|
||||
## API changes
|
||||
|
||||
If you made changes to `pkg/apis` regenerate the Kubernetes client sets with:
|
||||
|
||||
```bash
|
||||
make codegen
|
||||
```
|
||||
|
||||
Update the validation spec in `artifacts/flagger/crd.yaml` and run:
|
||||
|
||||
```bash
|
||||
make crd
|
||||
```
|
||||
|
||||
Note that any change to the CRDs must be accompanied by an update to the Open API schema.
|
||||
|
||||
## Manual testing
|
||||
|
||||
Install a service mesh and/or an ingress controller on your cluster
|
||||
and deploy Flagger using one of the install options
|
||||
[listed here](/flagger/install/flagger-install-on-kubernetes).
|
||||
|
||||
If you made changes to the CRDs, apply your local copy with:
|
||||
|
||||
```bash
|
||||
kubectl apply -f artifacts/flagger/crd.yaml
|
||||
```
|
||||
|
||||
Shutdown the Flagger instance installed on your cluster (replace the namespace with your mesh/ingress one):
|
||||
|
||||
```bash
|
||||
kubectl -n istio-system scale deployment/flagger --replicas=0
|
||||
```
|
||||
|
||||
Port forward to your Prometheus instance:
|
||||
|
||||
```bash
|
||||
kubectl -n istio-system port-forward svc/prometheus 9090:9090
|
||||
```
|
||||
|
||||
Run Flagger locally against your remote cluster by specifying a kubeconfig path:
|
||||
|
||||
```bash
|
||||
go run cmd/flagger/ -kubeconfig=$HOME/.kube/config \
|
||||
-log-level=info \
|
||||
-mesh-provider=istio \
|
||||
-metrics-server=http://localhost:9090
|
||||
```
|
||||
|
||||
Another option to manually test your changes is to build and push the image to your container registry:
|
||||
|
||||
```bash
|
||||
make build
|
||||
docker build -t <YOUR-DOCKERHUB-USERNAME>/flagger:<YOUR-TAG> .
|
||||
docker push <YOUR-DOCKERHUB-USERNAME>/flagger:<YOUR-TAG>
|
||||
```
|
||||
|
||||
Deploy your image on the cluster and scale up Flagger:
|
||||
|
||||
```bash
|
||||
kubectl -n istio-system set image deployment/flagger flagger=<YOUR-DOCKERHUB-USERNAME>/flagger:<YOUR-TAG>
|
||||
kubectl -n istio-system scale deployment/flagger --replicas=1
|
||||
```
|
||||
|
||||
Now you can use one of the [tutorials](/flagger/) to manually test your changes.
|
||||
|
||||
## Integration testing
|
||||
|
||||
Flagger end-to-end tests can be run locally with [Kubernetes Kind](https://github.com/kubernetes-sigs/kind).
|
||||
|
||||
Create a Kind cluster:
|
||||
|
||||
```bash
|
||||
kind create cluster
|
||||
```
|
||||
|
||||
Build Flagger container image and load it on the cluster:
|
||||
|
||||
```bash
|
||||
make build
|
||||
docker build -t test/flagger:latest .
|
||||
kind load docker-image test/flagger:latest
|
||||
```
|
||||
|
||||
|
||||
Run the Istio e2e tests:
|
||||
|
||||
```bash
|
||||
./test/istio/run.sh
|
||||
```
|
||||
|
||||
For each service mesh and ingress controller, there is a dedicated e2e test suite,
|
||||
choose one that matches your changes from this [list](https://github.com/fluxcd/flagger/tree/main/test).
|
||||
|
||||
When you open a pull request on Flagger repo, the unit and integration tests will be run in CI.
|
|
@ -0,0 +1,38 @@
|
|||
---
|
||||
type: docs
|
||||
title: Release Guide
|
||||
weight: 1
|
||||
---
|
||||
|
||||
This document describes how to release Flagger.
|
||||
|
||||
## Release
|
||||
|
||||
To release a new Flagger version (e.g. `2.0.0`) follow these steps:
|
||||
|
||||
* create a branch `git checkout -b prep-2.0.0`
|
||||
* set the version in code and manifests `TAG=2.0.0 make version-set`
|
||||
* commit changes and merge PR
|
||||
* checkout master `git checkout main && git pull`
|
||||
* tag master `make release`
|
||||
|
||||
## CI
|
||||
|
||||
After the tag has been pushed to GitHub, the CI release pipeline does the following:
|
||||
|
||||
* creates a GitHub release
|
||||
* pushes the Flagger binary and change log to GitHub release
|
||||
* pushes the Flagger container image to Docker Hub
|
||||
* pushes the Helm chart to github-pages branch
|
||||
* GitHub pages publishes the new chart version on the Helm repository
|
||||
|
||||
## Docs
|
||||
|
||||
The documentation [website](/flagger) is built from the `docs` branch.
|
||||
|
||||
After a Flagger release, publish the docs with:
|
||||
|
||||
* `git checkout main && git pull`
|
||||
* `git checkout docs`
|
||||
* `git rebase main`
|
||||
* `git push origin docs`
|
|
@ -0,0 +1,94 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 3
|
||||
title: Upgrade Guide
|
||||
---
|
||||
|
||||
This document describes how to upgrade Flagger.
|
||||
|
||||
## Upgrade canaries v1alpha3 to v1beta1
|
||||
|
||||
Canary CRD changes in `canaries.flagger.app/v1beta1`:
|
||||
|
||||
* the `spec.canaryAnalysis` field has been deprecated and replaced with `spec.analysis`
|
||||
* the `spec.analysis.interval` and `spec.analysis.threshold` fields are required
|
||||
* the `status.lastAppliedSpec` and `status.lastPromotedSpec` hashing algorithm changed to `hash/fnv`
|
||||
* the `spec.analysis.alerts` array can reference `alertproviders.flagger.app/v1beta1` resources
|
||||
* the `spec.analysis.metrics[].templateRef` can reference a `metrictemplate.flagger.app/v1beta1` resource
|
||||
* the `metric.threshold` field has been deprecated and replaced with `metric.thresholdRange`
|
||||
* the `metric.query` field has been deprecated and replaced with `metric.templateRef`
|
||||
* the `spec.ingressRef.apiVersion` accepts `networking.k8s.io/v1beta1`
|
||||
* the `spec.targetRef` can reference `DaemonSet` kind
|
||||
* the `spec.service.meshName` field has been deprecated and no longer used for `provider: appmesh:v1beta2`
|
||||
|
||||
Upgrade procedure:
|
||||
|
||||
* install the `v1beta1` CRDs
|
||||
* update Flagger deployment
|
||||
* replace `apiVersion: flagger.app/v1alpha3` with `apiVersion: flagger.app/v1beta1` in all canary manifests
|
||||
* replace `spec.canaryAnalysis` with `spec.analysis` in all canary manifests
|
||||
* update canary manifests in cluster
|
||||
|
||||
**Note** that after upgrading Flagger, all canaries will be triggered as the hash value used for tracking changes is computed differently. You can set `spec.skipAnalysis: true` in all canary manifests before upgrading Flagger, do the upgrade, wait for Flagger to finish the no-op promotions and finally set `skipAnalysis` to `false`.
|
||||
|
||||
Update builtin metrics:
|
||||
|
||||
* replace `threshold` with `thresholdRange.min` for request-success-rate
|
||||
* replace `threshold` with `thresholdRange.max` for request-duration
|
||||
|
||||
```yaml
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
## Istio telemetry v2
|
||||
|
||||
Istio 1.5 comes with a breaking change for Flagger uses. In Istio telemetry v2 the metric `istio_request_duration_seconds_bucket` has been removed and replaced with `istio_request_duration_milliseconds_bucket` and this breaks the `request-duration` metric check.
|
||||
|
||||
If are using **Istio 1.4**, you can create a metric template using the old duration metric like this:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: latency
|
||||
namespace: istio-system
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://prometheus.istio-system:9090
|
||||
query: |
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum(
|
||||
rate(
|
||||
istio_request_duration_seconds_bucket{
|
||||
reporter="destination",
|
||||
destination_workload_namespace="{{ namespace }}",
|
||||
destination_workload=~"{{ target }}"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
) by (le)
|
||||
)
|
||||
```
|
||||
|
||||
In the canary manifests, replace the `request-duration` metric with `latency`:
|
||||
|
||||
```yaml
|
||||
metrics:
|
||||
- name: latency
|
||||
templateRef:
|
||||
name: latency
|
||||
namespace: istio-system
|
||||
thresholdRange:
|
||||
max: 0.500
|
||||
interval: 1m
|
||||
```
|
||||
|
|
@ -0,0 +1,947 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 10
|
||||
title: FAQ
|
||||
---
|
||||
|
||||
## Deployment Strategies
|
||||
|
||||
#### Which deployment strategies are supported by Flagger?
|
||||
|
||||
Flagger implements the following deployment strategies:
|
||||
|
||||
* [Canary Release](usage/deployment-strategies.md#canary-release)
|
||||
* [A/B Testing](usage/deployment-strategies.md#ab-testing)
|
||||
* [Blue/Green](usage/deployment-strategies.md#bluegreen-deployments)
|
||||
* [Blue/Green Mirroring](usage/deployment-strategies.md#bluegreen-with-traffic-mirroring)
|
||||
|
||||
#### When should I use A/B testing instead of progressive traffic shifting?
|
||||
|
||||
For frontend applications that require session affinity, you should use HTTP headers or
|
||||
cookie match conditions to ensure a set of users will stay on the same version for
|
||||
the whole duration of the canary analysis.
|
||||
|
||||
#### Can I use Flagger to manage applications that live outside of a service mesh?
|
||||
|
||||
For applications that are not deployed on a service mesh,
|
||||
Flagger can orchestrate Blue/Green style deployments with Kubernetes L4 networking.
|
||||
|
||||
#### When can I use traffic mirroring?
|
||||
|
||||
Traffic mirroring can be used for Blue/Green deployment strategy or a pre-stage in a Canary release.
|
||||
Traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service.
|
||||
Mirroring should be used for requests that are **idempotent**
|
||||
or capable of being processed twice (once by the primary and once by the canary).
|
||||
|
||||
#### How to retry a failed release?
|
||||
|
||||
A canary analysis is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment/DaemonSet PodSpec (metadata, container image, command, ports, env, resources, etc)
|
||||
* ConfigMaps mounted as volumes or mapped to environment variables
|
||||
* Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
To retry a release you can add or change an annotation on the pod template:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
annotations:
|
||||
timestamp: "2020-03-10T14:24:48+0000"
|
||||
```
|
||||
|
||||
#### How to change replicas for a deployment when not using HPA?
|
||||
|
||||
To change replicas for a deployment when not using HPA, you have to update the canary deployment with the desired replica count
|
||||
and trigger an analysis by annotating the template. After the analysis finishes, Flagger will promote the `spec.replicas` changes to the primary deployment.
|
||||
|
||||
Example:
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
spec:
|
||||
replicas: 4 #update replicas
|
||||
template:
|
||||
metadata:
|
||||
annotations:
|
||||
timestamp: "2022-02-10T14:24:48+0000" #add annotation to trigger analysis
|
||||
```
|
||||
|
||||
#### Why is there a window of downtime during the canary initializing process when analysis is disabled?
|
||||
|
||||
A window of downtime is the intended behavior when the analysis is disabled. This allows instant rollback and also mimics the way
|
||||
a Kubernetes deployment initialization works. To avoid this, enable the analysis (`skipAnalysis: true`), wait for the initialization
|
||||
to finish, and disable it afterward (`skipAnalysis: false`).
|
||||
|
||||
## Kubernetes services
|
||||
|
||||
#### How is an application exposed inside the cluster?
|
||||
|
||||
Assuming the app name is `podinfo`, you can define a canary like:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
service:
|
||||
# service name (optional)
|
||||
name: podinfo
|
||||
# ClusterIP port number (required)
|
||||
port: 9898
|
||||
# container port name or number
|
||||
targetPort: http
|
||||
# port name can be http or grpc (default http)
|
||||
portName: http
|
||||
```
|
||||
|
||||
If the `service.name` is not specified, then `targetRef.name` is used for
|
||||
the apex domain and canary/primary services name prefix.
|
||||
You should treat the service name as an immutable field; changing its could result in routing conflicts.
|
||||
|
||||
Based on the canary spec service, Flagger generates the following Kubernetes ClusterIP service:
|
||||
|
||||
* `<service.name>.<namespace>.svc.cluster.local`
|
||||
|
||||
selector `app=<name>-primary`
|
||||
|
||||
* `<service.name>-primary.<namespace>.svc.cluster.local`
|
||||
|
||||
selector `app=<name>-primary`
|
||||
|
||||
* `<service.name>-canary.<namespace>.svc.cluster.local`
|
||||
|
||||
selector `app=<name>`
|
||||
|
||||
This ensures that traffic coming from a namespace outside the mesh to `podinfo.test:9898`
|
||||
will be routed to the latest stable release of your app.
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: podinfo
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: podinfo-primary
|
||||
ports:
|
||||
- name: http
|
||||
port: 9898
|
||||
protocol: TCP
|
||||
targetPort: http
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: podinfo-primary
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: podinfo-primary
|
||||
ports:
|
||||
- name: http
|
||||
port: 9898
|
||||
protocol: TCP
|
||||
targetPort: http
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: podinfo-canary
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: podinfo
|
||||
ports:
|
||||
- name: http
|
||||
port: 9898
|
||||
protocol: TCP
|
||||
targetPort: http
|
||||
```
|
||||
|
||||
The `podinfo-canary.test:9898` address is available only during the canary analysis
|
||||
and can be used for conformance testing or load testing.
|
||||
|
||||
## Multiple ports
|
||||
|
||||
#### My application listens on multiple ports. How can I expose them inside the cluster?
|
||||
|
||||
If port discovery is enabled, Flagger scans the deployment spec and extracts the containers ports excluding
|
||||
the port specified in the canary service and Envoy sidecar ports.
|
||||
These ports will be used when generating the ClusterIP services.
|
||||
|
||||
For a deployment that exposes two ports:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9899"
|
||||
spec:
|
||||
containers:
|
||||
- name: app
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
- containerPort: 9090
|
||||
```
|
||||
|
||||
You can enable port discovery so that Prometheus will be able to reach port `9090` over mTLS:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
spec:
|
||||
service:
|
||||
# container port used for canary analysis
|
||||
port: 8080
|
||||
# port name can be http or grpc (default http)
|
||||
portName: http
|
||||
# add all the other container ports
|
||||
# to the ClusterIP services (default false)
|
||||
portDiscovery: true
|
||||
trafficPolicy:
|
||||
tls:
|
||||
mode: ISTIO_MUTUAL
|
||||
```
|
||||
|
||||
Both port `8080` and `9090` will be added to the ClusterIP services.
|
||||
|
||||
## Label selectors
|
||||
|
||||
#### What labels selectors are supported by Flagger?
|
||||
|
||||
The target deployment must have a single label selector in the format `app: <DEPLOYMENT-NAME>`:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: podinfo
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: podinfo
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: podinfo
|
||||
```
|
||||
|
||||
Besides `app`, Flagger supports `name` and `app.kubernetes.io/name` selectors.
|
||||
If you use a different convention, you can specify your label with the `-selector-labels` flag.
|
||||
For example:
|
||||
|
||||
```
|
||||
flagger \
|
||||
-selector-labels=service,name,app.kubernetes.io/name \
|
||||
...
|
||||
```
|
||||
|
||||
#### Is pod affinity and anti affinity supported?
|
||||
|
||||
Flagger will rewrite the first value in each match expression,
|
||||
defined in the target deployment's pod anti-affinity and topology spread constraints,
|
||||
satisfying the following two requirements when creating, or updating, the primary deployment:
|
||||
|
||||
* The key in the match expression must be one of the labels specified by the parameter selector-labels.
|
||||
The default labels are `app`,`name`,`app.kubernetes.io/name`.
|
||||
* The value must match the name of the target deployment.
|
||||
|
||||
The rewrite done by Flagger in these cases is to suffix the value with `-primary`.
|
||||
This rewrite can be used to spread the pods created by the canary
|
||||
and primary deployments across different availability zones.
|
||||
|
||||
Example target deployment:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: podinfo
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: podinfo
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: podinfo
|
||||
spec:
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchExpressions:
|
||||
- key: app
|
||||
operator: In
|
||||
values:
|
||||
- podinfo
|
||||
topologyKey: topology.kubernetes.io/zone
|
||||
```
|
||||
|
||||
Example of generated primary deployment:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: podinfo-primary
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: podinfo-primary
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: podinfo-primary
|
||||
spec:
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchExpressions:
|
||||
- key: app
|
||||
operator: In
|
||||
values:
|
||||
- podinfo-primary
|
||||
topologyKey: topology.kubernetes.io/zone
|
||||
```
|
||||
|
||||
It is also possible to use a different label than the `app`, `name` or `app.kubernetes.io/name`.
|
||||
|
||||
Anti affinity example(using a different label):
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: podinfo
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: podinfo
|
||||
affinity: podinfo
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: podinfo
|
||||
affinity: podinfo
|
||||
spec:
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchLabels:
|
||||
affinity: podinfo
|
||||
topologyKey: topology.kubernetes.io/zone
|
||||
```
|
||||
|
||||
## Metrics
|
||||
|
||||
#### How does Flagger measure the request success rate and duration?
|
||||
|
||||
By default, Flagger measures the request success rate and duration using Prometheus queries.
|
||||
|
||||
#### HTTP requests success rate percentage
|
||||
|
||||
Spec:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
Istio query:
|
||||
|
||||
```javascript
|
||||
sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="destination",
|
||||
destination_workload_namespace=~"$namespace",
|
||||
destination_workload=~"$workload",
|
||||
response_code!~"5.*"
|
||||
}[$interval]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="destination",
|
||||
destination_workload_namespace=~"$namespace",
|
||||
destination_workload=~"$workload"
|
||||
}[$interval]
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
Envoy query (App Mesh):
|
||||
|
||||
```javascript
|
||||
sum(
|
||||
rate(
|
||||
envoy_cluster_upstream_rq{
|
||||
kubernetes_namespace="$namespace",
|
||||
kubernetes_pod_name=~"$workload",
|
||||
envoy_response_code!~"5.*"
|
||||
}[$interval]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
envoy_cluster_upstream_rq{
|
||||
kubernetes_namespace="$namespace",
|
||||
kubernetes_pod_name=~"$workload"
|
||||
}[$interval]
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
Envoy query (Contour and Gloo):
|
||||
|
||||
```javascript
|
||||
sum(
|
||||
rate(
|
||||
envoy_cluster_upstream_rq{
|
||||
envoy_cluster_name=~"$namespace-$workload",
|
||||
envoy_response_code!~"5.*"
|
||||
}[$interval]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
envoy_cluster_upstream_rq{
|
||||
envoy_cluster_name=~"$namespace-$workload",
|
||||
}[$interval]
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
#### HTTP requests milliseconds duration P99
|
||||
|
||||
Spec:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: request-duration
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
Istio query:
|
||||
|
||||
```javascript
|
||||
histogram_quantile(0.99,
|
||||
sum(
|
||||
irate(
|
||||
istio_request_duration_milliseconds_bucket{
|
||||
reporter="destination",
|
||||
destination_workload=~"$workload",
|
||||
destination_workload_namespace=~"$namespace"
|
||||
}[$interval]
|
||||
)
|
||||
) by (le)
|
||||
)
|
||||
```
|
||||
|
||||
Envoy query (App Mesh, Contour and Gloo):
|
||||
|
||||
```javascript
|
||||
histogram_quantile(0.99,
|
||||
sum(
|
||||
irate(
|
||||
envoy_cluster_upstream_rq_time_bucket{
|
||||
kubernetes_pod_name=~"$workload",
|
||||
kubernetes_namespace=~"$namespace"
|
||||
}[$interval]
|
||||
)
|
||||
) by (le)
|
||||
)
|
||||
```
|
||||
|
||||
> **Note** that the metric interval should be lower or equal to the control loop interval.
|
||||
|
||||
#### Can I use custom metrics?
|
||||
|
||||
The analysis can be extended with metrics provided by Prometheus, Datadog, AWS CloudWatch, New Relic and Graphite.
|
||||
For more details on how custom metrics can be used, please read the [metrics docs](usage/metrics.md).
|
||||
|
||||
#### Istio Gateway API
|
||||
|
||||
If you're using Istio with Gateway API, the Prometheus query needs to include `reporter="source"`. For example, to calculate HTTP requests error percentage, the query would be:
|
||||
|
||||
```javascript
|
||||
100 - sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="source",
|
||||
destination_workload_namespace=~"$namespace",
|
||||
destination_workload=~"$workload",
|
||||
response_code!~"5.*"
|
||||
}[$interval]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="source",
|
||||
destination_workload_namespace=~"$namespace",
|
||||
destination_workload=~"$workload"
|
||||
}[$interval]
|
||||
)
|
||||
) * 100
|
||||
```
|
||||
|
||||
## Istio routing
|
||||
|
||||
#### How does Flagger interact with Istio?
|
||||
|
||||
Flagger creates an Istio Virtual Service and Destination Rules based on the Canary service spec.
|
||||
The service configuration lets you expose an app inside or outside the mesh. You can also define traffic policies,
|
||||
HTTP match conditions, URI rewrite rules, CORS policies, timeout and retries.
|
||||
|
||||
The following spec exposes the `frontend` workload inside the mesh on `frontend.test.svc.cluster.local:9898`
|
||||
and outside the mesh on `frontend.example.com`. You'll have to specify an Istio ingress gateway for external hosts.
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: frontend
|
||||
namespace: test
|
||||
spec:
|
||||
service:
|
||||
# container port
|
||||
port: 9898
|
||||
# service port name (optional, will default to "http")
|
||||
portName: http-frontend
|
||||
# Istio gateways (optional)
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
- mesh
|
||||
# Istio virtual service host names (optional)
|
||||
hosts:
|
||||
- frontend.example.com
|
||||
# Istio traffic policy
|
||||
trafficPolicy:
|
||||
tls:
|
||||
# use ISTIO_MUTUAL when mTLS is enabled
|
||||
mode: DISABLE
|
||||
# HTTP match conditions (optional)
|
||||
match:
|
||||
- uri:
|
||||
prefix: /
|
||||
# HTTP rewrite (optional)
|
||||
rewrite:
|
||||
uri: /
|
||||
# Istio retry policy (optional)
|
||||
retries:
|
||||
attempts: 3
|
||||
perTryTimeout: 1s
|
||||
retryOn: "gateway-error,connect-failure,refused-stream"
|
||||
# Add headers (optional)
|
||||
headers:
|
||||
request:
|
||||
add:
|
||||
x-some-header: "value"
|
||||
# cross-origin resource sharing policy (optional)
|
||||
corsPolicy:
|
||||
allowOrigin:
|
||||
- example.com
|
||||
allowMethods:
|
||||
- GET
|
||||
allowCredentials: false
|
||||
allowHeaders:
|
||||
- x-some-header
|
||||
maxAge: 24h
|
||||
```
|
||||
|
||||
For the above spec Flagger will generate the following virtual service:
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
name: frontend
|
||||
namespace: test
|
||||
ownerReferences:
|
||||
- apiVersion: flagger.app/v1beta1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: Canary
|
||||
name: podinfo
|
||||
uid: 3a4a40dd-3875-11e9-8e1d-42010a9c0fd1
|
||||
spec:
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
- mesh
|
||||
hosts:
|
||||
- frontend.example.com
|
||||
- frontend
|
||||
http:
|
||||
- corsPolicy:
|
||||
allowHeaders:
|
||||
- x-some-header
|
||||
allowMethods:
|
||||
- GET
|
||||
allowOrigin:
|
||||
- example.com
|
||||
maxAge: 24h
|
||||
headers:
|
||||
request:
|
||||
add:
|
||||
x-some-header: "value"
|
||||
match:
|
||||
- uri:
|
||||
prefix: /
|
||||
rewrite:
|
||||
uri: /
|
||||
route:
|
||||
- destination:
|
||||
host: podinfo-primary
|
||||
weight: 100
|
||||
- destination:
|
||||
host: podinfo-canary
|
||||
weight: 0
|
||||
retries:
|
||||
attempts: 3
|
||||
perTryTimeout: 1s
|
||||
retryOn: "gateway-error,connect-failure,refused-stream"
|
||||
```
|
||||
|
||||
For each destination in the virtual service a rule is generated:
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: DestinationRule
|
||||
metadata:
|
||||
name: frontend-primary
|
||||
namespace: test
|
||||
spec:
|
||||
host: frontend-primary
|
||||
trafficPolicy:
|
||||
tls:
|
||||
mode: DISABLE
|
||||
---
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: DestinationRule
|
||||
metadata:
|
||||
name: frontend-canary
|
||||
namespace: test
|
||||
spec:
|
||||
host: frontend-canary
|
||||
trafficPolicy:
|
||||
tls:
|
||||
mode: DISABLE
|
||||
```
|
||||
|
||||
Flagger keeps in sync the virtual service and destination rules with the canary service spec.
|
||||
Any direct modification to the virtual service spec will be overwritten.
|
||||
|
||||
To expose a workload inside the mesh on `http://backend.test.svc.cluster.local:9898`,
|
||||
the service spec can contain only the container port and the traffic policy:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: backend
|
||||
namespace: test
|
||||
spec:
|
||||
service:
|
||||
port: 9898
|
||||
trafficPolicy:
|
||||
tls:
|
||||
mode: DISABLE
|
||||
```
|
||||
|
||||
Based on the above spec, Flagger will create several ClusterIP services like:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: backend-primary
|
||||
ownerReferences:
|
||||
- apiVersion: flagger.app/v1beta1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: Canary
|
||||
name: backend
|
||||
uid: 2ca1a9c7-2ef6-11e9-bd01-42010a9c0145
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 9898
|
||||
protocol: TCP
|
||||
targetPort: 9898
|
||||
selector:
|
||||
app: backend-primary
|
||||
```
|
||||
|
||||
Flagger works for user facing apps exposed outside the cluster via an ingress gateway and for backend HTTP APIs
|
||||
that are accessible only from inside the mesh.
|
||||
|
||||
If `Delegation` is enabled, Flagger would generate Istio VirtualService without hosts and gateway,
|
||||
making the service compatible with Istio delegation.
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: backend
|
||||
namespace: test
|
||||
spec:
|
||||
service:
|
||||
delegation: true
|
||||
port: 9898
|
||||
targetRef:
|
||||
apiVersion: v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
analysis:
|
||||
interval: 15s
|
||||
threshold: 15
|
||||
maxWeight: 30
|
||||
stepWeight: 10
|
||||
```
|
||||
|
||||
Based on the above spec, Flagger will create the following virtual service:
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
name: backend
|
||||
namespace: test
|
||||
ownerReferences:
|
||||
- apiVersion: flagger.app/v1beta1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: Canary
|
||||
name: backend
|
||||
uid: 58562662-5e10-4512-b269-2b789c1b30fe
|
||||
spec:
|
||||
http:
|
||||
- route:
|
||||
- destination:
|
||||
host: podinfo-primary
|
||||
weight: 100
|
||||
- destination:
|
||||
host: podinfo-canary
|
||||
weight: 0
|
||||
```
|
||||
|
||||
Therefore, the following virtual service forwards the traffic to `/podinfo` by the above delegate VirtualService.
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
name: frontend
|
||||
namespace: test
|
||||
spec:
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
- mesh
|
||||
hosts:
|
||||
- frontend.example.com
|
||||
- frontend
|
||||
http:
|
||||
- match:
|
||||
- uri:
|
||||
prefix: /podinfo
|
||||
rewrite:
|
||||
uri: /
|
||||
delegate:
|
||||
name: backend
|
||||
namespace: test
|
||||
```
|
||||
|
||||
Note that pilot env `PILOT_ENABLE_VIRTUAL_SERVICE_DELEGATE` must also be set.
|
||||
For the use of Istio Delegation, you can refer to the documentation of
|
||||
[Virtual Service](https://istio.io/latest/docs/reference/config/networking/virtual-service/#Delegate)
|
||||
and [pilot environment variables](https://istio.io/latest/docs/reference/commands/pilot-discovery/#envvars).
|
||||
|
||||
## Istio Ingress Gateway
|
||||
|
||||
#### How can I expose multiple canaries on the same external domain?
|
||||
|
||||
Assuming you have two apps -- one that serves the main website and one that serves its REST API --
|
||||
you can define a canary object for each app as:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: website
|
||||
spec:
|
||||
service:
|
||||
port: 8080
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
hosts:
|
||||
- my-site.com
|
||||
match:
|
||||
- uri:
|
||||
prefix: /
|
||||
rewrite:
|
||||
uri: /
|
||||
---
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: webapi
|
||||
spec:
|
||||
service:
|
||||
port: 8080
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
hosts:
|
||||
- my-site.com
|
||||
match:
|
||||
- uri:
|
||||
prefix: /api
|
||||
rewrite:
|
||||
uri: /
|
||||
```
|
||||
|
||||
Based on the above configuration, Flagger will create two virtual services bounded
|
||||
to the same ingress gateway and external host.
|
||||
Istio Pilot will
|
||||
[merge](https://istio.io/help/ops/traffic-management/deploy-guidelines/#multiple-virtual-services-and-destination-rules-for-the-same-host)
|
||||
the two services and the website rule will be moved to the end of the list in the merged configuration.
|
||||
|
||||
Note that host merging only works if the canaries are bounded to an ingress gateway other than the `mesh` gateway.
|
||||
|
||||
## Istio Mutual TLS
|
||||
|
||||
#### How can I enable mTLS for a canary?
|
||||
|
||||
When deploying Istio with global mTLS enabled, you have to set the TLS mode to `ISTIO_MUTUAL`:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
spec:
|
||||
service:
|
||||
trafficPolicy:
|
||||
tls:
|
||||
mode: ISTIO_MUTUAL
|
||||
```
|
||||
|
||||
If you run Istio in permissive mode, you can disable TLS:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
spec:
|
||||
service:
|
||||
trafficPolicy:
|
||||
tls:
|
||||
mode: DISABLE
|
||||
```
|
||||
|
||||
#### If Flagger is outside of the mesh, how can it start the load test?
|
||||
|
||||
In order for Flagger to be able to call the load tester service from outside the mesh,
|
||||
you need to disable mTLS:
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.istio.io/v1beta1
|
||||
kind: DestinationRule
|
||||
metadata:
|
||||
name: flagger-loadtester
|
||||
namespace: test
|
||||
spec:
|
||||
host: "flagger-loadtester.test.svc.cluster.local"
|
||||
trafficPolicy:
|
||||
tls:
|
||||
mode: DISABLE
|
||||
---
|
||||
apiVersion: security.istio.io/v1beta1
|
||||
kind: PeerAuthentication
|
||||
metadata:
|
||||
name: flagger-loadtester
|
||||
namespace: test
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: flagger-loadtester
|
||||
mtls:
|
||||
mode: DISABLE
|
||||
```
|
||||
|
||||
## ExternalDNS
|
||||
|
||||
### Can I use annotations?
|
||||
|
||||
Flagger propagates annotations (and labels) to all the generated apex,
|
||||
primary and canary objects. This allows using external-dns annotations.
|
||||
|
||||
You can configure Flagger to set annotations with:
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
service:
|
||||
apex:
|
||||
annotations:
|
||||
external-dns.alpha.kubernetes.io/hostname: "mydomain.com"
|
||||
primary:
|
||||
annotations:
|
||||
external-dns.alpha.kubernetes.io/hostname: "primary.mydomain.com"
|
||||
canary:
|
||||
annotations:
|
||||
external-dns.alpha.kubernetes.io/hostname: "canary.mydomain.com"
|
||||
```
|
||||
|
||||
### Multiple sources and Istio
|
||||
|
||||
**/!\\** The apex annotations are added to both the generated Kubernetes Services and the generated Istio
|
||||
VirtualServices objects. If you have configured external-dns to use both sources,
|
||||
this will create conflicts!
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
containers:
|
||||
args:
|
||||
- --source=service # choose only one
|
||||
- --source=istio-virtualservice # of these two
|
||||
```
|
||||
|
||||
[Checkout ExternalDNS documentation](https://github.com/kubernetes-sigs/external-dns/blob/master/docs/tutorials/istio.md)
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 3
|
||||
title: Install
|
||||
---
|
|
@ -0,0 +1,140 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 4
|
||||
title: Flagger Install on Alibaba ServiceMesh
|
||||
---
|
||||
|
||||
This guide walks you through setting up Flagger on Alibaba ServiceMesh.
|
||||
|
||||
## Prerequisites
|
||||
- Created an ACK([Alibabacloud Container Service for Kubernetes](https://cs.console.aliyun.com)) cluster instance.
|
||||
- Created an ASM([Alibaba ServiceMesh](https://servicemesh.console.aliyun.com)) instance, and added ACK cluster.
|
||||
|
||||
### Variables declaration
|
||||
- `$ACK_CONFIG`: the kubeconfig file path of ACK, which be treated as`$HOME/.kube/config` in the rest of guide.
|
||||
- `$MESH_CONFIG`: the kubeconfig file path of ASM.
|
||||
- `$ISTIO_RELEASE`: see https://github.com/istio/istio/releases
|
||||
- `$FLAGGER_SRC`: see https://github.com/fluxcd/flagger
|
||||
|
||||
## Install Prometheus
|
||||
Install Prometheus:
|
||||
|
||||
```bash
|
||||
kubectl apply -f $ISTIO_RELEASE/samples/addons/prometheus.yaml
|
||||
```
|
||||
|
||||
it' same with the below cmd:
|
||||
|
||||
```bash
|
||||
kubectl --kubeconfig "$ACK_CONFIG" apply -f $ISTIO_RELEASE/samples/addons/prometheus.yaml
|
||||
```
|
||||
|
||||
Append the below configs to `scrape_configs` in prometheus configmap, to support telemetry:
|
||||
```yaml
|
||||
scrape_configs:
|
||||
# Mixer scrapping. Defaults to Prometheus and mixer on same namespace.
|
||||
- job_name: 'istio-mesh'
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
namespaces:
|
||||
names:
|
||||
- istio-system
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
||||
action: keep
|
||||
regex: istio-telemetry;prometheus
|
||||
# Scrape config for envoy stats
|
||||
- job_name: 'envoy-stats'
|
||||
metrics_path: /stats/prometheus
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_name]
|
||||
action: keep
|
||||
regex: '.*-envoy-prom'
|
||||
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
||||
action: replace
|
||||
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||
replacement: $1:15090
|
||||
target_label: __address__
|
||||
- action: labeldrop
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
- source_labels: [__meta_kubernetes_namespace]
|
||||
action: replace
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
action: replace
|
||||
target_label: pod_name
|
||||
- job_name: 'istio-policy'
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
namespaces:
|
||||
names:
|
||||
- istio-system
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
||||
action: keep
|
||||
regex: istio-policy;http-policy-monitoring
|
||||
- job_name: 'istio-telemetry'
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
namespaces:
|
||||
names:
|
||||
- istio-system
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
||||
action: keep
|
||||
regex: istio-telemetry;http-monitoring
|
||||
- job_name: 'pilot'
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
namespaces:
|
||||
names:
|
||||
- istio-system
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
||||
action: keep
|
||||
regex: istiod;http-monitoring
|
||||
- source_labels: [__meta_kubernetes_service_label_app]
|
||||
target_label: app
|
||||
- job_name: 'sidecar-injector'
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
namespaces:
|
||||
names:
|
||||
- istio-system
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
||||
action: keep
|
||||
regex: istio-sidecar-injector;http-monitoring
|
||||
```
|
||||
|
||||
## Install Flagger
|
||||
|
||||
Add Flagger Helm repository:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
helm repo update
|
||||
```
|
||||
|
||||
Install Flagger's Canary CRD:
|
||||
|
||||
```yaml
|
||||
kubectl apply -f $FLAGGER_SRC/artifacts/flagger/crd.yaml
|
||||
```
|
||||
|
||||
Deploy Flagger for Alibaba ServiceMesh:
|
||||
|
||||
```bash
|
||||
cp $MESH_CONFIG kubeconfig
|
||||
kubectl -n istio-system create secret generic istio-kubeconfig --from-file kubeconfig
|
||||
kubectl -n istio-system label secret istio-kubeconfig istio/multiCluster=true
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=istio-system \
|
||||
--set crd.create=false \
|
||||
--set meshProvider=istio \
|
||||
--set metricsServer=http://prometheus:9090 \
|
||||
--set istio.kubeconfig.secretName=istio-kubeconfig \
|
||||
--set istio.kubeconfig.key=kubeconfig
|
||||
```
|
|
@ -0,0 +1,155 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 3
|
||||
title: Flagger Install on EKS App Mesh
|
||||
---
|
||||
|
||||
This guide walks you through setting up Flagger and AWS App Mesh on EKS.
|
||||
|
||||
## App Mesh
|
||||
|
||||
The App Mesh integration with EKS is made out of the following components:
|
||||
|
||||
* Kubernetes custom resources
|
||||
* `mesh.appmesh.k8s.aws` defines a logical boundary for network traffic between the services
|
||||
* `virtualnode.appmesh.k8s.aws` defines a logical pointer to a Kubernetes workload
|
||||
* `virtualservice.appmesh.k8s.aws` defines the routing rules for a workload inside the mesh
|
||||
* CRD controller - keeps the custom resources in sync with the App Mesh control plane
|
||||
* Admission controller - injects the Envoy sidecar and assigns Kubernetes pods to App Mesh virtual nodes
|
||||
* Telemetry service - Prometheus instance that collects and stores Envoy's metrics
|
||||
|
||||
## Create a Kubernetes cluster
|
||||
|
||||
In order to create an EKS cluster you can use [eksctl](https://eksctl.io).
|
||||
Eksctl is an open source command-line utility made by Weaveworks in collaboration with Amazon.
|
||||
|
||||
On MacOS you can install eksctl with Homebrew:
|
||||
|
||||
```bash
|
||||
brew tap weaveworks/tap
|
||||
brew install weaveworks/tap/eksctl
|
||||
```
|
||||
|
||||
Create an EKS cluster with:
|
||||
|
||||
```bash
|
||||
eksctl create cluster --name=appmesh \
|
||||
--region=us-west-2 \
|
||||
--nodes 3 \
|
||||
--node-volume-size=120 \
|
||||
--appmesh-access
|
||||
```
|
||||
|
||||
The above command will create a two nodes cluster with
|
||||
App Mesh [IAM policy](https://docs.aws.amazon.com/app-mesh/latest/userguide/MESH_IAM_user_policies.html)
|
||||
attached to the EKS node instance role.
|
||||
|
||||
Verify the install with:
|
||||
|
||||
```bash
|
||||
kubectl get nodes
|
||||
```
|
||||
|
||||
## Install Helm
|
||||
|
||||
Install the [Helm](https://docs.helm.sh/using_helm/#installing-helm) v3 command-line tool:
|
||||
|
||||
```text
|
||||
brew install helm
|
||||
```
|
||||
|
||||
Add the EKS repository to Helm:
|
||||
|
||||
```bash
|
||||
helm repo add eks https://aws.github.io/eks-charts
|
||||
```
|
||||
|
||||
## Enable horizontal pod auto-scaling
|
||||
|
||||
Install the Horizontal Pod Autoscaler (HPA) metrics provider:
|
||||
|
||||
```bash
|
||||
helm upgrade -i metrics-server stable/metrics-server \
|
||||
--namespace kube-system \
|
||||
--set args[0]=--kubelet-preferred-address-types=InternalIP
|
||||
```
|
||||
|
||||
After a minute, the metrics API should report CPU and memory usage for pods. You can very the metrics API with:
|
||||
|
||||
```bash
|
||||
kubectl -n kube-system top pods
|
||||
```
|
||||
|
||||
## Install the App Mesh components
|
||||
|
||||
Install the App Mesh CRDs:
|
||||
|
||||
```bash
|
||||
kubectl apply -k github.com/aws/eks-charts/stable/appmesh-controller//crds?ref=master
|
||||
```
|
||||
|
||||
Create the `appmesh-system` namespace:
|
||||
|
||||
```bash
|
||||
kubectl create ns appmesh-system
|
||||
```
|
||||
|
||||
Install the App Mesh controller:
|
||||
|
||||
```bash
|
||||
helm upgrade -i appmesh-controller eks/appmesh-controller \
|
||||
--wait --namespace appmesh-system
|
||||
```
|
||||
|
||||
In order to collect the App Mesh metrics that Flagger needs to run the canary analysis,
|
||||
you'll need to setup a Prometheus instance to scrape the Envoy sidecars.
|
||||
|
||||
Install the App Mesh Prometheus:
|
||||
|
||||
```bash
|
||||
helm upgrade -i appmesh-prometheus eks/appmesh-prometheus \
|
||||
--wait --namespace appmesh-system
|
||||
```
|
||||
|
||||
## Install Flagger
|
||||
|
||||
Add Flagger Helm repository:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
```
|
||||
|
||||
Install Flagger's Canary CRD:
|
||||
|
||||
```yaml
|
||||
kubectl apply -f https://raw.githubusercontent.com/fluxcd/flagger/main/artifacts/flagger/crd.yaml
|
||||
```
|
||||
|
||||
Deploy Flagger in the _**appmesh-system**_ namespace:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=appmesh-system \
|
||||
--set crd.create=false \
|
||||
--set meshProvider=appmesh:v1beta2 \
|
||||
--set metricsServer=http://appmesh-prometheus:9090
|
||||
```
|
||||
|
||||
## Install Grafana
|
||||
|
||||
Deploy App Mesh Grafana that comes with a dashboard for monitoring Flagger's canary releases:
|
||||
|
||||
```bash
|
||||
helm upgrade -i appmesh-grafana eks/appmesh-grafana \
|
||||
--namespace appmesh-system
|
||||
```
|
||||
|
||||
You can access Grafana using port forwarding:
|
||||
|
||||
```bash
|
||||
kubectl -n appmesh-system port-forward svc/appmesh-grafana 3000:3000
|
||||
```
|
||||
|
||||
Now that you have Flagger running, you can try the
|
||||
[App Mesh canary deployments tutorial](/flagger/usage/appmesh-progressive-delivery).
|
||||
|
|
@ -0,0 +1,404 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 2
|
||||
title: Flagger Install on GKE Istio
|
||||
---
|
||||
|
||||
This guide walks you through setting up Flagger and Istio on Google Kubernetes Engine.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
You will be creating a cluster on Google’s Kubernetes Engine \(GKE\), if you don’t have an account you can sign up [here](https://cloud.google.com/free/) for free credits.
|
||||
|
||||
Login into Google Cloud, create a project and enable billing for it.
|
||||
|
||||
Install the [gcloud](https://cloud.google.com/sdk/) command line utility and configure your project with `gcloud init`.
|
||||
|
||||
Set the default project \(replace `PROJECT_ID` with your own project\):
|
||||
|
||||
```text
|
||||
gcloud config set project PROJECT_ID
|
||||
```
|
||||
|
||||
Set the default compute region and zone:
|
||||
|
||||
```text
|
||||
gcloud config set compute/region us-central1
|
||||
gcloud config set compute/zone us-central1-a
|
||||
```
|
||||
|
||||
Enable the Kubernetes and Cloud DNS services for your project:
|
||||
|
||||
```text
|
||||
gcloud services enable container.googleapis.com
|
||||
gcloud services enable dns.googleapis.com
|
||||
```
|
||||
|
||||
Install the kubectl command-line tool:
|
||||
|
||||
```text
|
||||
gcloud components install kubectl
|
||||
```
|
||||
|
||||
## GKE cluster setup
|
||||
|
||||
Create a cluster with the Istio add-on:
|
||||
|
||||
```bash
|
||||
K8S_VERSION=$(gcloud container get-server-config --format=json \
|
||||
| jq -r '.validMasterVersions[0]')
|
||||
|
||||
gcloud beta container clusters create istio \
|
||||
--cluster-version=${K8S_VERSION} \
|
||||
--zone=us-central1-a \
|
||||
--num-nodes=2 \
|
||||
--machine-type=n1-highcpu-4 \
|
||||
--preemptible \
|
||||
--no-enable-cloud-logging \
|
||||
--no-enable-cloud-monitoring \
|
||||
--disk-size=30 \
|
||||
--enable-autorepair \
|
||||
--addons=HorizontalPodAutoscaling,Istio \
|
||||
--istio-config=auth=MTLS_PERMISSIVE
|
||||
```
|
||||
|
||||
The above command will create a default node pool consisting of two `n1-highcpu-4` \(vCPU: 4, RAM 3.60GB, DISK: 30GB\) preemptible VMs. Preemptible VMs are up to 80% cheaper than regular instances and are terminated and replaced after a maximum of 24 hours.
|
||||
|
||||
Set up credentials for `kubectl`:
|
||||
|
||||
```bash
|
||||
gcloud container clusters get-credentials istio
|
||||
```
|
||||
|
||||
Create a cluster admin role binding:
|
||||
|
||||
```bash
|
||||
kubectl create clusterrolebinding "cluster-admin-$(whoami)" \
|
||||
--clusterrole=cluster-admin \
|
||||
--user="$(gcloud config get-value core/account)"
|
||||
```
|
||||
|
||||
Validate your setup with:
|
||||
|
||||
```bash
|
||||
kubectl -n istio-system get svc
|
||||
```
|
||||
|
||||
In a couple of seconds GCP should allocate an external IP to the `istio-ingressgateway` service.
|
||||
|
||||
## Cloud DNS setup
|
||||
|
||||
You will need an internet domain and access to the registrar to change the name servers to Google Cloud DNS.
|
||||
|
||||
Create a managed zone named `istio` in Cloud DNS \(replace `example.com` with your domain\):
|
||||
|
||||
```bash
|
||||
gcloud dns managed-zones create \
|
||||
--dns-name="example.com." \
|
||||
--description="Istio zone" "istio"
|
||||
```
|
||||
|
||||
Look up your zone's name servers:
|
||||
|
||||
```bash
|
||||
gcloud dns managed-zones describe istio
|
||||
```
|
||||
|
||||
Update your registrar's name server records with the records returned by the above command.
|
||||
|
||||
Wait for the name servers to change \(replace `example.com` with your domain\):
|
||||
|
||||
```bash
|
||||
watch dig +short NS example.com
|
||||
```
|
||||
|
||||
Create a static IP address named `istio-gateway` using the Istio ingress IP:
|
||||
|
||||
```bash
|
||||
export GATEWAY_IP=$(kubectl -n istio-system get svc/istio-ingressgateway -ojson \
|
||||
| jq -r .status.loadBalancer.ingress[0].ip)
|
||||
|
||||
gcloud compute addresses create istio-gateway --addresses ${GATEWAY_IP} --region us-central1
|
||||
```
|
||||
|
||||
Create the following DNS records \(replace `example.com` with your domain\):
|
||||
|
||||
```bash
|
||||
DOMAIN="example.com"
|
||||
|
||||
gcloud dns record-sets transaction start --zone=istio
|
||||
|
||||
gcloud dns record-sets transaction add --zone=istio \
|
||||
--name="${DOMAIN}" --ttl=300 --type=A ${GATEWAY_IP}
|
||||
|
||||
gcloud dns record-sets transaction add --zone=istio \
|
||||
--name="www.${DOMAIN}" --ttl=300 --type=A ${GATEWAY_IP}
|
||||
|
||||
gcloud dns record-sets transaction add --zone=istio \
|
||||
--name="*.${DOMAIN}" --ttl=300 --type=A ${GATEWAY_IP}
|
||||
|
||||
gcloud dns record-sets transaction execute --zone istio
|
||||
```
|
||||
|
||||
Verify that the wildcard DNS is working \(replace `example.com` with your domain\):
|
||||
|
||||
```bash
|
||||
watch host test.example.com
|
||||
```
|
||||
|
||||
## Install Helm
|
||||
|
||||
Install the [Helm](https://docs.helm.sh/using_helm/#installing-helm) command-line tool:
|
||||
|
||||
```text
|
||||
brew install kubernetes-helm
|
||||
```
|
||||
|
||||
Create a service account and a cluster role binding for Tiller:
|
||||
|
||||
```bash
|
||||
kubectl -n kube-system create sa tiller
|
||||
|
||||
kubectl create clusterrolebinding tiller-cluster-rule \
|
||||
--clusterrole=cluster-admin \
|
||||
--serviceaccount=kube-system:tiller
|
||||
```
|
||||
|
||||
Deploy Tiller in the `kube-system` namespace:
|
||||
|
||||
```bash
|
||||
helm init --service-account tiller
|
||||
```
|
||||
|
||||
You should consider using SSL between Helm and Tiller, for more information on securing your Helm installation see [docs.helm.sh](https://docs.helm.sh/using_helm/#securing-your-helm-installation).
|
||||
|
||||
## Install cert-manager
|
||||
|
||||
Jetstack's [cert-manager](https://github.com/jetstack/cert-manager) is a Kubernetes operator that automatically creates and manages TLS certs issued by Let’s Encrypt.
|
||||
|
||||
You'll be using cert-manager to provision a wildcard certificate for the Istio ingress gateway.
|
||||
|
||||
Install cert-manager's CRDs:
|
||||
|
||||
```bash
|
||||
CERT_REPO=https://raw.githubusercontent.com/jetstack/cert-manager
|
||||
|
||||
kubectl apply -f ${CERT_REPO}/release-0.10/deploy/manifests/00-crds.yaml
|
||||
```
|
||||
|
||||
Create the cert-manager namespace and disable resource validation:
|
||||
|
||||
```bash
|
||||
kubectl create namespace cert-manager
|
||||
|
||||
kubectl label namespace cert-manager certmanager.k8s.io/disable-validation=true
|
||||
```
|
||||
|
||||
Install cert-manager with Helm:
|
||||
|
||||
```bash
|
||||
helm repo add jetstack https://charts.jetstack.io && \
|
||||
helm repo update && \
|
||||
helm upgrade -i cert-manager \
|
||||
--namespace cert-manager \
|
||||
--version v0.10.0 \
|
||||
jetstack/cert-manager
|
||||
```
|
||||
|
||||
## Istio Gateway TLS setup
|
||||
|
||||

|
||||
|
||||
Create a generic Istio Gateway to expose services outside the mesh on HTTPS:
|
||||
|
||||
```bash
|
||||
REPO=https://raw.githubusercontent.com/fluxcd/flagger/main
|
||||
|
||||
kubectl apply -f ${REPO}/artifacts/gke/istio-gateway.yaml
|
||||
```
|
||||
|
||||
Create a service account with Cloud DNS admin role \(replace `my-gcp-project` with your project ID\):
|
||||
|
||||
```bash
|
||||
GCP_PROJECT=my-gcp-project
|
||||
|
||||
gcloud iam service-accounts create dns-admin \
|
||||
--display-name=dns-admin \
|
||||
--project=${GCP_PROJECT}
|
||||
|
||||
gcloud iam service-accounts keys create ./gcp-dns-admin.json \
|
||||
--iam-account=dns-admin@${GCP_PROJECT}.iam.gserviceaccount.com \
|
||||
--project=${GCP_PROJECT}
|
||||
|
||||
gcloud projects add-iam-policy-binding ${GCP_PROJECT} \
|
||||
--member=serviceAccount:dns-admin@${GCP_PROJECT}.iam.gserviceaccount.com \
|
||||
--role=roles/dns.admin
|
||||
```
|
||||
|
||||
Create a Kubernetes secret with the GCP Cloud DNS admin key:
|
||||
|
||||
```bash
|
||||
kubectl create secret generic cert-manager-credentials \
|
||||
--from-file=./gcp-dns-admin.json \
|
||||
--namespace=istio-system
|
||||
```
|
||||
|
||||
Create a letsencrypt issuer for CloudDNS \(replace `email@example.com` with a valid email address and `my-gcp-project`with your project ID\):
|
||||
|
||||
```yaml
|
||||
apiVersion: certmanager.k8s.io/v1alpha1
|
||||
kind: Issuer
|
||||
metadata:
|
||||
name: letsencrypt-prod
|
||||
namespace: istio-system
|
||||
spec:
|
||||
acme:
|
||||
server: https://acme-v02.api.letsencrypt.org/directory
|
||||
email: email@example.com
|
||||
privateKeySecretRef:
|
||||
name: letsencrypt-prod
|
||||
dns01:
|
||||
providers:
|
||||
- name: cloud-dns
|
||||
clouddns:
|
||||
serviceAccountSecretRef:
|
||||
name: cert-manager-credentials
|
||||
key: gcp-dns-admin.json
|
||||
project: my-gcp-project
|
||||
```
|
||||
|
||||
Save the above resource as letsencrypt-issuer.yaml and then apply it:
|
||||
|
||||
```text
|
||||
kubectl apply -f ./letsencrypt-issuer.yaml
|
||||
```
|
||||
|
||||
Create a wildcard certificate \(replace `example.com` with your domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: certmanager.k8s.io/v1alpha1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: istio-gateway
|
||||
namespace: istio-system
|
||||
spec:
|
||||
secretName: istio-ingressgateway-certs
|
||||
issuerRef:
|
||||
name: letsencrypt-prod
|
||||
commonName: "*.example.com"
|
||||
acme:
|
||||
config:
|
||||
- dns01:
|
||||
provider: cloud-dns
|
||||
domains:
|
||||
- "*.example.com"
|
||||
- "example.com"
|
||||
```
|
||||
|
||||
Save the above resource as istio-gateway-cert.yaml and then apply it:
|
||||
|
||||
```text
|
||||
kubectl apply -f ./istio-gateway-cert.yaml
|
||||
```
|
||||
|
||||
In a couple of seconds cert-manager should fetch a wildcard certificate from letsencrypt.org:
|
||||
|
||||
```text
|
||||
kubectl -n istio-system describe certificate istio-gateway
|
||||
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal CertIssued 1m52s cert-manager Certificate issued successfully
|
||||
```
|
||||
|
||||
Recreate Istio ingress gateway pods:
|
||||
|
||||
```bash
|
||||
kubectl -n istio-system get pods -l istio=ingressgateway
|
||||
```
|
||||
|
||||
Note that Istio gateway doesn't reload the certificates from the TLS secret on cert-manager renewal. Since the GKE cluster is made out of preemptible VMs the gateway pods will be replaced once every 24h, if your not using preemptible nodes then you need to manually delete the gateway pods every two months before the certificate expires.
|
||||
|
||||
## Install Prometheus
|
||||
|
||||
The GKE Istio add-on does not include a Prometheus instance that scrapes the Istio telemetry service. Because Flagger uses the Istio HTTP metrics to run the canary analysis you have to deploy the following Prometheus configuration that's similar to the one that comes with the official Istio Helm chart.
|
||||
|
||||
Find the GKE Istio version with:
|
||||
|
||||
```bash
|
||||
kubectl -n istio-system get deploy istio-pilot -oyaml | grep image:
|
||||
```
|
||||
|
||||
Install Prometheus in istio-system namespace:
|
||||
|
||||
```bash
|
||||
kubectl -n istio-system apply -f \
|
||||
https://storage.googleapis.com/gke-release/istio/release/1.0.6-gke.3/patches/install-prometheus.yaml
|
||||
```
|
||||
|
||||
## Install Flagger and Grafana
|
||||
|
||||
Add Flagger Helm repository:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
```
|
||||
|
||||
Install Flagger's Canary CRD:
|
||||
|
||||
```yaml
|
||||
kubectl apply -f https://raw.githubusercontent.com/fluxcd/flagger/main/artifacts/flagger/crd.yaml
|
||||
```
|
||||
|
||||
Deploy Flagger in the `istio-system` namespace with Slack notifications enabled:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=istio-system \
|
||||
--set crd.create=false \
|
||||
--set metricsServer=http://prometheus.istio-system:9090 \
|
||||
--set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
|
||||
--set slack.channel=general \
|
||||
--set slack.user=flagger
|
||||
```
|
||||
|
||||
Deploy Grafana in the `istio-system` namespace:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger-grafana flagger/grafana \
|
||||
--namespace=istio-system \
|
||||
--set url=http://prometheus.istio-system:9090 \
|
||||
--set user=admin \
|
||||
--set password=replace-me
|
||||
```
|
||||
|
||||
Expose Grafana through the public gateway by creating a virtual service \(replace `example.com` with your domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: istio-system
|
||||
spec:
|
||||
hosts:
|
||||
- "grafana.example.com"
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
http:
|
||||
- route:
|
||||
- destination:
|
||||
host: flagger-grafana
|
||||
```
|
||||
|
||||
Save the above resource as grafana-virtual-service.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./grafana-virtual-service.yaml
|
||||
```
|
||||
|
||||
Navigate to `http://grafana.example.com` in your browser and you should be redirected to the HTTPS version.
|
||||
|
|
@ -0,0 +1,260 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 1
|
||||
title: Flagger Install on Kubernetes
|
||||
---
|
||||
|
||||
This guide walks you through setting up Flagger on a Kubernetes cluster with Helm v3 or Kustomize.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer.
|
||||
|
||||
## Install Flagger with Helm
|
||||
|
||||
Add Flagger Helm repository:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
```
|
||||
|
||||
Install Flagger's Canary CRD:
|
||||
|
||||
```yaml
|
||||
kubectl apply -f https://raw.githubusercontent.com/fluxcd/flagger/main/artifacts/flagger/crd.yaml
|
||||
```
|
||||
|
||||
Deploy Flagger for Istio:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=istio-system \
|
||||
--set crd.create=false \
|
||||
--set meshProvider=istio \
|
||||
--set metricsServer=http://prometheus:9090
|
||||
```
|
||||
|
||||
Note that Flagger depends on Istio telemetry and Prometheus, if you're installing
|
||||
Istio with istioctl then you should be using the
|
||||
[default profile](https://istio.io/docs/setup/additional-setup/config-profiles/).
|
||||
|
||||
For Istio multi-cluster shared control plane you can install Flagger on each remote cluster and set the
|
||||
Istio control plane host cluster kubeconfig:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=istio-system \
|
||||
--set crd.create=false \
|
||||
--set meshProvider=istio \
|
||||
--set metricsServer=http://istio-cluster-prometheus:9090 \
|
||||
--set istio.kubeconfig.secretName=istio-kubeconfig \
|
||||
--set istio.kubeconfig.key=kubeconfig
|
||||
```
|
||||
|
||||
Note that the Istio kubeconfig must be stored in a Kubernetes secret with a data key named `kubeconfig`.
|
||||
For more details on how to configure Istio multi-cluster
|
||||
credentials read the [Istio docs](https://istio.io/docs/setup/install/multicluster/shared-vpn/#credentials).
|
||||
|
||||
Deploy Flagger for Linkerd:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=linkerd \
|
||||
--set crd.create=false \
|
||||
--set meshProvider=linkerd \
|
||||
--set metricsServer=http://linkerd-prometheus:9090
|
||||
```
|
||||
|
||||
Deploy Flagger for App Mesh:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=appmesh-system \
|
||||
--set crd.create=false \
|
||||
--set meshProvider=appmesh \
|
||||
--set metricsServer=http://appmesh-prometheus:9090
|
||||
```
|
||||
|
||||
Deploy Flagger for **Open Service Mesh (OSM)** (requires OSM to have been installed with Prometheus):
|
||||
|
||||
```console
|
||||
$ helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=osm-system \
|
||||
--set crd.create=false \
|
||||
--set meshProvider=osm \
|
||||
--set metricsServer=http://osm-prometheus.osm-system.svc:7070
|
||||
```
|
||||
|
||||
You can install Flagger in any namespace as long as it can talk to the Prometheus service on port 9090.
|
||||
|
||||
For ingress controllers, the install instructions are:
|
||||
|
||||
* [Contour](/flagger/tutorials/contour-progressive-delivery)
|
||||
* [Gloo](/flagger/tutorials/gloo-progressive-delivery)
|
||||
* [NGINX](/flagger/tutorials/nginx-progressive-delivery)
|
||||
* [Skipper](/flagger/tutorials/skipper-progressive-delivery)
|
||||
* [Traefik](/flagger/tutorials/traefik-progressive-delivery)
|
||||
|
||||
You can use the helm template command and apply the generated yaml with kubectl:
|
||||
|
||||
```bash
|
||||
# generate
|
||||
helm fetch --untar --untardir . flagger/flagger &&
|
||||
helm template flagger ./flagger \
|
||||
--namespace=istio-system \
|
||||
--set metricsServer=http://prometheus.istio-system:9090 \
|
||||
> flagger.yaml
|
||||
|
||||
# apply
|
||||
kubectl apply -f flagger.yaml
|
||||
```
|
||||
|
||||
To uninstall the Flagger release with Helm run:
|
||||
|
||||
```text
|
||||
helm delete flagger
|
||||
```
|
||||
|
||||
The command removes all the Kubernetes components associated with the chart and deletes the release.
|
||||
|
||||
> **Note** that on uninstall the Canary CRD will not be removed. Deleting the CRD will make Kubernetes
|
||||
> remove all the objects owned by Flagger like Istio virtual services, Kubernetes deployments and ClusterIP services.
|
||||
|
||||
If you want to remove all the objects created by Flagger you have delete the Canary CRD with kubectl:
|
||||
|
||||
```text
|
||||
kubectl delete crd canaries.flagger.app
|
||||
```
|
||||
|
||||
## Install Grafana with Helm
|
||||
|
||||
Flagger comes with a Grafana dashboard made for monitoring the canary analysis.
|
||||
|
||||
Deploy Grafana in the _**istio-system**_ namespace:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger-grafana flagger/grafana \
|
||||
--namespace=istio-system \
|
||||
--set url=http://prometheus.istio-system:9090 \
|
||||
--set user=admin \
|
||||
--set password=change-me
|
||||
```
|
||||
|
||||
Or use helm template command and apply the generated yaml with kubectl:
|
||||
|
||||
```bash
|
||||
# generate
|
||||
helm fetch --untar --untardir . flagger/grafana &&
|
||||
helm template flagger-grafana ./grafana \
|
||||
--namespace=istio-system \
|
||||
> flagger-grafana.yaml
|
||||
|
||||
# apply
|
||||
kubectl apply -f flagger-grafana.yaml
|
||||
```
|
||||
|
||||
You can access Grafana using port forwarding:
|
||||
|
||||
```bash
|
||||
kubectl -n istio-system port-forward svc/flagger-grafana 3000:80
|
||||
```
|
||||
|
||||
## Install Flagger with Kustomize
|
||||
|
||||
As an alternative to Helm, Flagger can be installed with Kustomize **3.5.0** or newer.
|
||||
|
||||
**Service mesh specific installers**
|
||||
|
||||
Install Flagger for Istio:
|
||||
|
||||
```bash
|
||||
kustomize build https://github.com/fluxcd/flagger/kustomize/istio?ref=main | kubectl apply -f -
|
||||
```
|
||||
|
||||
Install Flagger for AWS App Mesh:
|
||||
|
||||
```bash
|
||||
kustomize build https://github.com/fluxcd/flagger/kustomize/appmesh?ref=main | kubectl apply -f -
|
||||
```
|
||||
|
||||
This deploys Flagger and sets the metrics server URL to App Mesh's Prometheus instance.
|
||||
|
||||
Install Flagger for Linkerd:
|
||||
|
||||
```bash
|
||||
kustomize build https://github.com/fluxcd/flagger/kustomize/linkerd?ref=main | kubectl apply -f -
|
||||
```
|
||||
|
||||
This deploys Flagger in the `linkerd` namespace and sets the metrics server URL to Linkerd's Prometheus instance.
|
||||
|
||||
Install Flagger for Open Service Mesh:
|
||||
|
||||
```bash
|
||||
kustomize build https://github.com/fluxcd/flagger/kustomize/osm?ref=main | kubectl apply -f -
|
||||
```
|
||||
|
||||
This deploys Flagger in the `osm-system` namespace and sets the metrics server URL to OSM's Prometheus instance.
|
||||
|
||||
If you want to install a specific Flagger release, add the version number to the URL:
|
||||
|
||||
```bash
|
||||
kustomize build https://github.com/fluxcd/flagger/kustomize/linkerd?ref=v1.0.0 | kubectl apply -f -
|
||||
```
|
||||
|
||||
**Generic installer**
|
||||
|
||||
Install Flagger and Prometheus for Contour, Gloo, NGINX, Skipper, or Traefik ingress:
|
||||
|
||||
```bash
|
||||
kustomize build https://github.com/fluxcd/flagger/kustomize/kubernetes?ref=main | kubectl apply -f -
|
||||
```
|
||||
|
||||
This deploys Flagger and Prometheus in the `flagger-system` namespace,
|
||||
sets the metrics server URL to `http://flagger-prometheus.flagger-system:9090` and the mesh provider to `kubernetes`.
|
||||
|
||||
The Prometheus instance has a two hours data retention and is configured to scrape all pods in your cluster
|
||||
that have the `prometheus.io/scrape: "true"` annotation.
|
||||
|
||||
To target a different provider you can specify it in the canary custom resource:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: app
|
||||
namespace: test
|
||||
spec:
|
||||
# can be: kubernetes, istio, linkerd, appmesh, nginx, skipper, gloo, traefik, osm
|
||||
# use the kubernetes provider for Blue/Green style deployments
|
||||
provider: nginx
|
||||
```
|
||||
|
||||
**Customized installer**
|
||||
|
||||
Create a kustomization file using Flagger as base and patch the container args:
|
||||
|
||||
```bash
|
||||
cat > kustomization.yaml <<EOF
|
||||
namespace: istio-system
|
||||
bases:
|
||||
- https://github.com/fluxcd/flagger/kustomize/kubernetes?ref=main
|
||||
patches:
|
||||
- target:
|
||||
kind: Deployment
|
||||
name: flagger
|
||||
patch: |-
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: flagger
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: flagger
|
||||
args:
|
||||
- -mesh-provider=istio
|
||||
- -metrics-server=http://prometheus.istio-system:9090
|
||||
- -include-label-prefix=app.kubernetes.io
|
||||
EOF
|
||||
```
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 4
|
||||
title: Tutorials
|
||||
---
|
|
@ -0,0 +1,438 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 4
|
||||
title: App Mesh Canary Deployments
|
||||
---
|
||||
|
||||
This guide shows you how to use App Mesh and Flagger to automate canary deployments.
|
||||
You'll need an EKS cluster (Kubernetes >= 1.16) configured with App Mesh,
|
||||
you can find the installation guide [here](/flagger/install/flagger-install-on-eks-appmesh).
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
|
||||
then creates a series of objects (Kubernetes deployments, ClusterIP services,
|
||||
App Mesh virtual nodes and services).
|
||||
These objects expose the application on the mesh and drive the canary analysis and promotion.
|
||||
The only App Mesh object you need to create by yourself is the mesh resource.
|
||||
|
||||
Create a mesh called `global`:
|
||||
|
||||
```bash
|
||||
cat << EOF | kubectl apply -f -
|
||||
apiVersion: appmesh.k8s.aws/v1beta2
|
||||
kind: Mesh
|
||||
metadata:
|
||||
name: global
|
||||
spec:
|
||||
namespaceSelector:
|
||||
matchLabels:
|
||||
appmesh.k8s.aws/sidecarInjectorWebhook: enabled
|
||||
EOF
|
||||
```
|
||||
|
||||
Create a test namespace with App Mesh sidecar injection enabled:
|
||||
|
||||
```bash
|
||||
cat << EOF | kubectl apply -f -
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: test
|
||||
labels:
|
||||
appmesh.k8s.aws/sidecarInjectorWebhook: enabled
|
||||
EOF
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger-loadtester flagger/loadtester \
|
||||
--namespace=test \
|
||||
--set appmesh.enabled=true \
|
||||
--set "appmesh.backends[0]=podinfo" \
|
||||
--set "appmesh.backends[1]=podinfo-canary"
|
||||
```
|
||||
|
||||
Create a canary definition:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
annotations:
|
||||
# Enable Envoy access logging to stdout.
|
||||
appmesh.flagger.app/accesslog: enabled
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# App Mesh API reference
|
||||
provider: appmesh:v1beta2
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
# container port
|
||||
port: 9898
|
||||
# App Mesh ingress timeout (optional)
|
||||
timeout: 15s
|
||||
# App Mesh retry policy (optional)
|
||||
retries:
|
||||
attempts: 3
|
||||
perTryTimeout: 5s
|
||||
retryOn: "gateway-error,client-error,stream-error"
|
||||
# App Mesh URI settings
|
||||
match:
|
||||
- uri:
|
||||
prefix: /
|
||||
rewrite:
|
||||
uri: /
|
||||
# define the canary analysis timing and KPIs
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
# App Mesh Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
# testing (optional)
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary.test:9898/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated Kubernetes objects
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
|
||||
# generated App Mesh objects
|
||||
virtualnode.appmesh.k8s.aws/podinfo-canary
|
||||
virtualnode.appmesh.k8s.aws/podinfo-primary
|
||||
virtualrouter.appmesh.k8s.aws/podinfo
|
||||
virtualrouter.appmesh.k8s.aws/podinfo-canary
|
||||
virtualservice.appmesh.k8s.aws/podinfo
|
||||
virtualservice.appmesh.k8s.aws/podinfo-canary
|
||||
```
|
||||
|
||||
After the boostrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test`
|
||||
will be routed to the primary pods.
|
||||
During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods.
|
||||
|
||||
App Mesh blocks all egress traffic by default.
|
||||
If your application needs to call another service, you have to create an App Mesh virtual service for it
|
||||
and add the virtual service name to the backend list.
|
||||
|
||||
```yaml
|
||||
service:
|
||||
port: 9898
|
||||
backends:
|
||||
- backend1
|
||||
- arn:aws:appmesh:eu-west-1:12345678910:mesh/my-mesh/virtualService/backend2
|
||||
```
|
||||
|
||||
## Setup App Mesh Gateway (optional)
|
||||
|
||||
In order to expose the podinfo app outside the mesh you can use the App Mesh Gateway.
|
||||
|
||||
Deploy the App Mesh Gateway behind an AWS NLB:
|
||||
|
||||
```bash
|
||||
helm upgrade -i appmesh-gateway eks/appmesh-gateway \
|
||||
--namespace test
|
||||
```
|
||||
|
||||
Find the gateway public address:
|
||||
|
||||
```bash
|
||||
export URL="http://$(kubectl -n test get svc/appmesh-gateway -ojson | jq -r ".status.loadBalancer.ingress[].hostname")"
|
||||
echo $URL
|
||||
```
|
||||
|
||||
Wait for the NLB to become active:
|
||||
|
||||
```bash
|
||||
watch curl -sS $URL
|
||||
```
|
||||
|
||||
Create a gateway route that points to the podinfo virtual service:
|
||||
|
||||
```yaml
|
||||
cat << EOF | kubectl apply -f -
|
||||
apiVersion: appmesh.k8s.aws/v1beta2
|
||||
kind: GatewayRoute
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
httpRoute:
|
||||
match:
|
||||
prefix: "/"
|
||||
action:
|
||||
target:
|
||||
virtualService:
|
||||
virtualServiceRef:
|
||||
name: podinfo
|
||||
EOF
|
||||
```
|
||||
|
||||
Open your browser and navigate to the ingress address to access podinfo UI.
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
A canary deployment is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment PodSpec (container image, command, ports, env, resources, etc)
|
||||
* ConfigMaps and Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Advance podinfo.test canary weight 25
|
||||
Advance podinfo.test canary weight 30
|
||||
Advance podinfo.test canary weight 35
|
||||
Advance podinfo.test canary weight 40
|
||||
Advance podinfo.test canary weight 45
|
||||
Advance podinfo.test canary weight 50
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Routing all traffic to primary
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
During the analysis the canary’s progress can be monitored with Grafana.
|
||||
The App Mesh dashboard URL is
|
||||
[http://localhost:3000/d/flagger-appmesh/appmesh-canary?refresh=10s&orgId=1&var-namespace=test&var-primary=podinfo-primary&var-canary=podinfo](http://localhost:3000/d/flagger-appmesh/appmesh-canary?refresh=10s&orgId=1&var-namespace=test&var-primary=podinfo-primary&var-canary=podinfo).
|
||||
|
||||

|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT
|
||||
test podinfo Progressing 15
|
||||
prod frontend Succeeded 0
|
||||
prod backend Failed 0
|
||||
```
|
||||
|
||||
If you’ve enabled the Slack notifications, you should receive the following messages:
|
||||
|
||||

|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors or high latency to test if Flagger pauses the rollout.
|
||||
|
||||
Trigger a canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it deploy/flagger-loadtester bash
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
hey -z 1m -c 5 -q 5 http://podinfo-canary.test:9898/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://podinfo-canary.test:9898/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary,
|
||||
the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n appmesh-system logs deploy/flagger -f | jq .msg
|
||||
|
||||
New revision detected! progressing canary analysis for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Halt podinfo.test advancement success rate 55.06% < 99%
|
||||
Halt podinfo.test advancement request duration 1.20s > 0.5s
|
||||
Halt podinfo.test advancement request duration 1.45s > 0.5s
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
If you’ve enabled the Slack notifications, you’ll receive a message if the progress deadline is exceeded,
|
||||
or if the analysis reached the maximum number of failed checks:
|
||||
|
||||

|
||||
|
||||
## A/B Testing
|
||||
|
||||
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions.
|
||||
In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users.
|
||||
This is particularly useful for frontend applications that require session affinity.
|
||||
|
||||

|
||||
|
||||
Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 5
|
||||
iterations: 10
|
||||
match:
|
||||
- headers:
|
||||
x-canary:
|
||||
exact: "insider"
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 -H 'X-Canary: insider' http://podinfo.test:9898/"
|
||||
```
|
||||
|
||||
The above configuration will run an analysis for ten minutes targeting users that have a `X-Canary: insider` header.
|
||||
|
||||
You can also use a HTTP cookie, to target all users with a `canary` cookie set to `insider` the match condition should be:
|
||||
|
||||
```yaml
|
||||
match:
|
||||
- headers:
|
||||
cookie:
|
||||
regex: "^(.*?;)?(canary=insider)(;.*)?$"
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 -H 'Cookie: canary=insider' http://podinfo.test:9898/"
|
||||
```
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts the A/B test:
|
||||
|
||||
```text
|
||||
kubectl -n appmesh-system logs deploy/flagger -f | jq .msg
|
||||
|
||||
New revision detected! progressing canary analysis for podinfo.test
|
||||
Advance podinfo.test canary iteration 1/10
|
||||
Advance podinfo.test canary iteration 2/10
|
||||
Advance podinfo.test canary iteration 3/10
|
||||
Advance podinfo.test canary iteration 4/10
|
||||
Advance podinfo.test canary iteration 5/10
|
||||
Advance podinfo.test canary iteration 6/10
|
||||
Advance podinfo.test canary iteration 7/10
|
||||
Advance podinfo.test canary iteration 8/10
|
||||
Advance podinfo.test canary iteration 9/10
|
||||
Advance podinfo.test canary iteration 10/10
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Routing all traffic to primary
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
The above procedure can be extended with
|
||||
[custom metrics](../usage/metrics.md) checks,
|
||||
[webhooks](../usage/webhooks.md),
|
||||
[manual promotion](../usage/webhooks.md#manual-gating) approval and
|
||||
[Slack or MS Teams](../usage/alerting.md) notifications.
|
|
@ -0,0 +1,431 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 5
|
||||
title: Contour Canary Deployments
|
||||
---
|
||||
|
||||
This guide shows you how to use [Contour](https://projectcontour.io/) ingress controller and Flagger to automate canary releases and A/B testing.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Contour **v1.0** or newer.
|
||||
|
||||
Install Contour on a cluster with LoadBalancer support:
|
||||
|
||||
```bash
|
||||
kubectl apply -f https://projectcontour.io/quickstart/contour.yaml
|
||||
```
|
||||
|
||||
The above command will deploy Contour and an Envoy daemonset in the `projectcontour` namespace.
|
||||
|
||||
Install Flagger using Kustomize (kubectl 1.14) in the `projectcontour` namespace:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/contour?ref=main
|
||||
```
|
||||
|
||||
The above command will deploy Flagger and Prometheus configured to scrape the Contour's Envoy instances.
|
||||
|
||||
Or you can install Flagger using Helm v3:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace projectcontour \
|
||||
--set meshProvider=contour \
|
||||
--set ingressClass=contour \
|
||||
--set prometheus.install=true
|
||||
```
|
||||
|
||||
You can also enable Slack, Discord, Rocket or MS Teams notifications, see the alerting [docs](../usage/alerting.md).
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services and Contour HTTPProxy\). These objects expose the application in the cluster and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
```
|
||||
|
||||
Install the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Create a canary custom resource \(replace `app.example.com` with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# HPA reference
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
# service port
|
||||
port: 80
|
||||
# container port
|
||||
targetPort: 9898
|
||||
# Contour request timeout
|
||||
timeout: 15s
|
||||
# Contour retry policy
|
||||
retries:
|
||||
attempts: 3
|
||||
perTryTimeout: 5s
|
||||
# define the canary analysis timing and KPIs
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 30s
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
# Contour Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
# maximum req duration P99 in milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
# testing
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary.test/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
type: rollout
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 -host app.example.com http://envoy.projectcontour"
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute.
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
httpproxy.projectcontour.io/podinfo
|
||||
```
|
||||
|
||||
After the boostrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods.
|
||||
|
||||
## Expose the app outside the cluster
|
||||
|
||||
Find the external address of Contour's Envoy load balancer:
|
||||
|
||||
```bash
|
||||
export ADDRESS="$(kubectl -n projectcontour get svc/envoy -ojson \
|
||||
| jq -r ".status.loadBalancer.ingress[].hostname")"
|
||||
echo $ADDRESS
|
||||
```
|
||||
|
||||
Configure your DNS server with a CNAME record \(AWS\) or A record \(GKE/AKS/DOKS\) and point a domain e.g. `app.example.com` to the LB address.
|
||||
|
||||
Create a HTTPProxy definition and include the podinfo proxy generated by Flagger \(replace `app.example.com` with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: projectcontour.io/v1
|
||||
kind: HTTPProxy
|
||||
metadata:
|
||||
name: podinfo-ingress
|
||||
namespace: test
|
||||
spec:
|
||||
virtualhost:
|
||||
fqdn: app.example.com
|
||||
includes:
|
||||
- name: podinfo
|
||||
namespace: test
|
||||
conditions:
|
||||
- prefix: /
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-ingress.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-ingress.yaml
|
||||
```
|
||||
|
||||
Verify that Contour processed the proxy definition with:
|
||||
|
||||
```bash
|
||||
kubectl -n test get httpproxies
|
||||
|
||||
NAME FQDN STATUS
|
||||
podinfo valid
|
||||
podinfo-ingress app.example.com valid
|
||||
```
|
||||
|
||||
Now you can access podinfo UI using your domain address.
|
||||
|
||||
Note that you should be using HTTPS when exposing production workloads on internet. You can obtain free TLS certs from Let's Encrypt, read this [guide](https://github.com/stefanprodan/eks-contour-ingress) on how to configure cert-manager to secure Contour with TLS certificates.
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted.
|
||||
|
||||

|
||||
|
||||
A canary deployment is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
|
||||
* ConfigMaps and Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Advance podinfo.test canary weight 25
|
||||
Advance podinfo.test canary weight 30
|
||||
Advance podinfo.test canary weight 35
|
||||
Advance podinfo.test canary weight 40
|
||||
Advance podinfo.test canary weight 45
|
||||
Advance podinfo.test canary weight 50
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Routing all traffic to primary
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2019-12-20T14:05:07Z
|
||||
```
|
||||
|
||||
If you’ve enabled the Slack notifications, you should receive the following messages:
|
||||
|
||||

|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors or high latency to test if Flagger pauses the rollout.
|
||||
|
||||
Trigger a canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it deploy/flagger-loadtester bash
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
hey -z 1m -c 5 -q 5 http://app.example.com/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://app.example.com/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n projectcontour logs deploy/flagger -f | jq .msg
|
||||
|
||||
New revision detected! progressing canary analysis for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Halt podinfo.test advancement success rate 55.06% < 99%
|
||||
Halt podinfo.test advancement request duration 1.20s > 500ms
|
||||
Halt podinfo.test advancement request duration 1.45s > 500ms
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
If you’ve enabled the Slack notifications, you’ll receive a message if the progress deadline is exceeded, or if the analysis reached the maximum number of failed checks:
|
||||
|
||||

|
||||
|
||||
## A/B Testing
|
||||
|
||||
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity.
|
||||
|
||||

|
||||
|
||||
Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 5
|
||||
iterations: 10
|
||||
match:
|
||||
- headers:
|
||||
x-canary:
|
||||
exact: "insider"
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 5 -c 5 -H 'X-Canary: insider' -host app.example.com http://envoy.projectcontour"
|
||||
```
|
||||
|
||||
The above configuration will run an analysis for ten minutes targeting users that have a `X-Canary: insider` header.
|
||||
|
||||
You can also use a HTTP cookie. To target all users with a cookie set to `insider`, the match condition should be:
|
||||
|
||||
```yaml
|
||||
match:
|
||||
- headers:
|
||||
cookie:
|
||||
suffix: "insider"
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 5 -c 5 -H 'Cookie: canary=insider' -host app.example.com http://envoy.projectcontour"
|
||||
```
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts the A/B test:
|
||||
|
||||
```text
|
||||
kubectl -n projectcontour logs deploy/flagger -f | jq .msg
|
||||
|
||||
New revision detected! Progressing canary analysis for podinfo.test
|
||||
Advance podinfo.test canary iteration 1/10
|
||||
Advance podinfo.test canary iteration 2/10
|
||||
Advance podinfo.test canary iteration 3/10
|
||||
Advance podinfo.test canary iteration 4/10
|
||||
Advance podinfo.test canary iteration 5/10
|
||||
Advance podinfo.test canary iteration 6/10
|
||||
Advance podinfo.test canary iteration 7/10
|
||||
Advance podinfo.test canary iteration 8/10
|
||||
Advance podinfo.test canary iteration 9/10
|
||||
Advance podinfo.test canary iteration 10/10
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Routing all traffic to primary
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
The web browser user agent header allows user segmentation based on device or OS.
|
||||
|
||||
For example, if you want to route all mobile users to the canary instance:
|
||||
|
||||
```yaml
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
prefix: "Mobile"
|
||||
```
|
||||
|
||||
Or if you want to target only Android users:
|
||||
|
||||
```yaml
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
prefix: "Android"
|
||||
```
|
||||
|
||||
Or a specific browser version:
|
||||
|
||||
```yaml
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
suffix: "Firefox/71.0"
|
||||
```
|
||||
|
||||
For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md).
|
||||
|
|
@ -0,0 +1,488 @@
|
|||
---
|
||||
weight: 11
|
||||
title: Gateway API Canary Deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide shows you how to use [Gateway API](https://gateway-api.sigs.k8s.io/) and Flagger to automate canary deployments and A/B testing.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and any mesh/ingress that implements the `v1alpha2` of Gateway API. We'll be using Contour for the sake of this tutorial, but you can use any other implementation.
|
||||
|
||||
Install the GatewayAPI CRDs:
|
||||
|
||||
```bash
|
||||
kubectl apply -k github.com/kubernetes-sigs/gateway-api/config/crd?ref=v0.4.1
|
||||
```
|
||||
|
||||
Install a cluster-wide GatewayClass; a Gateway belonging to the GatewayClass and Contour components in the `projectcontour` namespace:
|
||||
|
||||
```bash
|
||||
kubectl apply -f https://raw.githubusercontent.com/projectcontour/contour/release-1.20/examples/render/contour-gateway.yaml
|
||||
```
|
||||
|
||||
Install Flagger in the `flagger-system` namespace:
|
||||
|
||||
```bash
|
||||
kubectl apply -k github.com/fluxcd/flagger//kustomize/gatewayapi
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services, HTTPRoutes for the Gateway\). These objects expose the application inside the mesh and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create metric templates targeting the Prometheus server in the `flagger-system` namespace. The PromQL queries below are meant for `Envoy`, but you can [change it to your ingress/mesh provider](/flagger/faq#metrics) accordingly.
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: latency
|
||||
namespace: flagger-system
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://flagger-prometheus:9090
|
||||
query: |
|
||||
histogram_quantile(0.99,
|
||||
sum(
|
||||
rate(
|
||||
envoy_cluster_upstream_rq_time_bucket{
|
||||
envoy_cluster_name=~"{{ namespace }}_{{ target }}-canary_[0-9a-zA-Z-]+",
|
||||
}[{{ interval }}]
|
||||
)
|
||||
) by (le)
|
||||
)/1000
|
||||
---
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: error-rate
|
||||
namespace: flagger-system
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://flagger-prometheus:9090
|
||||
query: |
|
||||
100 - sum(
|
||||
rate(
|
||||
envoy_cluster_upstream_rq{
|
||||
envoy_cluster_name=~"{{ namespace }}_{{ target }}-canary_[0-9a-zA-Z-]+",
|
||||
envoy_response_code!~"5.*"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
envoy_cluster_upstream_rq{
|
||||
envoy_cluster_name=~"{{ namespace }}_{{ target }}-canary_[0-9a-zA-Z-]+",
|
||||
}[{{ interval }}]
|
||||
)
|
||||
)
|
||||
* 100
|
||||
```
|
||||
|
||||
Save the above resource as metric-templates.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f metric-templates.yaml
|
||||
```
|
||||
|
||||
Create a canary custom resource \(replace "loaclproject.contour.io" with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
# service port number
|
||||
port: 9898
|
||||
# container port number or name (optional)
|
||||
targetPort: 9898
|
||||
# Gateway API HTTPRoute host names
|
||||
hosts:
|
||||
- localproject.contour.io
|
||||
# Reference to the Gateway that the generated HTTPRoute would attach to.
|
||||
gatewayRefs:
|
||||
- name: contour
|
||||
namespace: projectcontour
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 10
|
||||
metrics:
|
||||
- name: error-rate
|
||||
# max error rate (5xx responses)
|
||||
# percentage (0-100)
|
||||
templateRef:
|
||||
name: error-rate
|
||||
namespace: flagger-system
|
||||
thresholdRange:
|
||||
max: 1
|
||||
interval: 1m
|
||||
- name: latency
|
||||
templateRef:
|
||||
name: latency
|
||||
namespace: flagger-system
|
||||
# seconds
|
||||
thresholdRange:
|
||||
max: 0.5
|
||||
interval: 30s
|
||||
# testing (optional)
|
||||
webhooks:
|
||||
- name: smoke-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 15s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'anon' http://podinfo-canary.test:9898/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 2m -q 10 -c 2 -host localproject.contour.io http://envoy.projectcontour/"
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every minute.
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
httproutes.gateway.networking.k8s.io/podinfo
|
||||
```
|
||||
|
||||
## Expose the app outside the cluster
|
||||
|
||||
Find the external address of Contour's Envoy load balancer:
|
||||
|
||||
```bash
|
||||
export ADDRESS="$(kubectl -n projectcontour get svc/envoy -ojson \
|
||||
| jq -r ".status.loadBalancer.ingress[].hostname")"
|
||||
echo $ADDRESS
|
||||
```
|
||||
|
||||
Configure your DNS server with a CNAME record \(AWS\) or A record \(GKE/AKS/DOKS\) and point a domain e.g. `localproject.contour.io` to the LB address.
|
||||
|
||||
Now you can access the podinfo UI using your domain address.
|
||||
|
||||
Note that you should be using HTTPS when exposing production workloads on internet. You can obtain free TLS certs from Let's Encrypt, read this [guide](https://github.com/stefanprodan/eks-contour-ingress) on how to configure cert-manager to secure Contour with TLS certificates.
|
||||
|
||||
If you're using a local cluster via kind/k3s you can port forward the Envoy LoadBalancer service:
|
||||
```bash
|
||||
kubectl port-forward -n projectcontour svc/envoy 8080:80
|
||||
```
|
||||
|
||||
Now you can access podinfo via `curl -H "Host: localproject.contour.io" localhost:8080`
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger New revision detected podinfo.test
|
||||
Normal Synced 3m flagger Scaling up podinfo.test
|
||||
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 5
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 15
|
||||
Normal Synced 2m flagger Advance podinfo.test canary weight 20
|
||||
Normal Synced 2m flagger Advance podinfo.test canary weight 25
|
||||
Normal Synced 1m flagger Advance podinfo.test canary weight 30
|
||||
Normal Synced 1m flagger Advance podinfo.test canary weight 35
|
||||
Normal Synced 55s flagger Advance podinfo.test canary weight 40
|
||||
Normal Synced 45s flagger Advance podinfo.test canary weight 45
|
||||
Normal Synced 35s flagger Advance podinfo.test canary weight 50
|
||||
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
|
||||
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
A canary deployment is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
|
||||
* ConfigMaps mounted as volumes or mapped to environment variables
|
||||
* Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
You can monitor how Flagger progressively changes the weights of the HTTPRoute object that is attahed to the Gateway with:
|
||||
|
||||
```bash
|
||||
watch kubectl get httproute -n test podinfo -o=jsonpath='{.spec.rules}'
|
||||
```
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2022-01-16T14:05:07Z
|
||||
prod frontend Succeeded 0 2022-01-15T16:15:07Z
|
||||
prod backend Failed 0 2022-01-14T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses the rollout.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xx-xx sh
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
watch curl http://podinfo-canary:9898/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch curl http://podinfo-canary:9898/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 10
|
||||
Phase: Failed
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger Starting canary deployment for podinfo.test
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 5
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 15
|
||||
Normal Synced 3m flagger Halt podinfo.test advancement error rate 69.17% > 1%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement error rate 61.39% > 1%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement error rate 55.06% > 1%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement error rate 47.00% > 1%
|
||||
Normal Synced 2m flagger (combined from similar events): Halt podinfo.test advancement error rate 38.08% > 1%
|
||||
Warning Synced 1m flagger Rolling back podinfo.test failed checks threshold reached 10
|
||||
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
# A/B Testing
|
||||
|
||||
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity.
|
||||
|
||||

|
||||
|
||||
Create a canary custom resource \(replace "loaclproject.contour.io" with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
# service port number
|
||||
port: 9898
|
||||
# container port number or name (optional)
|
||||
targetPort: 9898
|
||||
# Gateway API HTTPRoute host names
|
||||
hosts:
|
||||
- localproject.contour.io
|
||||
# Reference to the Gateway that the generated HTTPRoute would attach to.
|
||||
gatewayRefs:
|
||||
- name: contour
|
||||
namespace: projectcontour
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 10
|
||||
metrics:
|
||||
- name: error-rate
|
||||
# max error rate (5xx responses)
|
||||
# percentage (0-100)
|
||||
templateRef:
|
||||
name: error-rate
|
||||
namespace: flagger-system
|
||||
thresholdRange:
|
||||
max: 1
|
||||
interval: 1m
|
||||
- name: latency
|
||||
templateRef:
|
||||
name: latency
|
||||
namespace: flagger-system
|
||||
# seconds
|
||||
thresholdRange:
|
||||
max: 0.5
|
||||
interval: 30s
|
||||
# testing (optional)
|
||||
webhooks:
|
||||
- name: smoke-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 15s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'anon' http://podinfo-canary.test:9898/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 2m -q 10 -c 2 -host localproject.contour.io -H 'X-Canary: insider' http://envoy.projectcontour/"
|
||||
```
|
||||
|
||||
The above configuration will run an analysis for ten minutes targeting those users that have an insider cookie.
|
||||
|
||||
Save the above resource as podinfo-ab-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-ab-canary.yaml
|
||||
```
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:6.0.3
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/abtest
|
||||
|
||||
Status:
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger New revision detected podinfo.test
|
||||
Normal Synced 3m flagger Scaling up podinfo.test
|
||||
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
|
||||
Normal Synced 2m flagger Advance podinfo.test canary iteration 4/10
|
||||
Normal Synced 2m flagger Advance podinfo.test canary iteration 5/10
|
||||
Normal Synced 1m flagger Advance podinfo.test canary iteration 6/10
|
||||
Normal Synced 1m flagger Advance podinfo.test canary iteration 7/10
|
||||
Normal Synced 55s flagger Advance podinfo.test canary iteration 8/10
|
||||
Normal Synced 45s flagger Advance podinfo.test canary iteration 9/10
|
||||
Normal Synced 35s flagger Advance podinfo.test canary iteration 10/10
|
||||
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
|
||||
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
|
||||
The above procedures can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.
|
|
@ -0,0 +1,488 @@
|
|||
---
|
||||
weight: 5
|
||||
title: Gloo Canary Deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide shows you how to use the [Gloo Edge](https://gloo.solo.io/) ingress controller
|
||||
and Flagger to automate canary releases and A/B testing.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Gloo Edge ingress **1.6.0** or newer.
|
||||
|
||||
This guide was written for Flagger version **1.6.0** or higher. Prior versions of Flagger
|
||||
used Gloo `UpstreamGroup`s to handle canaries, but newer versions of Flagger use Gloo
|
||||
`RouteTable`s to handle canaries as well as A/B testing.
|
||||
|
||||
Install Gloo with Helm v3:
|
||||
|
||||
```bash
|
||||
helm repo add gloo https://storage.googleapis.com/solo-public-helm
|
||||
kubectl create ns gloo-system
|
||||
helm upgrade -i gloo gloo/gloo \
|
||||
--namespace gloo-system
|
||||
```
|
||||
|
||||
Install Flagger and the Prometheus add-on in the same namespace as Gloo:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace gloo-system \
|
||||
--set prometheus.install=true \
|
||||
--set meshProvider=gloo
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
|
||||
then creates a series of objects (Kubernetes deployments, ClusterIP services, Gloo route tables and upstreams).
|
||||
These objects expose the application outside the cluster and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl -n test apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl -n test apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create a virtual service definition that references a route table that will be generated by Flagger
|
||||
(replace `app.example.com` with your own domain):
|
||||
|
||||
```yaml
|
||||
apiVersion: gateway.solo.io/v1
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
virtualHost:
|
||||
domains:
|
||||
- 'app.example.com'
|
||||
routes:
|
||||
- matchers:
|
||||
- prefix: /
|
||||
delegateAction:
|
||||
ref:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-virtualservice.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-virtualservice.yaml
|
||||
```
|
||||
|
||||
Create a canary custom resource (replace `app.example.com` with your own domain):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# upstreamRef (optional)
|
||||
# defines an upstream to copy the spec from when flagger generates new upstreams.
|
||||
# necessary to copy over TLS config, circuit breakers, etc. (anything nonstandard)
|
||||
# upstreamRef:
|
||||
# apiVersion: gloo.solo.io/v1
|
||||
# kind: Upstream
|
||||
# name: podinfo-upstream
|
||||
# namespace: gloo-system
|
||||
provider: gloo
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
# ClusterIP port number
|
||||
port: 9898
|
||||
# container port number or name (optional)
|
||||
targetPort: 9898
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 10s
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
# Gloo Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
# testing (optional)
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 10s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "hey -z 2m -q 5 -c 2 -host app.example.com http://gateway-proxy.gloo-system"
|
||||
```
|
||||
|
||||
*Note: when using upstreamRef the following fields are copied over from the original upstream: `Labels, SslConfig, CircuitBreakers, ConnectionConfig, UseHttp2, InitialStreamWindowSize`*
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
virtualservices.gateway.solo.io/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
routetables.gateway.solo.io/podinfo
|
||||
upstreams.gloo.solo.io/test-podinfo-canaryupstream-9898
|
||||
upstreams.gloo.solo.io/test-podinfo-primaryupstream-9898
|
||||
```
|
||||
|
||||
When the bootstrap finishes Flagger will set the canary status to initialized:
|
||||
|
||||
```bash
|
||||
kubectl -n test get canary podinfo
|
||||
|
||||
NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
podinfo Initialized 0 2019-05-17T08:09:51Z
|
||||
```
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring
|
||||
key performance indicators like HTTP requests success rate, requests average duration and pod health.
|
||||
Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack.
|
||||
|
||||

|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger New revision detected podinfo.test
|
||||
Normal Synced 3m flagger Scaling up podinfo.test
|
||||
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 5
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 15
|
||||
Normal Synced 2m flagger Advance podinfo.test canary weight 20
|
||||
Normal Synced 2m flagger Advance podinfo.test canary weight 25
|
||||
Normal Synced 1m flagger Advance podinfo.test canary weight 30
|
||||
Normal Synced 1m flagger Advance podinfo.test canary weight 35
|
||||
Normal Synced 55s flagger Advance podinfo.test canary weight 40
|
||||
Normal Synced 45s flagger Advance podinfo.test canary weight 45
|
||||
Normal Synced 35s flagger Advance podinfo.test canary weight 50
|
||||
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
|
||||
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2019-05-17T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-05-17T16:15:07Z
|
||||
prod backend Failed 0 2019-05-17T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors and high latency to test if
|
||||
Flagger pauses and rolls back the faulted version.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
watch curl -H 'Host: app.example.com' http://gateway-proxy.gloo-system/status/500
|
||||
```
|
||||
|
||||
Generate high latency:
|
||||
|
||||
```bash
|
||||
watch curl -H 'Host: app.example.com' http://gateway-proxy.gloo-system/delay/2
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary,
|
||||
the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 10
|
||||
Phase: Failed
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger Starting canary deployment for podinfo.test
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 5
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 15
|
||||
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 55.06% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 47.00% < 99%
|
||||
Normal Synced 2m flagger (combined from similar events): Halt podinfo.test advancement success rate 38.08% < 99%
|
||||
Warning Synced 1m flagger Rolling back podinfo.test failed checks threshold reached 10
|
||||
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
## Custom metrics
|
||||
|
||||
The canary analysis can be extended with Prometheus queries.
|
||||
|
||||
The demo app is instrumented with Prometheus so you can create a custom check that will use the HTTP request
|
||||
duration histogram to validate the canary.
|
||||
|
||||
Create a metric template and apply it on the cluster:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: not-found-percentage
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://flagger-prometheus.gloo-system:9090
|
||||
query: |
|
||||
100 - sum(
|
||||
rate(
|
||||
http_request_duration_seconds_count{
|
||||
kubernetes_namespace="{{ namespace }}",
|
||||
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
|
||||
status!="{{ interval }}"
|
||||
}[1m]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
http_request_duration_seconds_count{
|
||||
kubernetes_namespace="{{ namespace }}",
|
||||
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
) * 100
|
||||
```
|
||||
|
||||
Edit the canary analysis and add the following metric:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "404s percentage"
|
||||
templateRef:
|
||||
name: not-found-percentage
|
||||
thresholdRange:
|
||||
max: 5
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
The above configuration validates the canary by checking if the HTTP 404 req/sec percentage
|
||||
is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
|
||||
```
|
||||
|
||||
Generate 404s:
|
||||
|
||||
```bash
|
||||
watch curl -H 'Host: app.example.com' http://gateway-proxy.gloo-system/status/404
|
||||
```
|
||||
|
||||
Watch Flagger logs:
|
||||
|
||||
```text
|
||||
kubectl -n gloo-system logs deployment/flagger -f | jq .msg
|
||||
|
||||
Starting canary deployment for podinfo.test
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement 404s percentage 6.20 > 5
|
||||
Halt podinfo.test advancement 404s percentage 6.45 > 5
|
||||
Halt podinfo.test advancement 404s percentage 7.60 > 5
|
||||
Halt podinfo.test advancement 404s percentage 8.69 > 5
|
||||
Halt podinfo.test advancement 404s percentage 9.70 > 5
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
If you have [alerting](../usage/alerting.md) configured,
|
||||
Flagger will send a notification with the reason why the canary failed.
|
||||
|
||||
## A/B Testing
|
||||
|
||||
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions.
|
||||
In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users.
|
||||
This is particularly useful for frontend applications that require session affinity.
|
||||
|
||||

|
||||
|
||||
Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 5
|
||||
iterations: 10
|
||||
match:
|
||||
- headers:
|
||||
x-canary:
|
||||
exact: "insider"
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 5 -c 5 -H 'X-Canary: insider' -host app.example.com http://gateway-proxy.gloo-system"
|
||||
```
|
||||
|
||||
The above configuration will run an analysis for ten minutes targeting users that have a `X-Canary: insider` header.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.4
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts the A/B test:
|
||||
|
||||
```text
|
||||
kubectl -n gloo-system logs deploy/flagger -f | jq .msg
|
||||
|
||||
New revision detected! Progressing canary analysis for podinfo.test
|
||||
Advance podinfo.test canary iteration 1/10
|
||||
Advance podinfo.test canary iteration 2/10
|
||||
Advance podinfo.test canary iteration 3/10
|
||||
Advance podinfo.test canary iteration 4/10
|
||||
Advance podinfo.test canary iteration 5/10
|
||||
Advance podinfo.test canary iteration 6/10
|
||||
Advance podinfo.test canary iteration 7/10
|
||||
Advance podinfo.test canary iteration 8/10
|
||||
Advance podinfo.test canary iteration 9/10
|
||||
Advance podinfo.test canary iteration 10/10
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Routing all traffic to primary
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
The web browser user agent header allows user segmentation based on device or OS.
|
||||
|
||||
For example, if you want to route all mobile users to the canary instance:
|
||||
|
||||
```yaml
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
regex: ".*Mobile.*"
|
||||
```
|
||||
|
||||
Or if you want to target only Android users:
|
||||
|
||||
```yaml
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
regex: ".*Android.*"
|
||||
```
|
||||
|
||||
Or a specific browser version:
|
||||
|
||||
```yaml
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
regex: ".*Firefox.*"
|
||||
```
|
||||
|
||||
For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md).
|
|
@ -0,0 +1,263 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 2
|
||||
title: Istio A/B Testing
|
||||
---
|
||||
|
||||
This guide shows you how to automate A/B testing with Istio and Flagger.
|
||||
|
||||
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Istio **v1.0** or newer.
|
||||
|
||||
Install Istio with telemetry support and Prometheus:
|
||||
|
||||
```bash
|
||||
istioctl manifest install --set profile=default
|
||||
|
||||
kubectl apply -f https://raw.githubusercontent.com/istio/istio/release-1.8/samples/addons/prometheus.yaml
|
||||
```
|
||||
|
||||
Install Flagger in the `istio-system` namespace:
|
||||
|
||||
```bash
|
||||
kubectl apply -k github.com/fluxcd/flagger//kustomize/istio
|
||||
```
|
||||
|
||||
Create an ingress gateway to expose the demo app outside of the mesh:
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: Gateway
|
||||
metadata:
|
||||
name: public-gateway
|
||||
namespace: istio-system
|
||||
spec:
|
||||
selector:
|
||||
istio: ingressgateway
|
||||
servers:
|
||||
- port:
|
||||
number: 80
|
||||
name: http
|
||||
protocol: HTTP
|
||||
hosts:
|
||||
- "*"
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Create a test namespace with Istio sidecar injection enabled:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
kubectl label namespace test istio-injection=enabled
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create a canary custom resource \(replace example.com with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
# container port
|
||||
port: 9898
|
||||
# Istio gateways (optional)
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
# Istio virtual service host names (optional)
|
||||
hosts:
|
||||
- app.example.com
|
||||
# Istio traffic policy (optional)
|
||||
trafficPolicy:
|
||||
tls:
|
||||
# use ISTIO_MUTUAL when mTLS is enabled
|
||||
mode: DISABLE
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# total number of iterations
|
||||
iterations: 10
|
||||
# max number of failed iterations before rollback
|
||||
threshold: 2
|
||||
# canary match condition
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
regex: ".*Firefox.*"
|
||||
- headers:
|
||||
cookie:
|
||||
regex: "^(.*?;)?(type=insider)(;.*)?$"
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
# generate traffic during analysis
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 15s
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 -H 'Cookie: type=insider' http://podinfo.test:9898/"
|
||||
```
|
||||
|
||||
**Note** that when using Istio 1.5 you have to replace the `request-duration` with a [metric template](/flagger/dev/upgrade-guide#istio-telemetry-v2).
|
||||
|
||||
The above configuration will run an analysis for ten minutes targeting Firefox users and those that have an insider cookie.
|
||||
|
||||
Save the above resource as podinfo-abtest.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-abtest.yaml
|
||||
```
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
destinationrule.networking.istio.io/podinfo-canary
|
||||
destinationrule.networking.istio.io/podinfo-primary
|
||||
virtualservice.networking.istio.io/podinfo
|
||||
```
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/abtest
|
||||
|
||||
Status:
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger New revision detected podinfo.test
|
||||
Normal Synced 3m flagger Scaling up podinfo.test
|
||||
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
|
||||
Normal Synced 2m flagger Advance podinfo.test canary iteration 4/10
|
||||
Normal Synced 2m flagger Advance podinfo.test canary iteration 5/10
|
||||
Normal Synced 1m flagger Advance podinfo.test canary iteration 6/10
|
||||
Normal Synced 1m flagger Advance podinfo.test canary iteration 7/10
|
||||
Normal Synced 55s flagger Advance podinfo.test canary iteration 8/10
|
||||
Normal Synced 45s flagger Advance podinfo.test canary iteration 9/10
|
||||
Normal Synced 35s flagger Advance podinfo.test canary iteration 10/10
|
||||
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
|
||||
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 100 2019-03-16T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-03-15T16:15:07Z
|
||||
prod backend Failed 0 2019-03-14T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors and high latency to test Flagger's rollback.
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
watch curl -b 'type=insider' http://app.example.com/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch curl -b 'type=insider' http://app.example.com/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Failed Checks: 2
|
||||
Phase: Failed
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger Starting canary deployment for podinfo.test
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
|
||||
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Warning Synced 2m flagger Rolling back podinfo.test failed checks threshold reached 2
|
||||
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.
|
||||
|
|
@ -0,0 +1,374 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 1
|
||||
title: Istio Canary Deployments
|
||||
---
|
||||
|
||||
This guide shows you how to use Istio and Flagger to automate canary deployments.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Istio **v1.5** or newer.
|
||||
|
||||
Install Istio with telemetry support and Prometheus:
|
||||
|
||||
```bash
|
||||
istioctl manifest install --set profile=default
|
||||
|
||||
# Suggestion: Please change release-1.8 in below command, to your real istio version.
|
||||
kubectl apply -f https://raw.githubusercontent.com/istio/istio/release-1.8/samples/addons/prometheus.yaml
|
||||
```
|
||||
|
||||
Install Flagger in the `istio-system` namespace:
|
||||
|
||||
```bash
|
||||
kubectl apply -k github.com/fluxcd/flagger//kustomize/istio
|
||||
```
|
||||
|
||||
Create an ingress gateway to expose the demo app outside of the mesh:
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: Gateway
|
||||
metadata:
|
||||
name: public-gateway
|
||||
namespace: istio-system
|
||||
spec:
|
||||
selector:
|
||||
istio: ingressgateway
|
||||
servers:
|
||||
- port:
|
||||
number: 80
|
||||
name: http
|
||||
protocol: HTTP
|
||||
hosts:
|
||||
- "*"
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services, Istio destination rules and virtual services\). These objects expose the application inside the mesh and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace with Istio sidecar injection enabled:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
kubectl label namespace test istio-injection=enabled
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create a canary custom resource \(replace example.com with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
# service port number
|
||||
port: 9898
|
||||
# container port number or name (optional)
|
||||
targetPort: 9898
|
||||
# Istio gateways (optional)
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
# Istio virtual service host names (optional)
|
||||
hosts:
|
||||
- app.example.com
|
||||
# Istio traffic policy (optional)
|
||||
trafficPolicy:
|
||||
tls:
|
||||
# use ISTIO_MUTUAL when mTLS is enabled
|
||||
mode: DISABLE
|
||||
# Istio retry policy (optional)
|
||||
retries:
|
||||
attempts: 3
|
||||
perTryTimeout: 1s
|
||||
retryOn: "gateway-error,connect-failure,refused-stream"
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 10
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
# testing (optional)
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
|
||||
```
|
||||
|
||||
**Note** that when using Istio 1.4 you have to replace the `request-duration` with a [metric template](/flagger/dev/upgrade-guide#istio-telemetry-v2).
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every minute.
|
||||
|
||||

|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
destinationrule.networking.istio.io/podinfo-canary
|
||||
destinationrule.networking.istio.io/podinfo-primary
|
||||
virtualservice.networking.istio.io/podinfo
|
||||
```
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger New revision detected podinfo.test
|
||||
Normal Synced 3m flagger Scaling up podinfo.test
|
||||
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 5
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 15
|
||||
Normal Synced 2m flagger Advance podinfo.test canary weight 20
|
||||
Normal Synced 2m flagger Advance podinfo.test canary weight 25
|
||||
Normal Synced 1m flagger Advance podinfo.test canary weight 30
|
||||
Normal Synced 1m flagger Advance podinfo.test canary weight 35
|
||||
Normal Synced 55s flagger Advance podinfo.test canary weight 40
|
||||
Normal Synced 45s flagger Advance podinfo.test canary weight 45
|
||||
Normal Synced 35s flagger Advance podinfo.test canary weight 50
|
||||
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
|
||||
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
A canary deployment is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
|
||||
* ConfigMaps mounted as volumes or mapped to environment variables
|
||||
* Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2019-01-16T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-01-15T16:15:07Z
|
||||
prod backend Failed 0 2019-01-14T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses the rollout.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xx-xx sh
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
watch curl http://podinfo-canary:9898/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch curl http://podinfo-canary:9898/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 10
|
||||
Phase: Failed
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger Starting canary deployment for podinfo.test
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 5
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 15
|
||||
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 55.06% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 47.00% < 99%
|
||||
Normal Synced 2m flagger (combined from similar events): Halt podinfo.test advancement success rate 38.08% < 99%
|
||||
Warning Synced 1m flagger Rolling back podinfo.test failed checks threshold reached 10
|
||||
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
## Traffic mirroring
|
||||
|
||||

|
||||
|
||||
For applications that perform read operations, Flagger can be configured to drive canary releases with traffic mirroring. Istio traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service. The response from the primary is sent back to the user and the response from the canary is discarded. Metrics are collected on both requests so that the deployment will only proceed if the canary metrics are within the threshold values.
|
||||
|
||||
Note that mirroring should be used for requests that are **idempotent** or capable of being processed twice \(once by the primary and once by the canary\).
|
||||
|
||||
You can enable mirroring by replacing `stepWeight/maxWeight` with `iterations` and by setting `analysis.mirror` to `true`:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
analysis:
|
||||
# schedule interval
|
||||
interval: 1m
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# total number of iterations
|
||||
iterations: 10
|
||||
# enable traffic shadowing
|
||||
mirror: true
|
||||
# weight of the traffic mirrored to your canary (defaults to 100%)
|
||||
mirrorWeight: 100
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 1m
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 http://podinfo.test:9898/"
|
||||
```
|
||||
|
||||
With the above configuration, Flagger will run a canary release with the following steps:
|
||||
|
||||
* detect new revision \(deployment spec, secrets or configmaps changes\)
|
||||
* scale from zero the canary deployment
|
||||
* wait for the HPA to set the canary minimum replicas
|
||||
* check canary pods health
|
||||
* run the acceptance tests
|
||||
* abort the canary release if tests fail
|
||||
* start the load tests
|
||||
* mirror 100% of the traffic from primary to canary
|
||||
* check request success rate and request duration every minute
|
||||
* abort the canary release if the metrics check failure threshold is reached
|
||||
* stop traffic mirroring after the number of iterations is reached
|
||||
* route live traffic to the canary pods
|
||||
* promote the canary \(update the primary secrets, configmaps and deployment spec\)
|
||||
* wait for the primary deployment rollout to finish
|
||||
* wait for the HPA to set the primary minimum replicas
|
||||
* check primary pods health
|
||||
* switch live traffic back to primary
|
||||
* scale to zero the canary
|
||||
* send notification with the canary analysis result
|
||||
|
||||
The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.
|
||||
|
|
@ -0,0 +1,376 @@
|
|||
---
|
||||
weight: 12
|
||||
title: Blue/Green Deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide shows you how to automate Blue/Green deployments with Flagger and Kubernetes.
|
||||
|
||||
For applications that are not deployed on a service mesh, Flagger can orchestrate Blue/Green style deployments with Kubernetes L4 networking. When using a service mesh blue/green can be used as specified [here](../usage/deployment-strategies.md).
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer.
|
||||
|
||||
Install Flagger and the Prometheus add-on:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace flagger \
|
||||
--set prometheus.install=true \
|
||||
--set meshProvider=kubernetes
|
||||
```
|
||||
|
||||
If you already have a Prometheus instance running in your cluster, you can point Flagger to the ClusterIP service with:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace flagger \
|
||||
--set metricsServer=http://prometheus.monitoring:9090
|
||||
```
|
||||
|
||||
Optionally you can enable Slack notifications:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--reuse-values \
|
||||
--namespace flagger \
|
||||
--set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
|
||||
--set slack.channel=general \
|
||||
--set slack.user=flagger
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployment and ClusterIP services\). These objects expose the application inside the cluster and drive the canary analysis and Blue/Green promotion.
|
||||
|
||||
Create a test namespace:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the analysis:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create a canary custom resource:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# service mesh provider can be: kubernetes, istio, appmesh, nginx, gloo
|
||||
provider: kubernetes
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
port: 9898
|
||||
portDiscovery: true
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 30s
|
||||
# max number of failed checks before rollback
|
||||
threshold: 2
|
||||
# number of checks to run before rollback
|
||||
iterations: 10
|
||||
# Prometheus checks based on
|
||||
# http_request_duration_seconds histogram
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
# acceptance/load testing hooks
|
||||
webhooks:
|
||||
- name: smoke-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 15s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'anon' http://podinfo-canary.test:9898/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
|
||||
```
|
||||
|
||||
The above configuration will run an analysis for five minutes.
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
```
|
||||
|
||||
Blue/Green scenario:
|
||||
|
||||
* on bootstrap, Flagger will create three ClusterIP services \(`app-primary`,`app-canary`, `app`\)
|
||||
|
||||
and a shadow deployment named `app-primary` that represents the blue version
|
||||
|
||||
* when a new version is detected, Flagger would scale up the green version and run the conformance tests
|
||||
|
||||
\(the tests should target the `app-canary` ClusterIP service to reach the green version\)
|
||||
|
||||
* if the conformance tests are passing, Flagger would start the load tests and validate them with custom Prometheus queries
|
||||
* if the load test analysis is successful, Flagger will promote the new version to `app-primary` and scale down the green version
|
||||
|
||||
## Automated Blue/Green promotion
|
||||
|
||||
Trigger a deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Events:
|
||||
|
||||
New revision detected podinfo.test
|
||||
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary iteration 1/10
|
||||
Advance podinfo.test canary iteration 2/10
|
||||
Advance podinfo.test canary iteration 3/10
|
||||
Advance podinfo.test canary iteration 4/10
|
||||
Advance podinfo.test canary iteration 5/10
|
||||
Advance podinfo.test canary iteration 6/10
|
||||
Advance podinfo.test canary iteration 7/10
|
||||
Advance podinfo.test canary iteration 8/10
|
||||
Advance podinfo.test canary iteration 9/10
|
||||
Advance podinfo.test canary iteration 10/10
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 100 2019-06-16T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-06-15T16:15:07Z
|
||||
prod backend Failed 0 2019-06-14T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the analysis you can generate HTTP 500 errors and high latency to test Flagger's rollback.
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xx-xx sh
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
watch curl http://podinfo-canary.test:9898/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch curl http://podinfo-canary.test:9898/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the analysis threshold, the green version is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Failed Checks: 2
|
||||
Phase: Failed
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger New revision detected podinfo.test
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
|
||||
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Warning Synced 2m flagger Rolling back podinfo.test failed checks threshold reached 2
|
||||
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
## Custom metrics
|
||||
|
||||
The analysis can be extended with Prometheus queries. The demo app is instrumented with Prometheus so you can create a custom check that will use the HTTP request duration histogram to validate the canary \(green version\).
|
||||
|
||||
Create a metric template and apply it on the cluster:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: not-found-percentage
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://flagger-prometheus.flagger:9090
|
||||
query: |
|
||||
100 - sum(
|
||||
rate(
|
||||
http_request_duration_seconds_count{
|
||||
kubernetes_namespace="{{ namespace }}",
|
||||
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
|
||||
status!="{{ interval }}"
|
||||
}[1m]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
http_request_duration_seconds_count{
|
||||
kubernetes_namespace="{{ namespace }}",
|
||||
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
) * 100
|
||||
```
|
||||
|
||||
Edit the canary analysis and add the following metric:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "404s percentage"
|
||||
templateRef:
|
||||
name: not-found-percentage
|
||||
thresholdRange:
|
||||
max: 5
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
The above configuration validates the canary \(green version\) by checking if the HTTP 404 req/sec percentage is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the rollout is rolled back.
|
||||
|
||||
Trigger a deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
|
||||
```
|
||||
|
||||
Generate 404s:
|
||||
|
||||
```bash
|
||||
watch curl http://podinfo-canary.test:9898/status/400
|
||||
```
|
||||
|
||||
Watch Flagger logs:
|
||||
|
||||
```text
|
||||
kubectl -n flagger logs deployment/flagger -f | jq .msg
|
||||
|
||||
New revision detected podinfo.test
|
||||
Scaling up podinfo.test
|
||||
Advance podinfo.test canary iteration 1/10
|
||||
Halt podinfo.test advancement 404s percentage 6.20 > 5
|
||||
Halt podinfo.test advancement 404s percentage 6.45 > 5
|
||||
Rolling back podinfo.test failed checks threshold reached 2
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
If you have [alerting](../usage/alerting.md) configured, Flagger will send a notification with the reason why the canary failed.
|
||||
|
||||
## Conformance Testing with Helm
|
||||
|
||||
Flagger comes with a testing service that can run Helm tests when configured as a pre-rollout webhook.
|
||||
|
||||
Deploy the Helm test runner in the `kube-system` namespace using the `tiller` service account:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
helm upgrade -i flagger-helmtester flagger/loadtester \
|
||||
--namespace=kube-system \
|
||||
--set serviceAccountName=tiller
|
||||
```
|
||||
|
||||
When deployed the Helm tester API will be available at `http://flagger-helmtester.kube-system/`.
|
||||
|
||||
Add a helm test pre-rollout hook to your chart:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "conformance testing"
|
||||
type: pre-rollout
|
||||
url: http://flagger-helmtester.kube-system/
|
||||
timeout: 3m
|
||||
metadata:
|
||||
type: "helm"
|
||||
cmd: "test {{ .Release.Name }} --cleanup"
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks. If the helm test fails, Flagger will retry until the analysis threshold is reached and the canary is rolled back.
|
||||
|
||||
For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md).
|
||||
|
|
@ -0,0 +1,256 @@
|
|||
---
|
||||
weight: 10
|
||||
title: Kuma Canary Deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide shows you how to use Kuma and Flagger to automate canary deployments.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Kuma **1.3** or newer.
|
||||
|
||||
Install Kuma and Prometheus (part of Kuma Metrics):
|
||||
|
||||
```bash
|
||||
kumactl install control-plane | kubectl apply -f -
|
||||
kumactl install metrics | kubectl apply -f -
|
||||
```
|
||||
|
||||
Install Flagger in the `kuma-system` namespace:
|
||||
|
||||
```bash
|
||||
kubectl apply -k github.com/fluxcd/flagger//kustomize/kuma
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
|
||||
then creates a series of objects (Kubernetes deployments, ClusterIP services and Kuma `TrafficRoute`).
|
||||
These objects expose the application inside the mesh and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace and enable Kuma sidecar injection:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
kubectl annotate namespace test kuma.io/sidecar-injection=enabled
|
||||
```
|
||||
|
||||
Install the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Create a canary custom resource for the `podinfo` deployment:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
annotations:
|
||||
kuma.io/mesh: default
|
||||
spec:
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
progressDeadlineSeconds: 60
|
||||
service:
|
||||
port: 9898
|
||||
targetPort: 9898
|
||||
apex:
|
||||
annotations:
|
||||
9898.service.kuma.io/protocol: "http"
|
||||
canary:
|
||||
annotations:
|
||||
9898.service.kuma.io/protocol: "http"
|
||||
primary:
|
||||
annotations:
|
||||
9898.service.kuma.io/protocol: "http"
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 30s
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
threshold: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
threshold: 500
|
||||
interval: 30s
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary.test:9898/token | grep token"
|
||||
- name: load-test
|
||||
type: rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 2m -q 10 -c 2 http://podinfo-canary.test:9898/"
|
||||
```
|
||||
|
||||
Save the above resource as `podinfo-canary.yaml` and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute.
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
ingresses.extensions/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
trafficroutes.kuma.io/podinfo
|
||||
```
|
||||
|
||||
After the boostrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods.
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack.
|
||||
|
||||

|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Advance podinfo.test canary weight 25
|
||||
Waiting for podinfo.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Advance podinfo.test canary weight 30
|
||||
Advance podinfo.test canary weight 35
|
||||
Advance podinfo.test canary weight 40
|
||||
Advance podinfo.test canary weight 45
|
||||
Advance podinfo.test canary weight 50
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
A canary deployment is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
|
||||
* ConfigMaps mounted as volumes or mapped to environment variables
|
||||
* Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2019-06-30T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-06-30T16:15:07Z
|
||||
prod backend Failed 0 2019-06-30T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses and rolls back the faulted version.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xx-xx sh
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://podinfo-canary.test:9898/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://podinfo-canary.test:9898/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 10
|
||||
Phase: Failed
|
||||
Events:
|
||||
Starting canary analysis for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Halt podinfo.test advancement success rate 55.06% < 99%
|
||||
Halt podinfo.test advancement request duration 1.20s > 0.5s
|
||||
Halt podinfo.test advancement request duration 1.45s > 0.5s
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
The above procedures can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.
|
|
@ -0,0 +1,476 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 3
|
||||
title: Linkerd Canary Deployments
|
||||
---
|
||||
|
||||
This guide shows you how to use Linkerd and Flagger to automate canary deployments.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Linkerd **2.10** or newer.
|
||||
|
||||
Install Linkerd the Promethues (part of Linkerd Viz):
|
||||
|
||||
```bash
|
||||
linkerd install | kubectl apply -f -
|
||||
linkerd viz install | kubectl apply -f -
|
||||
```
|
||||
|
||||
Install Flagger in the linkerd namespace:
|
||||
|
||||
```bash
|
||||
kubectl apply -k github.com/fluxcd/flagger//kustomize/linkerd
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
|
||||
then creates a series of objects (Kubernetes deployments, ClusterIP services and SMI traffic split).
|
||||
These objects expose the application inside the mesh and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace and enable Linkerd proxy injection:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
kubectl annotate namespace test linkerd.io/inject=enabled
|
||||
```
|
||||
|
||||
Install the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Create a canary custom resource for the podinfo deployment:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
service:
|
||||
# ClusterIP port number
|
||||
port: 9898
|
||||
# container port number or name (optional)
|
||||
targetPort: 9898
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 30s
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
# Linkerd Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
# testing (optional)
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary.test:9898/token | grep token"
|
||||
- name: load-test
|
||||
type: rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 2m -q 10 -c 2 http://podinfo-canary.test:9898/"
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary. The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute.
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
ingresses.extensions/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
trafficsplits.split.smi-spec.io/podinfo
|
||||
```
|
||||
|
||||
After the boostrap, the podinfo deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods. During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods.
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack.
|
||||
|
||||

|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Advance podinfo.test canary weight 25
|
||||
Waiting for podinfo.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Advance podinfo.test canary weight 30
|
||||
Advance podinfo.test canary weight 35
|
||||
Advance podinfo.test canary weight 40
|
||||
Advance podinfo.test canary weight 45
|
||||
Advance podinfo.test canary weight 50
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
A canary deployment is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
|
||||
* ConfigMaps mounted as volumes or mapped to environment variables
|
||||
* Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2019-06-30T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-06-30T16:15:07Z
|
||||
prod backend Failed 0 2019-06-30T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses and rolls back the faulted version.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xx-xx sh
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://podinfo-canary.test:9898/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://podinfo-canary.test:9898/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 10
|
||||
Phase: Failed
|
||||
Events:
|
||||
Starting canary analysis for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Halt podinfo.test advancement success rate 55.06% < 99%
|
||||
Halt podinfo.test advancement request duration 1.20s > 0.5s
|
||||
Halt podinfo.test advancement request duration 1.45s > 0.5s
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
## Custom metrics
|
||||
|
||||
The canary analysis can be extended with Prometheus queries.
|
||||
|
||||
Let's a define a check for not found errors. Edit the canary analysis and add the following metric:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "404s percentage"
|
||||
threshold: 3
|
||||
query: |
|
||||
100 - sum(
|
||||
rate(
|
||||
response_total{
|
||||
namespace="test",
|
||||
deployment="podinfo",
|
||||
status_code!="404",
|
||||
direction="inbound"
|
||||
}[1m]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
response_total{
|
||||
namespace="test",
|
||||
deployment="podinfo",
|
||||
direction="inbound"
|
||||
}[1m]
|
||||
)
|
||||
)
|
||||
* 100
|
||||
```
|
||||
|
||||
The above configuration validates the canary version by checking if the HTTP 404 req/sec percentage is below three percent of the total traffic. If the 404s rate reaches the 3% threshold, then the analysis is aborted and the canary is marked as failed.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
|
||||
```
|
||||
|
||||
Generate 404s:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://podinfo-canary:9898/status/404
|
||||
```
|
||||
|
||||
Watch Flagger logs:
|
||||
|
||||
```text
|
||||
kubectl -n linkerd logs deployment/flagger -f | jq .msg
|
||||
|
||||
Starting canary deployment for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Halt podinfo.test advancement 404s percentage 6.20 > 3
|
||||
Halt podinfo.test advancement 404s percentage 6.45 > 3
|
||||
Halt podinfo.test advancement 404s percentage 7.22 > 3
|
||||
Halt podinfo.test advancement 404s percentage 6.50 > 3
|
||||
Halt podinfo.test advancement 404s percentage 6.34 > 3
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
If you have Slack configured, Flagger will send a notification with the reason why the canary failed.
|
||||
|
||||
## Linkerd Ingress
|
||||
|
||||
There are two ingress controllers that are compatible with both Flagger and Linkerd: NGINX and Gloo.
|
||||
|
||||
Install NGINX:
|
||||
|
||||
```bash
|
||||
helm upgrade -i nginx-ingress stable/nginx-ingress \
|
||||
--namespace ingress-nginx
|
||||
```
|
||||
|
||||
Create an ingress definition for podinfo that rewrites the incoming header to the internal service name \(required by Linkerd\):
|
||||
|
||||
```yaml
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
labels:
|
||||
app: podinfo
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: "nginx"
|
||||
nginx.ingress.kubernetes.io/configuration-snippet: |
|
||||
proxy_set_header l5d-dst-override $service_name.$namespace.svc.cluster.local:9898;
|
||||
proxy_hide_header l5d-remote-ip;
|
||||
proxy_hide_header l5d-server-id;
|
||||
spec:
|
||||
rules:
|
||||
- host: app.example.com
|
||||
http:
|
||||
paths:
|
||||
- backend:
|
||||
serviceName: podinfo
|
||||
servicePort: 9898
|
||||
```
|
||||
|
||||
When using an ingress controller, the Linkerd traffic split does not apply to incoming traffic since NGINX in running outside of the mesh. In order to run a canary analysis for a frontend app, Flagger creates a shadow ingress and sets the NGINX specific annotations.
|
||||
|
||||
## A/B Testing
|
||||
|
||||
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions. In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users. This is particularly useful for frontend applications that require session affinity.
|
||||
|
||||

|
||||
|
||||
Edit podinfo canary analysis, set the provider to `nginx`, add the ingress reference, remove the max/step weight and add the match conditions and iterations:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
# ingress reference
|
||||
provider: nginx
|
||||
ingressRef:
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Ingress
|
||||
name: podinfo
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
service:
|
||||
# container port
|
||||
port: 9898
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 10
|
||||
iterations: 10
|
||||
match:
|
||||
# curl -H 'X-Canary: always' http://app.example.com
|
||||
- headers:
|
||||
x-canary:
|
||||
exact: "always"
|
||||
# curl -b 'canary=always' http://app.example.com
|
||||
- headers:
|
||||
cookie:
|
||||
exact: "canary"
|
||||
# Linkerd Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
|
||||
- name: load-test
|
||||
type: rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 2m -q 10 -c 2 -H 'Cookie: canary=always' http://app.example.com"
|
||||
```
|
||||
|
||||
The above configuration will run an analysis for ten minutes targeting users that have a `canary` cookie set to `always` or those that call the service using the `X-Canary: always` header.
|
||||
|
||||
**Note** that the load test now targets the external address and uses the canary cookie.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.4
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts the A/B testing:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Events:
|
||||
Starting canary deployment for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary iteration 1/10
|
||||
Advance podinfo.test canary iteration 2/10
|
||||
Advance podinfo.test canary iteration 3/10
|
||||
Advance podinfo.test canary iteration 4/10
|
||||
Advance podinfo.test canary iteration 5/10
|
||||
Advance podinfo.test canary iteration 6/10
|
||||
Advance podinfo.test canary iteration 7/10
|
||||
Advance podinfo.test canary iteration 8/10
|
||||
Advance podinfo.test canary iteration 9/10
|
||||
Advance podinfo.test canary iteration 10/10
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
The above procedure can be extended with [custom metrics](../usage/metrics.md) checks, [webhooks](../usage/webhooks.md), [manual promotion](../usage/webhooks.md#manual-gating) approval and [Slack or MS Teams](../usage/alerting.md) notifications.
|
||||
|
|
@ -0,0 +1,450 @@
|
|||
---
|
||||
weight: 6
|
||||
title: NGINX Canary Deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide shows you how to use the NGINX ingress controller and Flagger to automate canary deployments and A/B testing.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.19** or newer and NGINX ingress **v1.0.2** or newer.
|
||||
|
||||
Install the NGINX ingress controller with Helm v3:
|
||||
|
||||
```bash
|
||||
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
|
||||
kubectl create ns ingress-nginx
|
||||
helm upgrade -i ingress-nginx ingress-nginx/ingress-nginx \
|
||||
--namespace ingress-nginx \
|
||||
--set controller.metrics.enabled=true \
|
||||
--set controller.podAnnotations."prometheus\.io/scrape"=true \
|
||||
--set controller.podAnnotations."prometheus\.io/port"=10254
|
||||
```
|
||||
|
||||
Install Flagger and the Prometheus add-on in the same namespace as the ingress controller:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace ingress-nginx \
|
||||
--set prometheus.install=true \
|
||||
--set meshProvider=nginx
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
|
||||
then creates a series of objects (Kubernetes deployments, ClusterIP services and canary ingress).
|
||||
These objects expose the application outside the cluster and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger-loadtester flagger/loadtester \
|
||||
--namespace=test
|
||||
```
|
||||
|
||||
Create an ingress definition (replace `app.example.com` with your own domain):
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
labels:
|
||||
app: podinfo
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: "nginx"
|
||||
spec:
|
||||
rules:
|
||||
- host: "app.example.com"
|
||||
http:
|
||||
paths:
|
||||
- pathType: Prefix
|
||||
path: "/"
|
||||
backend:
|
||||
service:
|
||||
name: podinfo
|
||||
port:
|
||||
number: 80
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-ingress.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-ingress.yaml
|
||||
```
|
||||
|
||||
Create a canary custom resource (replace `app.example.com` with your own domain):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
provider: nginx
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# ingress reference
|
||||
ingressRef:
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
name: podinfo
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
service:
|
||||
# ClusterIP port number
|
||||
port: 80
|
||||
# container port number or name
|
||||
targetPort: 9898
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 10s
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 10
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
# NGINX Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
# testing (optional)
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary/token | grep token"
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 http://app.example.com/"
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
ingresses.extensions/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
ingresses.extensions/podinfo-canary
|
||||
```
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance
|
||||
indicators like HTTP requests success rate, requests average duration and pod health.
|
||||
Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack or MS Teams.
|
||||
|
||||

|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger New revision detected podinfo.test
|
||||
Normal Synced 3m flagger Scaling up podinfo.test
|
||||
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 5
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 15
|
||||
Normal Synced 2m flagger Advance podinfo.test canary weight 20
|
||||
Normal Synced 2m flagger Advance podinfo.test canary weight 25
|
||||
Normal Synced 1m flagger Advance podinfo.test canary weight 30
|
||||
Normal Synced 1m flagger Advance podinfo.test canary weight 35
|
||||
Normal Synced 55s flagger Advance podinfo.test canary weight 40
|
||||
Normal Synced 45s flagger Advance podinfo.test canary weight 45
|
||||
Normal Synced 35s flagger Advance podinfo.test canary weight 50
|
||||
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
|
||||
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2019-05-06T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-05-05T16:15:07Z
|
||||
prod backend Failed 0 2019-05-04T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors to test if Flagger pauses and rolls back the faulted version.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
watch curl http://app.example.com/status/500
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold,
|
||||
the traffic is routed back to the primary,
|
||||
the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 10
|
||||
Phase: Failed
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger Starting canary deployment for podinfo.test
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 5
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary weight 15
|
||||
Normal Synced 3m flagger Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 55.06% < 99%
|
||||
Normal Synced 2m flagger Halt podinfo.test advancement success rate 47.00% < 99%
|
||||
Normal Synced 2m flagger (combined from similar events): Halt podinfo.test advancement success rate 38.08% < 99%
|
||||
Warning Synced 1m flagger Rolling back podinfo.test failed checks threshold reached 10
|
||||
Warning Synced 1m flagger Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
## Custom metrics
|
||||
|
||||
The canary analysis can be extended with Prometheus queries.
|
||||
|
||||
The demo app is instrumented with Prometheus so you can create a custom check
|
||||
that will use the HTTP request duration histogram to validate the canary.
|
||||
|
||||
Create a metric template and apply it on the cluster:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: latency
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://flagger-prometheus.ingress-nginx:9090
|
||||
query: |
|
||||
histogram_quantile(0.99,
|
||||
sum(
|
||||
rate(
|
||||
http_request_duration_seconds_bucket{
|
||||
kubernetes_namespace="{{ namespace }}",
|
||||
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
|
||||
}[1m]
|
||||
)
|
||||
) by (le)
|
||||
)
|
||||
```
|
||||
|
||||
Edit the canary analysis and add the latency check:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "latency"
|
||||
templateRef:
|
||||
name: latency
|
||||
thresholdRange:
|
||||
max: 0.5
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
The threshold is set to 500ms so if the average request duration in the last minute goes over
|
||||
half a second then the analysis will fail and the canary will not be promoted.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
|
||||
```
|
||||
|
||||
Generate high response latency:
|
||||
|
||||
```bash
|
||||
watch curl http://app.example.com/delay/2
|
||||
```
|
||||
|
||||
Watch Flagger logs:
|
||||
|
||||
```text
|
||||
kubectl -n nginx-ingress logs deployment/flagger -f | jq .msg
|
||||
|
||||
Starting canary deployment for podinfo.test
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement latency 1.20 > 0.5
|
||||
Halt podinfo.test advancement latency 1.45 > 0.5
|
||||
Halt podinfo.test advancement latency 1.60 > 0.5
|
||||
Halt podinfo.test advancement latency 1.69 > 0.5
|
||||
Halt podinfo.test advancement latency 1.70 > 0.5
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
If you have alerting configured, Flagger will send a notification with the reason why the canary failed.
|
||||
|
||||
## A/B Testing
|
||||
|
||||
Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions.
|
||||
In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users.
|
||||
This is particularly useful for frontend applications that require session affinity.
|
||||
|
||||

|
||||
|
||||
Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 10
|
||||
iterations: 10
|
||||
match:
|
||||
# curl -H 'X-Canary: insider' http://app.example.com
|
||||
- headers:
|
||||
x-canary:
|
||||
exact: "insider"
|
||||
# curl -b 'canary=always' http://app.example.com
|
||||
- headers:
|
||||
cookie:
|
||||
exact: "canary"
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 -H 'Cookie: canary=always' http://app.example.com/"
|
||||
```
|
||||
|
||||
The above configuration will run an analysis for ten minutes targeting users that have
|
||||
a `canary` cookie set to `always` or those that call the service using the `X-Canary: insider` header.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.4
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts the A/B testing:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Synced 3m flagger New revision detected podinfo.test
|
||||
Normal Synced 3m flagger Scaling up podinfo.test
|
||||
Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10
|
||||
Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10
|
||||
Normal Synced 2m flagger Advance podinfo.test canary iteration 4/10
|
||||
Normal Synced 2m flagger Advance podinfo.test canary iteration 5/10
|
||||
Normal Synced 1m flagger Advance podinfo.test canary iteration 6/10
|
||||
Normal Synced 1m flagger Advance podinfo.test canary iteration 7/10
|
||||
Normal Synced 55s flagger Advance podinfo.test canary iteration 8/10
|
||||
Normal Synced 45s flagger Advance podinfo.test canary iteration 9/10
|
||||
Normal Synced 35s flagger Advance podinfo.test canary iteration 10/10
|
||||
Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test
|
||||
Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
The above procedure can be extended with
|
||||
[custom metrics](../usage/metrics.md) checks,
|
||||
[webhooks](../usage/webhooks.md),
|
||||
[manual promotion](../usage/webhooks.md#manual-gating) approval and
|
||||
[Slack or MS Teams](../usage/alerting.md) notifications.
|
|
@ -0,0 +1,367 @@
|
|||
---
|
||||
weight: 9
|
||||
title: Open Service Mesh Canary Deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide shows you how to use Open Service Mesh (OSM) and Flagger to automate canary deployments.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Open Service Mesh **0.9.1** or newer.
|
||||
|
||||
OSM must have permissive traffic policy enabled and have an instance of Prometheus for metrics.
|
||||
|
||||
- If the OSM CLI is being used for installation, install OSM using the following command:
|
||||
```bash
|
||||
osm install \
|
||||
--set=OpenServiceMesh.deployPrometheus=true \
|
||||
--set=OpenServiceMesh.enablePermissiveTrafficPolicy=true
|
||||
```
|
||||
- If a managed instance of OSM is being used:
|
||||
- [Bring your own instance](docs.openservicemesh.io/docs/guides/observability/metrics/#byo-prometheus) of Prometheus,
|
||||
setting the namespace to match the managed OSM controller namespace
|
||||
- Enable permissive traffic policy after installation by updating the OSM MeshConfig resource:
|
||||
```bash
|
||||
# Replace <osm-namespace> with OSM controller's namespace
|
||||
kubectl patch meshconfig osm-mesh-config -n <osm-namespace> -p '{"spec":{"traffic":{"enablePermissiveTrafficPolicyMode":true}}}' --type=merge
|
||||
```
|
||||
|
||||
To install Flagger in the default `osm-system` namespace, use:
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/osm?ref=main
|
||||
```
|
||||
|
||||
Alternatively, if a non-default namespace or managed instance of OSM is in use, install Flagger with Helm, replacing the <osm-namespace>
|
||||
values as appropriate. If a custom instance of Prometheus is being used, replace `osm-prometheus` with the relevant Prometheus service name.
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace=<osm-namespace> \
|
||||
--set meshProvider=osm \
|
||||
--set metricsServer=http://osm-prometheus.<osm-namespace>.svc:7070
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
|
||||
then creates a series of objects (Kubernetes deployments, ClusterIP services and SMI traffic split).
|
||||
These objects expose the application inside the mesh and drive the canary analysis and promotion.
|
||||
|
||||
Create a `test` namespace and enable OSM namespace monitoring and metrics scraping for the namespace.
|
||||
|
||||
```bash
|
||||
kubectl create namespace test
|
||||
osm namespace add test
|
||||
osm metrics enable --namespace test
|
||||
```
|
||||
|
||||
Create a `podinfo` deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Install the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Create a canary custom resource for the `podinfo` deployment.
|
||||
The following `podinfo` canary custom resource instructs Flagger to:
|
||||
1. monitor any changes to the `podinfo` deployment created earlier,
|
||||
2. detect `podinfo` deployment revision changes, and
|
||||
3. start a Flagger canary analysis, rollout, and promotion if there were deployment revision changes.
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
provider: osm
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rolled back (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
service:
|
||||
# ClusterIP port number
|
||||
port: 9898
|
||||
# container port number or name (optional)
|
||||
targetPort: 9898
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 30s
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 5
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
# OSM Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 30s
|
||||
# testing (optional)
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 30s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary.test:9898/token | grep token"
|
||||
- name: load-test
|
||||
type: rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
cmd: "hey -z 2m -q 10 -c 2 http://podinfo-canary.test:9898/"
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
|
||||
The canary analysis will run for five minutes while validating the HTTP metrics and rollout hooks every half a minute.
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects.
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
ingresses.extensions/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
trafficsplits.split.smi-spec.io/podinfo
|
||||
```
|
||||
|
||||
After the boostrap, the `podinfo` deployment will be scaled to zero and the traffic to `podinfo.test` will be routed to the primary pods.
|
||||
During the canary analysis, the `podinfo-canary.test` address can be used to target directly the canary pods.
|
||||
|
||||
## Automated Canary Promotion
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health.
|
||||
Based on analysis of the KPIs a canary is promoted or aborted.
|
||||
|
||||

|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.1
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout.
|
||||
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Advance podinfo.test canary weight 25
|
||||
Waiting for podinfo.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Advance podinfo.test canary weight 30
|
||||
Advance podinfo.test canary weight 35
|
||||
Advance podinfo.test canary weight 40
|
||||
Advance podinfo.test canary weight 45
|
||||
Advance podinfo.test canary weight 50
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply any new changes to the `podinfo` deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
A canary deployment is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
|
||||
* ConfigMaps mounted as volumes or mapped to environment variables
|
||||
* Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2019-06-30T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-06-30T16:15:07Z
|
||||
prod backend Failed 0 2019-06-30T17:05:07Z
|
||||
```
|
||||
|
||||
## Automated Rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors and high latency to test if Flagger pauses and rolls back the faulted version.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.2
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xx-xx sh
|
||||
```
|
||||
|
||||
Repeatedly generate HTTP 500 errors until the `kubectl describe` output below shows canary rollout failure:
|
||||
|
||||
```bash
|
||||
watch -n 0.1 curl http://podinfo-canary.test:9898/status/500
|
||||
```
|
||||
|
||||
Repeatedly generate latency until canary rollout fails:
|
||||
|
||||
```bash
|
||||
watch -n 0.1 curl http://podinfo-canary.test:9898/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis thresholds defined in the `podinfo` canary custom resource earlier, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 10
|
||||
Phase: Failed
|
||||
Events:
|
||||
Starting canary analysis for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement success rate 69.17% < 99%
|
||||
Halt podinfo.test advancement success rate 61.39% < 99%
|
||||
Halt podinfo.test advancement success rate 55.06% < 99%
|
||||
Halt podinfo.test advancement request duration 1.20s > 0.5s
|
||||
Halt podinfo.test advancement request duration 1.45s > 0.5s
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
## Custom Metrics
|
||||
|
||||
The canary analysis can be extended with Prometheus queries.
|
||||
|
||||
Let's define a check for 404 not found errors.
|
||||
Edit the canary analysis (`podinfo-canary.yaml` file) and add the following metric.
|
||||
For more information on creating additional custom metrics using OSM metrics, please check the [metrics available in OSM](https://docs.openservicemesh.io/docs/guides/observability/metrics/#available-metrics).
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "404s percentage"
|
||||
threshold: 3
|
||||
query: |
|
||||
100 - (
|
||||
sum(
|
||||
rate(
|
||||
osm_request_total{
|
||||
destination_namespace="test",
|
||||
destination_kind="Deployment",
|
||||
destination_name="podinfo",
|
||||
response_code!="404"
|
||||
}[1m]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
osm_request_total{
|
||||
destination_namespace="test",
|
||||
destination_kind="Deployment",
|
||||
destination_name="podinfo"
|
||||
}[1m]
|
||||
)
|
||||
) * 100
|
||||
)
|
||||
```
|
||||
|
||||
The above configuration validates the canary version by checking if the HTTP 404 req/sec percentage is below three percent of the total traffic.
|
||||
If the 404s rate reaches the 3% threshold, then the analysis is aborted and the canary is marked as failed.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=ghcr.io/stefanprodan/podinfo:6.0.3
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xx-xx sh
|
||||
```
|
||||
|
||||
Repeatedly generate 404s until canary rollout fails:
|
||||
|
||||
```bash
|
||||
watch -n 0.1 curl http://podinfo-canary.test:9898/status/404
|
||||
```
|
||||
|
||||
Watch Flagger logs to confirm successful canary rollback.
|
||||
|
||||
```text
|
||||
kubectl -n osm-system logs deployment/flagger -f | jq .msg
|
||||
|
||||
Starting canary deployment for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Halt podinfo.test advancement 404s percentage 6.20 > 3
|
||||
Halt podinfo.test advancement 404s percentage 6.45 > 3
|
||||
Halt podinfo.test advancement 404s percentage 7.22 > 3
|
||||
Halt podinfo.test advancement 404s percentage 6.50 > 3
|
||||
Halt podinfo.test advancement 404s percentage 6.34 > 3
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
|
@ -0,0 +1,203 @@
|
|||
---
|
||||
weight: 13
|
||||
title: Canary analysis with Prometheus Operator
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide show you how to use
|
||||
[Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) for canary analysis.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Prometheus Operator **v0.40** or newer.
|
||||
|
||||
Install Prometheus Operator with Helm v3:
|
||||
|
||||
```bash
|
||||
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
|
||||
|
||||
kubectl create ns monitoring
|
||||
helm upgrade -i prometheus prometheus-community/kube-prometheus-stack \
|
||||
--namespace monitoring \
|
||||
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
|
||||
--set fullnameOverride=prometheus
|
||||
```
|
||||
|
||||
The `prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false`
|
||||
option allows Prometheus Operator to watch serviceMonitors outside of its namespace.
|
||||
|
||||
Install Flagger by setting the metrics server to Prometheus:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
kubectl create ns flagger-system
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace flagger-system \
|
||||
--set metricsServer=http://prometheus-prometheus.monitoring:9090 \
|
||||
--set meshProvider=kubernetes
|
||||
```
|
||||
|
||||
Install Flagger's tester:
|
||||
|
||||
```bash
|
||||
helm upgrade -i loadtester flagger/loadtester \
|
||||
--namespace flagger-system
|
||||
```
|
||||
|
||||
Install [podinfo](https://github.com/stefanprodan/podinfo) demo app:
|
||||
|
||||
```bash
|
||||
helm repo add podinfo https://stefanprodan.github.io/podinfo
|
||||
|
||||
kubectl create ns test
|
||||
helm upgrade -i podinfo podinfo/podinfo \
|
||||
--namespace test \
|
||||
--set service.enabled=false
|
||||
```
|
||||
|
||||
## Service monitors
|
||||
|
||||
The demo app is instrumented with Prometheus,
|
||||
so you can create a `ServiceMonitor` objects to scrape podinfo's metrics endpoint:
|
||||
|
||||
```yaml
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: podinfo-canary
|
||||
namespace: test
|
||||
spec:
|
||||
endpoints:
|
||||
- path: /metrics
|
||||
port: http
|
||||
interval: 5s
|
||||
selector:
|
||||
matchLabels:
|
||||
app: podinfo-canary
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: podinfo-primary
|
||||
namespace: test
|
||||
spec:
|
||||
endpoints:
|
||||
- path: /metrics
|
||||
port: http
|
||||
interval: 5s
|
||||
selector:
|
||||
matchLabels:
|
||||
app: podinfo
|
||||
```
|
||||
|
||||
We are setting `interval: 5s` to have a more aggressive scraping.
|
||||
If you do not define it, you should use a longer interval in the Canary object.
|
||||
|
||||
## Metric templates
|
||||
|
||||
Create a metric template to measure the HTTP requests error rate:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: error-rate
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
address: http://prometheus-prometheus.monitoring:9090
|
||||
type: prometheus
|
||||
query: |
|
||||
100 - rate(
|
||||
http_requests_total{
|
||||
namespace="{{ namespace }}",
|
||||
job="{{ target }}-canary",
|
||||
status!~"5.*"
|
||||
}[{{ interval }}])
|
||||
/
|
||||
rate(
|
||||
http_requests_total{
|
||||
namespace="{{ namespace }}",
|
||||
job="{{ target }}-canary"
|
||||
}[{{ interval }}]
|
||||
) * 100
|
||||
```
|
||||
|
||||
Create a metric template to measure the HTTP requests average duration:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: latency
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
address: http://prometheus-prometheus.monitoring:9090
|
||||
type: prometheus
|
||||
query: |
|
||||
histogram_quantile(0.99,
|
||||
sum(
|
||||
rate(
|
||||
http_request_duration_seconds_bucket{
|
||||
namespace="{{ namespace }}",
|
||||
job="{{ target }}-canary"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
) by (le)
|
||||
)
|
||||
```
|
||||
|
||||
## Canary analysis
|
||||
|
||||
Using the metrics template you can configure the canary analysis with HTTP error rate and latency checks:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
provider: kubernetes
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
progressDeadlineSeconds: 60
|
||||
service:
|
||||
port: 80
|
||||
targetPort: http
|
||||
name: podinfo
|
||||
analysis:
|
||||
interval: 30s
|
||||
iterations: 10
|
||||
threshold: 2
|
||||
metrics:
|
||||
- name: error-rate
|
||||
templateRef:
|
||||
name: error-rate
|
||||
thresholdRange:
|
||||
max: 1
|
||||
interval: 30s
|
||||
- name: latency
|
||||
templateRef:
|
||||
name: latency
|
||||
thresholdRange:
|
||||
max: 0.5
|
||||
interval: 30s
|
||||
webhooks:
|
||||
- name: load-test
|
||||
type: rollout
|
||||
url: "http://loadtester.flagger-system/"
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test/"
|
||||
```
|
||||
|
||||
Based on the above specification, Flagger creates the primary and canary Kubernetes ClusterIP service.
|
||||
|
||||
During the canary analysis, Prometheus will scrape the canary service and Flagger will use the HTTP error rate
|
||||
and latency queries to determine if the release should be promoted or rolled back.
|
|
@ -0,0 +1,383 @@
|
|||
---
|
||||
weight: 7
|
||||
title: Skipper Canary Deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide shows you how to use the [Skipper ingress controller](https://opensource.zalando.com/skipper/kubernetes/ingress-controller/) and Flagger to automate canary deployments.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.19** or newer and Skipper ingress **v0.13** or newer.
|
||||
|
||||
Install Skipper ingress-controller using [upstream definition](https://opensource.zalando.com/skipper/kubernetes/ingress-controller/#install-skipper-as-ingress-controller).
|
||||
|
||||
Certain arguments are relevant:
|
||||
|
||||
```yaml
|
||||
- -enable-connection-metrics
|
||||
- -histogram-metric-buckets=.01,1,10,100
|
||||
- -kubernetes
|
||||
- -kubernetes-in-cluster
|
||||
- -kubernetes-path-mode=path-prefix
|
||||
- -metrics-exp-decay-sample
|
||||
- -metrics-flavour=prometheus
|
||||
- -route-backend-metrics
|
||||
- -route-backend-error-counters
|
||||
- -route-response-metrics
|
||||
- -serve-host-metrics
|
||||
- -serve-route-metrics
|
||||
- -whitelisted-healthcheck-cidr=0.0.0.0/0 # permit Kind source health checks
|
||||
```
|
||||
|
||||
Install Flagger using kustomize:
|
||||
|
||||
```bash
|
||||
kustomize build https://github.com/fluxcd/flagger/kustomize/kubernetes | kubectl apply -f -
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
|
||||
then creates a series of objects (Kubernetes deployments, ClusterIP services and canary ingress).
|
||||
These objects expose the application outside the cluster and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger-loadtester flagger/loadtester \
|
||||
--namespace=test
|
||||
```
|
||||
|
||||
Create an ingress definition \(replace `app.example.com` with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
labels:
|
||||
app: podinfo
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: "skipper"
|
||||
spec:
|
||||
rules:
|
||||
- host: "app.example.com"
|
||||
http:
|
||||
paths:
|
||||
- pathType: Prefix
|
||||
path: "/"
|
||||
backend:
|
||||
service:
|
||||
name: podinfo
|
||||
port:
|
||||
number: 80
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-ingress.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-ingress.yaml
|
||||
```
|
||||
|
||||
Create a canary custom resource (replace `app.example.com` with your own domain):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
provider: skipper
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# ingress reference
|
||||
ingressRef:
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
name: podinfo
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
service:
|
||||
# ClusterIP port number
|
||||
port: 80
|
||||
# container port number or name
|
||||
targetPort: 9898
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 10s
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 10
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
# Skipper Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
interval: 1m
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
- name: request-duration
|
||||
interval: 1m
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
webhooks:
|
||||
- name: gate
|
||||
type: confirm-rollout
|
||||
url: http://flagger-loadtester.test/gate/approve
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 10s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary/token | grep token"
|
||||
- name: load-test
|
||||
type: rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "hey -z 10m -q 10 -c 2 -host app.example.com http://skipper-ingress.kube-system"
|
||||
logCmdOutput: "true"
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
ingress.networking.k8s.io/podinfo-ingress
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
ingress.networking.k8s.io/podinfo-canary
|
||||
```
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring
|
||||
key performance indicators like HTTP requests success rate, requests average duration and pod health.
|
||||
Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack or MS Teams.
|
||||
|
||||

|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:4.0.6
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Advance podinfo.test canary weight 25
|
||||
Advance podinfo.test canary weight 30
|
||||
Advance podinfo.test canary weight 35
|
||||
Advance podinfo.test canary weight 40
|
||||
Advance podinfo.test canary weight 45
|
||||
Advance podinfo.test canary weight 50
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Routing all traffic to primary
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo-2 Progressing 30 2020-08-14T12:32:12Z
|
||||
test podinfo Succeeded 0 2020-08-14T11:23:88Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors to test if Flagger pauses and rolls back the faulted version.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:4.0.6
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it deploy/flagger-loadtester bash
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
hey -z 1m -c 5 -q 5 http://app.example.com/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://app.example.com/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary,
|
||||
the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n flagger-system logs deploy/flagger -f | jq .msg
|
||||
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Canary deployment podinfo.test not ready: waiting for rollout to finish: 0 of 1 updated replicas are available
|
||||
Starting canary analysis for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Halt podinfo.test advancement success rate 53.42% < 99%
|
||||
Halt podinfo.test advancement success rate 53.19% < 99%
|
||||
Halt podinfo.test advancement success rate 48.05% < 99%
|
||||
Rolling back podinfo.test failed checks threshold reached 3
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
## Custom metrics
|
||||
|
||||
The canary analysis can be extended with Prometheus queries.
|
||||
|
||||
Create a metric template and apply it on the cluster:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: latency
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://flagger-prometheus.flagger-system:9090
|
||||
query: |
|
||||
histogram_quantile(0.99,
|
||||
sum(
|
||||
rate(
|
||||
skipper_serve_route_duration_seconds_bucket{
|
||||
route=~"{{ printf "kube(ew)?_%s__%s_canary__.*__%s_canary(_[0-9]+)?" namespace ingress service }}",
|
||||
le="+Inf"
|
||||
}[1m]
|
||||
)
|
||||
) by (le)
|
||||
)
|
||||
```
|
||||
|
||||
Edit the canary analysis and add the latency check:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "latency"
|
||||
templateRef:
|
||||
name: latency
|
||||
thresholdRange:
|
||||
max: 0.5
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
The threshold is set to 500ms so if the average request duration in the last minute goes over half a second
|
||||
then the analysis will fail and the canary will not be promoted.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:4.0.6
|
||||
```
|
||||
|
||||
Generate high response latency:
|
||||
|
||||
```bash
|
||||
watch curl http://app.example.com/delay/2
|
||||
```
|
||||
|
||||
Watch Flagger logs:
|
||||
|
||||
```text
|
||||
kubectl -n flagger-system logs deployment/flagger -f | jq .msg
|
||||
|
||||
Starting canary deployment for podinfo.test
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement latency 1.20 > 0.5
|
||||
Halt podinfo.test advancement latency 1.45 > 0.5
|
||||
Halt podinfo.test advancement latency 1.60 > 0.5
|
||||
Halt podinfo.test advancement latency 1.69 > 0.5
|
||||
Halt podinfo.test advancement latency 1.70 > 0.5
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
If you have alerting configured, Flagger will send a notification with the reason why the canary failed.
|
|
@ -0,0 +1,374 @@
|
|||
---
|
||||
weight: 8
|
||||
title: Traefik Canary Deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This guide shows you how to use the [Traefik](https://doc.traefik.io/traefik/) and Flagger to automate canary deployments.
|
||||
|
||||

|
||||
|
||||
## Prerequisites
|
||||
|
||||
Flagger requires a Kubernetes cluster **v1.16** or newer and Traefik **v2.3** or newer.
|
||||
|
||||
Install Traefik with Helm v3:
|
||||
|
||||
```bash
|
||||
helm repo add traefik https://helm.traefik.io/traefik
|
||||
kubectl create ns traefik
|
||||
|
||||
cat <<EOF | helm upgrade -i traefik traefik/traefik --namespace traefik -f -
|
||||
deployment:
|
||||
podAnnotations:
|
||||
prometheus.io/port: "9100"
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/path: "/metrics"
|
||||
metrics:
|
||||
prometheus:
|
||||
entryPoint: metrics
|
||||
EOF
|
||||
```
|
||||
|
||||
Install Flagger and the Prometheus add-on in the same namespace as Traefik:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--namespace traefik \
|
||||
--set prometheus.install=true \
|
||||
--set meshProvider=traefik
|
||||
```
|
||||
|
||||
## Bootstrap
|
||||
|
||||
Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\), then creates a series of objects \(Kubernetes deployments, ClusterIP services and TraefikService\). These objects expose the application outside the cluster and drive the canary analysis and promotion.
|
||||
|
||||
Create a test namespace:
|
||||
|
||||
```bash
|
||||
kubectl create ns test
|
||||
```
|
||||
|
||||
Create a deployment and a horizontal pod autoscaler:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/podinfo?ref=main
|
||||
```
|
||||
|
||||
Deploy the load testing service to generate traffic during the canary analysis:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger-loadtester flagger/loadtester \
|
||||
--namespace=test
|
||||
```
|
||||
|
||||
Create Traefik IngressRoute that references TraefikService generated by Flagger \(replace `app.example.com` with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: traefik.containo.us/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
entryPoints:
|
||||
- web
|
||||
routes:
|
||||
- match: Host(`app.example.com`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: podinfo
|
||||
kind: TraefikService
|
||||
port: 80
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-ingressroute.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-ingressroute.yaml
|
||||
```
|
||||
|
||||
Create a canary custom resource \(replace `app.example.com` with your own domain\):
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
namespace: test
|
||||
spec:
|
||||
provider: traefik
|
||||
# deployment reference
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
# HPA reference (optional)
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
# the maximum time in seconds for the canary deployment
|
||||
# to make progress before it is rollback (default 600s)
|
||||
progressDeadlineSeconds: 60
|
||||
service:
|
||||
# ClusterIP port number
|
||||
port: 80
|
||||
# container port number or name
|
||||
targetPort: 9898
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 10s
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 10
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 5
|
||||
# Traefik Prometheus checks
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
interval: 1m
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
- name: request-duration
|
||||
interval: 1m
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
webhooks:
|
||||
- name: acceptance-test
|
||||
type: pre-rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 10s
|
||||
metadata:
|
||||
type: bash
|
||||
cmd: "curl -sd 'test' http://podinfo-canary.test/token | grep token"
|
||||
- name: load-test
|
||||
type: rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "hey -z 10m -q 10 -c 2 -host app.example.com http://traefik.traefik"
|
||||
logCmdOutput: "true"
|
||||
```
|
||||
|
||||
Save the above resource as podinfo-canary.yaml and then apply it:
|
||||
|
||||
```bash
|
||||
kubectl apply -f ./podinfo-canary.yaml
|
||||
```
|
||||
|
||||
After a couple of seconds Flagger will create the canary objects:
|
||||
|
||||
```bash
|
||||
# applied
|
||||
deployment.apps/podinfo
|
||||
horizontalpodautoscaler.autoscaling/podinfo
|
||||
canary.flagger.app/podinfo
|
||||
|
||||
# generated
|
||||
deployment.apps/podinfo-primary
|
||||
horizontalpodautoscaler.autoscaling/podinfo-primary
|
||||
service/podinfo
|
||||
service/podinfo-canary
|
||||
service/podinfo-primary
|
||||
traefikservice.traefik.containo.us/podinfo
|
||||
```
|
||||
|
||||
## Automated canary promotion
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators like HTTP requests success rate, requests average duration and pod health. Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack or MS Teams.
|
||||
|
||||

|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:4.0.6
|
||||
```
|
||||
|
||||
Flagger detects that the deployment revision changed and starts a new rollout:
|
||||
|
||||
```text
|
||||
kubectl -n test describe canary/podinfo
|
||||
|
||||
Status:
|
||||
Canary Weight: 0
|
||||
Failed Checks: 0
|
||||
Phase: Succeeded
|
||||
Events:
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Advance podinfo.test canary weight 25
|
||||
Advance podinfo.test canary weight 30
|
||||
Advance podinfo.test canary weight 35
|
||||
Advance podinfo.test canary weight 40
|
||||
Advance podinfo.test canary weight 45
|
||||
Advance podinfo.test canary weight 50
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available
|
||||
Routing all traffic to primary
|
||||
Promotion completed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis.
|
||||
|
||||
You can monitor all canaries with:
|
||||
|
||||
```bash
|
||||
watch kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo-2 Progressing 30 2020-08-14T12:32:12Z
|
||||
test podinfo Succeeded 0 2020-08-14T11:23:88Z
|
||||
```
|
||||
|
||||
## Automated rollback
|
||||
|
||||
During the canary analysis you can generate HTTP 500 errors to test if Flagger pauses and rolls back the faulted version.
|
||||
|
||||
Trigger another canary deployment:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:4.0.6
|
||||
```
|
||||
|
||||
Exec into the load tester pod with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it deploy/flagger-loadtester bash
|
||||
```
|
||||
|
||||
Generate HTTP 500 errors:
|
||||
|
||||
```bash
|
||||
hey -z 1m -c 5 -q 5 http://app.example.com/status/500
|
||||
```
|
||||
|
||||
Generate latency:
|
||||
|
||||
```bash
|
||||
watch -n 1 curl http://app.example.com/delay/1
|
||||
```
|
||||
|
||||
When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary, the canary is scaled to zero and the rollout is marked as failed.
|
||||
|
||||
```text
|
||||
kubectl -n traefik logs deploy/flagger -f | jq .msg
|
||||
|
||||
New revision detected! Scaling up podinfo.test
|
||||
Canary deployment podinfo.test not ready: waiting for rollout to finish: 0 of 1 updated replicas are available
|
||||
Starting canary analysis for podinfo.test
|
||||
Pre-rollout check acceptance-test passed
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Halt podinfo.test advancement success rate 53.42% < 99%
|
||||
Halt podinfo.test advancement success rate 53.19% < 99%
|
||||
Halt podinfo.test advancement success rate 48.05% < 99%
|
||||
Rolling back podinfo.test failed checks threshold reached 3
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
## Custom metrics
|
||||
|
||||
The canary analysis can be extended with Prometheus queries.
|
||||
|
||||
Create a metric template and apply it on the cluster:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: not-found-percentage
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://flagger-prometheus.traefik:9090
|
||||
query: |
|
||||
sum(
|
||||
rate(
|
||||
traefik_service_request_duration_seconds_bucket{
|
||||
service=~"{{ namespace }}-{{ target }}-canary-[0-9a-zA-Z-]+@kubernetescrd",
|
||||
code!="404",
|
||||
}[{{ interval }}]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
traefik_service_request_duration_seconds_bucket{
|
||||
service=~"{{ namespace }}-{{ target }}-canary-[0-9a-zA-Z-]+@kubernetescrd",
|
||||
}[{{ interval }}]
|
||||
)
|
||||
) * 100
|
||||
```
|
||||
|
||||
Edit the canary analysis and add the not found error rate check:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "404s percentage"
|
||||
templateRef:
|
||||
name: not-found-percentage
|
||||
thresholdRange:
|
||||
max: 5
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
The above configuration validates the canary by checking if the HTTP 404 req/sec percentage is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails.
|
||||
|
||||
Trigger a canary deployment by updating the container image:
|
||||
|
||||
```bash
|
||||
kubectl -n test set image deployment/podinfo \
|
||||
podinfod=stefanprodan/podinfo:4.0.6
|
||||
```
|
||||
|
||||
Generate 404s:
|
||||
|
||||
```bash
|
||||
watch curl http://app.example.com/status/400
|
||||
```
|
||||
|
||||
Watch Flagger logs:
|
||||
|
||||
```text
|
||||
kubectl -n traefik logs deployment/flagger -f | jq .msg
|
||||
|
||||
Starting canary deployment for podinfo.test
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Halt podinfo.test advancement 404s percentage 6.20 > 5
|
||||
Halt podinfo.test advancement 404s percentage 6.45 > 5
|
||||
Halt podinfo.test advancement 404s percentage 7.60 > 5
|
||||
Halt podinfo.test advancement 404s percentage 8.69 > 5
|
||||
Halt podinfo.test advancement 404s percentage 9.70 > 5
|
||||
Rolling back podinfo.test failed checks threshold reached 5
|
||||
Canary failed! Scaling down podinfo.test
|
||||
```
|
||||
|
||||
If you have [alerting](../usage/alerting.md) configured, Flagger will send a notification with the reason why the canary failed.
|
||||
|
||||
For an in-depth look at the analysis process read the [usage docs](../usage/how-it-works.md).
|
||||
|
|
@ -0,0 +1,189 @@
|
|||
---
|
||||
weight: 14
|
||||
title: Zero downtime deployments
|
||||
type: docs
|
||||
---
|
||||
|
||||
This is a list of things you should consider when dealing with a high traffic production environment if you want to minimise the impact of rolling updates and downscaling.
|
||||
|
||||
## Deployment strategy
|
||||
|
||||
Limit the number of unavailable pods during a rolling update:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
spec:
|
||||
progressDeadlineSeconds: 120
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 0
|
||||
```
|
||||
|
||||
The default progress deadline for a deployment is ten minutes. You should consider adjusting this value to make the deployment process fail faster.
|
||||
|
||||
## Liveness health check
|
||||
|
||||
You application should expose a HTTP endpoint that Kubernetes can call to determine if your app transitioned to a broken state from which it can't recover and needs to be restarted.
|
||||
|
||||
```yaml
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --timeout=4
|
||||
- --spider
|
||||
- http://localhost:8080/healthz
|
||||
timeoutSeconds: 5
|
||||
initialDelaySeconds: 5
|
||||
```
|
||||
|
||||
If you've enabled mTLS, you'll have to use `exec` for liveness and readiness checks since kubelet is not part of the service mesh and doesn't have access to the TLS cert.
|
||||
|
||||
## Readiness health check
|
||||
|
||||
You application should expose a HTTP endpoint that Kubernetes can call to determine if your app is ready to receive traffic.
|
||||
|
||||
```yaml
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --timeout=4
|
||||
- --spider
|
||||
- http://localhost:8080/readyz
|
||||
timeoutSeconds: 5
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
```
|
||||
|
||||
If your app depends on external services, you should check if those services are available before allowing Kubernetes to route traffic to an app instance. Keep in mind that the Envoy sidecar can have a slower startup than your app. This means that on application start you should retry for at least a couple of seconds any external connection.
|
||||
|
||||
## Graceful shutdown
|
||||
|
||||
Before a pod gets terminated, Kubernetes sends a `SIGTERM` signal to every container and waits for period of time \(30s by default\) for all containers to exit gracefully. If your app doesn't handle the `SIGTERM` signal or if it doesn't exit within the grace period, Kubernetes will kill the container and any inflight requests that your app is processing will fail.
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
terminationGracePeriodSeconds: 60
|
||||
containers:
|
||||
- name: app
|
||||
lifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command:
|
||||
- sleep
|
||||
- "10"
|
||||
```
|
||||
|
||||
Your app container should have a `preStop` hook that delays the container shutdown. This will allow the service mesh to drain the traffic and remove this pod from all other Envoy sidecars before your app becomes unavailable.
|
||||
|
||||
## Delay Envoy shutdown
|
||||
|
||||
Even if your app reacts to `SIGTERM` and tries to complete the inflight requests before shutdown, that doesn't mean that the response will make it back to the caller. If the Envoy sidecar shuts down before your app, then the caller will receive a 503 error.
|
||||
|
||||
To mitigate this issue you can add a `preStop` hook to the Istio proxy and wait for the main app to exit before Envoy exits.
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
set -e
|
||||
if ! pidof envoy &>/dev/null; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if ! pidof pilot-agent &>/dev/null; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l | xargs) -ne 0 ]; do
|
||||
sleep 1;
|
||||
done
|
||||
|
||||
exit 0
|
||||
```
|
||||
|
||||
You'll have to build your own Envoy docker image with the above script and modify the Istio injection webhook with the `preStop` directive.
|
||||
|
||||
Thanks to Stono for his excellent [tips](https://github.com/istio/istio/issues/12183) on minimising 503s.
|
||||
|
||||
## Resource requests and limits
|
||||
|
||||
Setting CPU and memory requests/limits for all workloads is a mandatory step if you're running a production system. Without limits your nodes could run out of memory or become unresponsive due to CPU exhausting. Without CPU and memory requests, the Kubernetes scheduler will not be able to make decisions about which nodes to place pods on.
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: app
|
||||
resources:
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 1Gi
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
```
|
||||
|
||||
Note that without resource requests the horizontal pod autoscaler can't determine when to scale your app.
|
||||
|
||||
## Autoscaling
|
||||
|
||||
A production environment should be able to handle traffic bursts without impacting the quality of service. This can be achieved with Kubernetes autoscaling capabilities. Autoscaling in Kubernetes has two dimensions: the Cluster Autoscaler that deals with node scaling operations and the Horizontal Pod Autoscaler that automatically scales the number of pods in a deployment.
|
||||
|
||||
```yaml
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: app
|
||||
minReplicas: 2
|
||||
maxReplicas: 4
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
targetAverageValue: 900m
|
||||
- type: Resource
|
||||
resource:
|
||||
name: memory
|
||||
targetAverageValue: 768Mi
|
||||
```
|
||||
|
||||
The above HPA ensures your app will be scaled up before the pods reach the CPU or memory limits.
|
||||
|
||||
## Ingress retries
|
||||
|
||||
To minimise the impact of downscaling operations you can make use of Envoy retry capabilities.
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
spec:
|
||||
service:
|
||||
port: 9898
|
||||
gateways:
|
||||
- public-gateway.istio-system.svc.cluster.local
|
||||
hosts:
|
||||
- app.example.com
|
||||
retries:
|
||||
attempts: 10
|
||||
perTryTimeout: 5s
|
||||
retryOn: "gateway-error,connect-failure,refused-stream"
|
||||
```
|
||||
|
||||
When the HPA scales down your app, your users could run into 503 errors. The above configuration will make Envoy retry the HTTP requests that failed due to gateway errors.
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 3
|
||||
title: Usage
|
||||
---
|
|
@ -0,0 +1,151 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 4
|
||||
title: Alerting
|
||||
---
|
||||
|
||||
Flagger can be configured to send alerts to various chat platforms.
|
||||
You can define a global alert provider at install time or configure alerts on a per canary basis.
|
||||
|
||||
## Global configuration
|
||||
|
||||
### Slack
|
||||
|
||||
#### Slack Configuration
|
||||
|
||||
Flagger requires a custom webhook integration from slack, instead of the new slack app system.
|
||||
|
||||
The webhook can be generated by following the [legacy slack documentation](https://api.slack.com/legacy/custom-integrations/messaging/webhooks)
|
||||
|
||||
#### Flagger configuration
|
||||
|
||||
Once the webhook has been generated. Flagger can be configured to send Slack notifications:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
|
||||
--set slack.proxy=my-http-proxy.com \ # optional http/s proxy
|
||||
--set slack.channel=general \
|
||||
--set slack.user=flagger \
|
||||
--set clusterName=my-cluster
|
||||
```
|
||||
|
||||
Once configured with a Slack incoming **webhook**,
|
||||
Flagger will post messages when a canary deployment has been initialised,
|
||||
when a new revision has been detected and if the canary analysis failed or succeeded.
|
||||
|
||||

|
||||
|
||||
A canary deployment will be rolled back if the progress deadline exceeded
|
||||
or if the analysis reached the maximum number of failed checks:
|
||||
|
||||

|
||||
|
||||
### Microsoft Teams
|
||||
|
||||
Flagger can be configured to send notifications to Microsoft Teams:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--set msteams.url=https://outlook.office.com/webhook/YOUR/TEAMS/WEBHOOK \
|
||||
--set msteams.proxy-url=my-http-proxy.com # optional http/s proxy
|
||||
```
|
||||
|
||||
Similar to Slack, Flagger alerts on canary analysis events:
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## Canary configuration
|
||||
|
||||
Configuring alerting globally has several limitations as it's not possible to specify different channels
|
||||
or configure the verbosity on a per canary basis. To make the alerting move flexible,
|
||||
the canary analysis can be extended with a list of alerts that reference an alert provider.
|
||||
For each alert, users can configure the severity level. The alerts section overrides the global setting.
|
||||
|
||||
Slack example:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: AlertProvider
|
||||
metadata:
|
||||
name: on-call
|
||||
namespace: flagger
|
||||
spec:
|
||||
type: slack
|
||||
channel: on-call-alerts
|
||||
username: flagger
|
||||
# webhook address (ignored if secretRef is specified)
|
||||
address: https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
|
||||
# optional http/s proxy
|
||||
proxy: http://my-http-proxy.com
|
||||
# secret containing the webhook address (optional)
|
||||
secretRef:
|
||||
name: on-call-url
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: on-call-url
|
||||
namespace: flagger
|
||||
data:
|
||||
address: <encoded-url>
|
||||
```
|
||||
|
||||
The alert provider **type** can be: `slack`, `msteams`, `rocket` or `discord`. When set to `discord`,
|
||||
Flagger will use [Slack formatting](https://birdie0.github.io/discord-webhooks-guide/other/slack_formatting.html)
|
||||
and will append `/slack` to the Discord address.
|
||||
|
||||
When not specified, **channel** defaults to `general` and **username** defaults to `flagger`.
|
||||
|
||||
When **secretRef** is specified, the Kubernetes secret must contain a data field named `address`,
|
||||
the address in the secret will take precedence over the **address** field in the provider spec.
|
||||
|
||||
The canary analysis can have a list of alerts, each alert referencing an alert provider:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
alerts:
|
||||
- name: "on-call Slack"
|
||||
severity: error
|
||||
providerRef:
|
||||
name: on-call
|
||||
namespace: flagger
|
||||
- name: "qa Discord"
|
||||
severity: warn
|
||||
providerRef:
|
||||
name: qa-discord
|
||||
- name: "dev MS Teams"
|
||||
severity: info
|
||||
providerRef:
|
||||
name: dev-msteams
|
||||
```
|
||||
|
||||
Alert fields:
|
||||
|
||||
* **name** \(required\)
|
||||
* **severity** levels: `info`, `warn`, `error` (default info)
|
||||
* **providerRef.name** alert provider name (required)
|
||||
* **providerRef.namespace** alert provider namespace (defaults to the canary namespace)
|
||||
|
||||
When the severity is set to `warn`, Flagger will alert when waiting on manual confirmation or if the analysis fails.
|
||||
When the severity is set to `error`, Flagger will alert only if the canary analysis fails.
|
||||
|
||||
To differentiate alerts based on the cluster name, you can configure Flagger with the `-cluster-name=my-cluster`
|
||||
command flag, or with Helm `--set clusterName=my-cluster`.
|
||||
|
||||
## Prometheus Alert Manager
|
||||
|
||||
You can use Alertmanager to trigger alerts when a canary deployment failed:
|
||||
|
||||
```yaml
|
||||
- alert: canary_rollback
|
||||
expr: flagger_canary_status > 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Canary failed"
|
||||
description: "Workload {{ $labels.name }} namespace {{ $labels.namespace }}"
|
||||
```
|
|
@ -0,0 +1,399 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 2
|
||||
title: Deployment Strategies
|
||||
---
|
||||
|
||||
Flagger can run automated application analysis, promotion and rollback for the following deployment strategies:
|
||||
|
||||
* **Canary Release** \(progressive traffic shifting\)
|
||||
* Istio, Linkerd, App Mesh, NGINX, Skipper, Contour, Gloo Edge, Traefik, Open Service Mesh, Kuma, Gateway API
|
||||
* **A/B Testing** \(HTTP headers and cookies traffic routing\)
|
||||
* Istio, App Mesh, NGINX, Contour, Gloo Edge, Gateway API
|
||||
* **Blue/Green** \(traffic switching\)
|
||||
* Kubernetes CNI, Istio, Linkerd, App Mesh, NGINX, Contour, Gloo Edge, Open Service Mesh, Gateway API
|
||||
* **Blue/Green Mirroring** \(traffic shadowing\)
|
||||
* Istio
|
||||
|
||||
For Canary releases and A/B testing you'll need a Layer 7 traffic management solution like
|
||||
a service mesh or an ingress controller. For Blue/Green deployments no service mesh or ingress controller is required.
|
||||
|
||||
A canary analysis is triggered by changes in any of the following objects:
|
||||
|
||||
* Deployment PodSpec \(container image, command, ports, env, resources, etc\)
|
||||
* ConfigMaps mounted as volumes or mapped to environment variables
|
||||
* Secrets mounted as volumes or mapped to environment variables
|
||||
|
||||
## Canary Release
|
||||
|
||||
Flagger implements a control loop that gradually shifts traffic to the canary while measuring
|
||||
key performance indicators like HTTP requests success rate, requests average duration and pod health.
|
||||
Based on analysis of the KPIs a canary is promoted or aborted.
|
||||
|
||||

|
||||
|
||||
The canary analysis runs periodically until it reaches the maximum traffic weight or the failed checks threshold.
|
||||
|
||||
Spec:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# max number of failed metric checks before rollback
|
||||
threshold: 10
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight: 50
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight: 2
|
||||
# promotion increment step (default 100)
|
||||
# percentage (0-100)
|
||||
stepWeightPromotion: 100
|
||||
# deploy straight to production without
|
||||
# the metrics and webhook checks
|
||||
skipAnalysis: false
|
||||
```
|
||||
|
||||
The above analysis, if it succeeds, will run for 25 minutes while validating the HTTP metrics and webhooks every minute.
|
||||
You can determine the minimum time it takes to validate and promote a canary deployment using this formula:
|
||||
|
||||
```text
|
||||
interval * (maxWeight / stepWeight)
|
||||
```
|
||||
|
||||
And the time it takes for a canary to be rollback when the metrics or webhook checks are failing:
|
||||
|
||||
```text
|
||||
interval * threshold
|
||||
```
|
||||
|
||||
When `stepWeightPromotion` is specified, the promotion phase happens in stages, the traffic is routed back
|
||||
to the primary pods in a progressive manner, the primary weight is increased until it reaches 100%.
|
||||
|
||||
In emergency cases, you may want to skip the analysis phase and ship changes directly to production.
|
||||
At any time you can set the `spec.skipAnalysis: true`. When skip analysis is enabled,
|
||||
Flagger checks if the canary deployment is healthy and promotes it without analysing it.
|
||||
If an analysis is underway, Flagger cancels it and runs the promotion.
|
||||
|
||||
Gated canary promotion stages:
|
||||
|
||||
* scan for canary deployments
|
||||
* check primary and canary deployment status
|
||||
* halt advancement if a rolling update is underway
|
||||
* halt advancement if pods are unhealthy
|
||||
* call confirm-rollout webhooks and check results
|
||||
* halt advancement if any hook returns a non HTTP 2xx result
|
||||
* call pre-rollout webhooks and check results
|
||||
* halt advancement if any hook returns a non HTTP 2xx result
|
||||
* increment the failed checks counter
|
||||
* increase canary traffic weight percentage from 0% to 2% \(step weight\)
|
||||
* call rollout webhooks and check results
|
||||
* check canary HTTP request success rate and latency
|
||||
* halt advancement if any metric is under the specified threshold
|
||||
* increment the failed checks counter
|
||||
* check if the number of failed checks reached the threshold
|
||||
* route all traffic to primary
|
||||
* scale to zero the canary deployment and mark it as failed
|
||||
* call post-rollout webhooks
|
||||
* post the analysis result to Slack
|
||||
* wait for the canary deployment to be updated and start over
|
||||
* increase canary traffic weight by 2% \(step weight\) till it reaches 50% \(max weight\)
|
||||
* halt advancement if any webhook call fails
|
||||
* halt advancement while canary request success rate is under the threshold
|
||||
* halt advancement while canary request duration P99 is over the threshold
|
||||
* halt advancement while any custom metric check fails
|
||||
* halt advancement if the primary or canary deployment becomes unhealthy
|
||||
* halt advancement while canary deployment is being scaled up/down by HPA
|
||||
* call confirm-promotion webhooks and check results
|
||||
* halt advancement if any hook returns a non HTTP 2xx result
|
||||
* promote canary to primary
|
||||
* copy ConfigMaps and Secrets from canary to primary
|
||||
* copy canary deployment spec template over primary
|
||||
* wait for primary rolling update to finish
|
||||
* halt advancement if pods are unhealthy
|
||||
* route all traffic to primary
|
||||
* scale to zero the canary deployment
|
||||
* mark rollout as finished
|
||||
* call post-rollout webhooks
|
||||
* send notification with the canary analysis result
|
||||
* wait for the canary deployment to be updated and start over
|
||||
|
||||
### Rollout Weights
|
||||
|
||||
By default Flagger uses linear weight values for the promotion, with the start value,
|
||||
the step and the maximum weight value in 0 to 100 range.
|
||||
|
||||
Example:
|
||||
|
||||
```yaml
|
||||
canary:
|
||||
analysis:
|
||||
promotion:
|
||||
maxWeight: 50
|
||||
stepWeight: 20
|
||||
```
|
||||
|
||||
This configuration performs analysis starting from 20, increasing by 20 until weight goes above 50.
|
||||
We would have steps (canary weight : primary weight):
|
||||
|
||||
* 20 (20 : 80)
|
||||
* 40 (40 : 60)
|
||||
* 60 (60 : 40)
|
||||
* promotion
|
||||
|
||||
In order to enable non-linear promotion a new parameter was introduced:
|
||||
|
||||
* `stepWeights` - determines the ordered array of weights, which shall be used during canary promotion.
|
||||
|
||||
Example:
|
||||
|
||||
```yaml
|
||||
canary:
|
||||
analysis:
|
||||
promotion:
|
||||
stepWeights: [1, 2, 10, 80]
|
||||
```
|
||||
|
||||
This configuration performs analysis starting from 1, going through `stepWeights` values till 80.
|
||||
We would have steps (canary weight : primary weight):
|
||||
|
||||
* 1 (1 : 99)
|
||||
* 2 (2 : 98)
|
||||
* 10 (10 : 90)
|
||||
* 80 (20 : 60)
|
||||
* promotion
|
||||
|
||||
## A/B Testing
|
||||
|
||||
For frontend applications that require session affinity you should use
|
||||
HTTP headers or cookies match conditions to ensure a set of users
|
||||
will stay on the same version for the whole duration of the canary analysis.
|
||||
|
||||

|
||||
|
||||
You can enable A/B testing by specifying the HTTP match conditions and the number of iterations.
|
||||
If Flagger finds a HTTP match condition, it will ignore the `maxWeight` and `stepWeight` settings.
|
||||
|
||||
Istio example:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# total number of iterations
|
||||
iterations: 10
|
||||
# max number of failed iterations before rollback
|
||||
threshold: 2
|
||||
# canary match condition
|
||||
match:
|
||||
- headers:
|
||||
x-canary:
|
||||
regex: ".*insider.*"
|
||||
- headers:
|
||||
cookie:
|
||||
regex: "^(.*?;)?(canary=always)(;.*)?$"
|
||||
```
|
||||
|
||||
The above configuration will run an analysis for ten minutes targeting the Safari users and those that have a test cookie.
|
||||
You can determine the minimum time that it takes to validate and promote a canary deployment using this formula:
|
||||
|
||||
```text
|
||||
interval * iterations
|
||||
```
|
||||
|
||||
And the time it takes for a canary to be rollback when the metrics or webhook checks are failing:
|
||||
|
||||
```text
|
||||
interval * threshold
|
||||
```
|
||||
|
||||
Istio example:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 10
|
||||
iterations: 2
|
||||
match:
|
||||
- headers:
|
||||
x-canary:
|
||||
exact: "insider"
|
||||
- headers:
|
||||
cookie:
|
||||
regex: "^(.*?;)?(canary=always)(;.*)?$"
|
||||
- sourceLabels:
|
||||
app.kubernetes.io/name: "scheduler"
|
||||
```
|
||||
|
||||
The header keys must be lowercase and use hyphen as the separator.
|
||||
Header values are case-sensitive and formatted as follows:
|
||||
|
||||
* `exact: "value"` for exact string match
|
||||
* `prefix: "value"` for prefix-based match
|
||||
* `suffix: "value"` for suffix-based match
|
||||
* `regex: "value"` for [RE2](https://github.com/google/re2/wiki/Syntax) style regex-based match
|
||||
|
||||
Note that the `sourceLabels` match conditions are applicable only when
|
||||
the `mesh` gateway is included in the `canary.service.gateways` list.
|
||||
|
||||
App Mesh example:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 10
|
||||
iterations: 2
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
regex: ".*Chrome.*"
|
||||
```
|
||||
|
||||
Note that App Mesh supports a single condition.
|
||||
|
||||
Contour example:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 10
|
||||
iterations: 2
|
||||
match:
|
||||
- headers:
|
||||
user-agent:
|
||||
prefix: "Chrome"
|
||||
```
|
||||
|
||||
Note that Contour does not support regex, you can use prefix, suffix or exact.
|
||||
|
||||
NGINX example:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 10
|
||||
iterations: 2
|
||||
match:
|
||||
- headers:
|
||||
x-canary:
|
||||
exact: "insider"
|
||||
- headers:
|
||||
cookie:
|
||||
exact: "canary"
|
||||
```
|
||||
|
||||
Note that the NGINX ingress controller supports only exact matching for
|
||||
cookies names where the value must be set to `always`.
|
||||
Starting with NGINX ingress v0.31, regex matching is supported for header values.
|
||||
|
||||
The above configurations will route users with the x-canary header
|
||||
or canary cookie to the canary instance during analysis:
|
||||
|
||||
```bash
|
||||
curl -H 'X-Canary: insider' http://app.example.com
|
||||
curl -b 'canary=always' http://app.example.com
|
||||
```
|
||||
|
||||
## Blue/Green Deployments
|
||||
|
||||
For applications that are not deployed on a service mesh,
|
||||
Flagger can orchestrate blue/green style deployments with Kubernetes L4 networking.
|
||||
When using Istio you have the option to mirror traffic between blue and green.
|
||||
|
||||

|
||||
|
||||
You can use the blue/green deployment strategy by replacing
|
||||
`stepWeight/maxWeight` with `iterations` in the `analysis` spec:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# total number of iterations
|
||||
iterations: 10
|
||||
# max number of failed iterations before rollback
|
||||
threshold: 2
|
||||
```
|
||||
|
||||
With the above configuration Flagger will run conformance and load tests on the canary pods for ten minutes.
|
||||
If the metrics analysis succeeds, live traffic will be switched from
|
||||
the old version to the new one when the canary is promoted.
|
||||
|
||||
The blue/green deployment strategy is supported for all service mesh providers.
|
||||
|
||||
Blue/Green rollout steps for service mesh:
|
||||
|
||||
* detect new revision (deployment spec, secrets or configmaps changes)
|
||||
* scale up the canary (green)
|
||||
* run conformance tests for the canary pods
|
||||
* run load tests and metric checks for the canary pods every minute
|
||||
* abort the canary release if the failure threshold is reached
|
||||
* route traffic to canary (This doesn't happen when using the kubernetes provider)
|
||||
* promote canary spec over primary (blue)
|
||||
* wait for primary rollout
|
||||
* route traffic to primary
|
||||
* scale down canary
|
||||
|
||||
After the analysis finishes, the traffic is routed to the canary (green) before
|
||||
triggering the primary (blue) rolling update,
|
||||
this ensures a smooth transition to the new version avoiding dropping
|
||||
in-flight requests during the Kubernetes deployment rollout.
|
||||
|
||||
## Blue/Green with Traffic Mirroring
|
||||
|
||||
Traffic Mirroring is a pre-stage in a Canary (progressive traffic shifting) or Blue/Green deployment strategy.
|
||||
Traffic mirroring will copy each incoming request, sending one request to the primary and one to the canary service.
|
||||
The response from the primary is sent back to the user. The response from the canary is discarded.
|
||||
Metrics are collected on both requests so that the deployment will only proceed if the canary metrics are healthy.
|
||||
|
||||
Mirroring should be used for requests that are **idempotent** or capable of being processed
|
||||
twice (once by the primary and once by the canary).
|
||||
Reads are idempotent. Before using mirroring on requests that may be writes,
|
||||
you should consider what will happen if a write is duplicated and handled by the primary and canary.
|
||||
|
||||
To use mirroring, set `spec.analysis.mirror` to `true`.
|
||||
|
||||
Istio example:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval: 1m
|
||||
# total number of iterations
|
||||
iterations: 10
|
||||
# max number of failed iterations before rollback
|
||||
threshold: 2
|
||||
# Traffic shadowing (compatible with Istio only)
|
||||
mirror: true
|
||||
# Weight of the traffic mirrored to your canary (defaults to 100%)
|
||||
mirrorWeight: 100
|
||||
```
|
||||
|
||||
Mirroring rollout steps for service mesh:
|
||||
|
||||
* detect new revision (deployment spec, secrets or configmaps changes)
|
||||
* scale from zero the canary deployment
|
||||
* wait for the HPA to set the canary minimum replicas
|
||||
* check canary pods health
|
||||
* run the acceptance tests
|
||||
* abort the canary release if tests fail
|
||||
* start the load tests
|
||||
* mirror 100% of the traffic from primary to canary
|
||||
* check request success rate and request duration every minute
|
||||
* abort the canary release if the failure threshold is reached
|
||||
* stop traffic mirroring after the number of iterations is reached
|
||||
* route live traffic to the canary pods
|
||||
* promote the canary \(update the primary secrets, configmaps and deployment spec\)
|
||||
* wait for the primary deployment rollout to finish
|
||||
* wait for the HPA to set the primary minimum replicas
|
||||
* check primary pods health
|
||||
* switch live traffic back to primary
|
||||
* scale to zero the canary
|
||||
* send notification with the canary analysis result
|
||||
|
||||
After the analysis finishes, the traffic is routed to the canary (green) before
|
||||
triggering the primary (blue) rolling update, this ensures a smooth transition
|
||||
to the new version avoiding dropping in-flight requests during the Kubernetes deployment rollout.
|
||||
|
|
@ -0,0 +1,373 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 1
|
||||
title: How it works
|
||||
---
|
||||
|
||||
[Flagger](https://github.com/fluxcd/flagger) can be configured to automate the release process for
|
||||
Kubernetes workloads with a custom resource named canary.
|
||||
|
||||
## Canary resource
|
||||
|
||||
The canary custom resource defines the release process of an application running on Kubernetes and is
|
||||
portable across clusters, service meshes and ingress providers.
|
||||
|
||||
For a deployment named _podinfo_, a canary release with progressive traffic shifting can be defined as:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: Canary
|
||||
metadata:
|
||||
name: podinfo
|
||||
spec:
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
service:
|
||||
port: 9898
|
||||
analysis:
|
||||
interval: 1m
|
||||
threshold: 10
|
||||
maxWeight: 50
|
||||
stepWeight: 5
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
thresholdRange:
|
||||
min: 99
|
||||
interval: 1m
|
||||
- name: request-duration
|
||||
thresholdRange:
|
||||
max: 500
|
||||
interval: 1m
|
||||
webhooks:
|
||||
- name: load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
|
||||
```
|
||||
|
||||
When you deploy a new version of an app, Flagger gradually shifts traffic to the canary, and at the same time,
|
||||
measures the requests success rate as well as the average response duration.
|
||||
You can extend the canary analysis with custom metrics,
|
||||
acceptance and load testing to harden the validation process of your app release process.
|
||||
|
||||
If you are running multiple service meshes or ingress controllers in the same cluster,
|
||||
you can override the global provider for a specific canary with `spec.provider`.
|
||||
|
||||
## Canary target
|
||||
|
||||
A canary resource can target a Kubernetes Deployment or DaemonSet.
|
||||
|
||||
Kubernetes Deployment example:
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
progressDeadlineSeconds: 60
|
||||
targetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: podinfo
|
||||
autoscalerRef:
|
||||
apiVersion: autoscaling/v2beta2
|
||||
kind: HorizontalPodAutoscaler
|
||||
name: podinfo
|
||||
```
|
||||
|
||||
Based on the above configuration, Flagger generates the following Kubernetes objects:
|
||||
|
||||
* `deployment/<targetRef.name>-primary`
|
||||
* `hpa/<autoscalerRef.name>-primary`
|
||||
|
||||
The primary deployment is considered the stable release of your app,
|
||||
by default all traffic is routed to this version and the target deployment is scaled to zero.
|
||||
Flagger will detect changes to the target deployment (including secrets and configmaps)
|
||||
and will perform a canary analysis before promoting the new version as primary.
|
||||
|
||||
**Note** that the target deployment must have a single label selector in the format `app: <DEPLOYMENT-NAME>`:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: podinfo
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: podinfo
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: podinfo
|
||||
```
|
||||
|
||||
In addition to `app`, Flagger supports `name` and `app.kubernetes.io/name` selectors.
|
||||
If you use a different convention you can specify your label with the `-selector-labels=my-app-label`
|
||||
command flag in the Flagger deployment manifest under containers args
|
||||
or by setting `--set selectorLabels=my-app-label` when installing Flagger with Helm.
|
||||
|
||||
If the target deployment uses secrets and/or configmaps,
|
||||
Flagger will create a copy of each object using the `-primary` suffix
|
||||
and will reference these objects in the primary deployment.
|
||||
If you annotate your ConfigMap or Secret with `flagger.app/config-tracking: disabled`,
|
||||
Flagger will use the same object for the primary deployment instead of making a primary copy.
|
||||
You can disable the secrets/configmaps tracking globally with the `-enable-config-tracking=false`
|
||||
command flag in the Flagger deployment manifest under containers args
|
||||
or by setting `--set configTracking.enabled=false` when installing Flagger with Helm,
|
||||
but disabling config-tracking using the per Secret/ConfigMap annotation may fit your use-case better.
|
||||
|
||||
The autoscaler reference is optional, when specified,
|
||||
Flagger will pause the traffic increase while the target and primary deployments are scaled up or down.
|
||||
HPA can help reduce the resource usage during the canary analysis.
|
||||
When the autoscaler reference is specified, any changes made to the autoscaler are only made active
|
||||
in the primary autoscaler when a rollout for the deployment starts and completes successfully.
|
||||
Optionally, you can create two HPAs, one for canary and one for the primary to update the HPA without
|
||||
doing a new rollout. As the canary deployment will be scaled to 0, the HPA on the canary will be inactive.
|
||||
|
||||
The progress deadline represents the maximum time in seconds for the canary deployment to
|
||||
make progress before it is rolled back, defaults to ten minutes.
|
||||
|
||||
## Canary service
|
||||
|
||||
A canary resource dictates how the target workload is exposed inside the cluster.
|
||||
The canary target should expose a TCP port that will be used by Flagger to create the ClusterIP Services.
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
service:
|
||||
name: podinfo
|
||||
port: 9898
|
||||
portName: http
|
||||
targetPort: 9898
|
||||
portDiscovery: true
|
||||
```
|
||||
|
||||
The container port from the target workload should match the `service.port` or `service.targetPort`.
|
||||
The `service.name` is optional, defaults to `spec.targetRef.name`.
|
||||
The `service.targetPort` can be a container port number or name.
|
||||
The `service.portName` is optional (defaults to `http`), if your workload uses gRPC then set the port name to `grpc`.
|
||||
|
||||
If port discovery is enabled, Flagger scans the target workload and extracts the containers ports
|
||||
excluding the port specified in the canary service and service mesh sidecar ports.
|
||||
These ports will be used when generating the ClusterIP services.
|
||||
|
||||
Based on the canary spec service, Flagger creates the following Kubernetes ClusterIP service:
|
||||
|
||||
* `<service.name>.<namespace>.svc.cluster.local`
|
||||
|
||||
selector `app=<name>-primary`
|
||||
|
||||
* `<service.name>-primary.<namespace>.svc.cluster.local`
|
||||
|
||||
selector `app=<name>-primary`
|
||||
|
||||
* `<service.name>-canary.<namespace>.svc.cluster.local`
|
||||
|
||||
selector `app=<name>`
|
||||
|
||||
This ensures that traffic to `podinfo.test:9898` will be routed to the latest stable release of your app.
|
||||
The `podinfo-canary.test:9898` address is available only during the canary analysis
|
||||
and can be used for conformance testing or load testing.
|
||||
|
||||
You can configure Flagger to set annotations and labels for the generated services with:
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
service:
|
||||
port: 9898
|
||||
apex:
|
||||
annotations:
|
||||
test: "test"
|
||||
labels:
|
||||
test: "test"
|
||||
canary:
|
||||
annotations:
|
||||
test: "test"
|
||||
labels:
|
||||
test: "test"
|
||||
primary:
|
||||
annotations:
|
||||
test: "test"
|
||||
labels:
|
||||
test: "test"
|
||||
```
|
||||
|
||||
Note that the `apex` annotations are added to both the generated Kubernetes Service and the
|
||||
generated service mesh/ingress object. This allows using external-dns with Istio `VirtualServices`
|
||||
and `TraefikServices`. Beware of configuration conflicts [here](../faq.md#ExternalDNS).
|
||||
|
||||
Besides port mapping and metadata, the service specification can
|
||||
contain URI match and rewrite rules, timeout and retry polices:
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
service:
|
||||
port: 9898
|
||||
match:
|
||||
- uri:
|
||||
prefix: /
|
||||
rewrite:
|
||||
uri: /
|
||||
retries:
|
||||
attempts: 3
|
||||
perTryTimeout: 1s
|
||||
timeout: 5s
|
||||
```
|
||||
|
||||
When using **Istio** as the mesh provider, you can also specify HTTP header operations,
|
||||
CORS and traffic policies, Istio gateways and hosts.
|
||||
The Istio routing configuration can be found [here](../faq.md#istio-routing).
|
||||
|
||||
## Canary status
|
||||
|
||||
You can use kubectl to get the current status of canary deployments cluster wide:
|
||||
|
||||
```bash
|
||||
kubectl get canaries --all-namespaces
|
||||
|
||||
NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME
|
||||
test podinfo Progressing 15 2019-06-30T14:05:07Z
|
||||
prod frontend Succeeded 0 2019-06-30T16:15:07Z
|
||||
prod backend Failed 0 2019-06-30T17:05:07Z
|
||||
```
|
||||
|
||||
The status condition reflects the last known state of the canary analysis:
|
||||
|
||||
```bash
|
||||
kubectl -n test get canary/podinfo -oyaml | awk '/status/,0'
|
||||
```
|
||||
|
||||
A successful rollout status:
|
||||
|
||||
```yaml
|
||||
status:
|
||||
canaryWeight: 0
|
||||
failedChecks: 0
|
||||
iterations: 0
|
||||
lastAppliedSpec: "14788816656920327485"
|
||||
lastPromotedSpec: "14788816656920327485"
|
||||
conditions:
|
||||
- lastTransitionTime: "2019-07-10T08:23:18Z"
|
||||
lastUpdateTime: "2019-07-10T08:23:18Z"
|
||||
message: Canary analysis completed successfully, promotion finished.
|
||||
reason: Succeeded
|
||||
status: "True"
|
||||
type: Promoted
|
||||
```
|
||||
|
||||
The `Promoted` status condition can have one of the following reasons:
|
||||
Initialized, Waiting, Progressing, WaitingPromotion, Promoting, Finalising, Succeeded or Failed.
|
||||
A failed canary will have the promoted status set to `false`,
|
||||
the reason to `failed` and the last applied spec will be different to the last promoted one.
|
||||
|
||||
Wait for a successful rollout:
|
||||
|
||||
```bash
|
||||
kubectl wait canary/podinfo --for=condition=promoted
|
||||
```
|
||||
|
||||
CI example:
|
||||
|
||||
```bash
|
||||
# update the container image
|
||||
kubectl set image deployment/podinfo podinfod=stefanprodan/podinfo:3.0.1
|
||||
|
||||
# wait for Flagger to detect the change
|
||||
ok=false
|
||||
until ${ok}; do
|
||||
kubectl get canary/podinfo | grep 'Progressing' && ok=true || ok=false
|
||||
sleep 5
|
||||
done
|
||||
|
||||
# wait for the canary analysis to finish
|
||||
kubectl wait canary/podinfo --for=condition=promoted --timeout=5m
|
||||
|
||||
# check if the deployment was successful
|
||||
kubectl get canary/podinfo | grep Succeeded
|
||||
```
|
||||
|
||||
## Canary finalizers
|
||||
|
||||
The default behavior of Flagger on canary deletion is to leave resources that aren't owned
|
||||
by the controller in their current state.
|
||||
This simplifies the deletion action and avoids possible deadlocks during resource finalization.
|
||||
In the event the canary was introduced with existing resource(s) (i.e. service, virtual service, etc.),
|
||||
they would be mutated during the initialization phase and no longer reflect their initial state.
|
||||
If the desired functionality upon deletion is to revert the resources to their initial state,
|
||||
the `revertOnDeletion` attribute can be enabled.
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
revertOnDeletion: true
|
||||
```
|
||||
|
||||
When a deletion action is submitted to the cluster, Flagger will attempt to revert the following resources:
|
||||
|
||||
* [Canary target](how-it-works.md#canary-target) replicas will be updated to the primary replica count
|
||||
* [Canary service](how-it-works.md#canary-service) selector will be reverted
|
||||
* Mesh/Ingress traffic routed to the target
|
||||
|
||||
The recommended approach to disable canary analysis would be utilization of the `skipAnalysis` attribute,
|
||||
which limits the need for resource reconciliation.
|
||||
Utilizing the `revertOnDeletion` attribute should be enabled when
|
||||
you no longer plan to rely on Flagger for deployment management.
|
||||
|
||||
**Note** When this feature is enabled expect a delay in the delete action due to the reconciliation.
|
||||
|
||||
## Canary analysis
|
||||
|
||||
The canary analysis defines:
|
||||
|
||||
* the type of [deployment strategy](deployment-strategies.md)
|
||||
* the [metrics](metrics.md) used to validate the canary version
|
||||
* the [webhooks](webhooks.md) used for conformance testing, load testing and manual gating
|
||||
* the [alerting settings](alerting.md)
|
||||
|
||||
Spec:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
# schedule interval (default 60s)
|
||||
interval:
|
||||
# max number of failed metric checks before rollback
|
||||
threshold:
|
||||
# max traffic percentage routed to canary
|
||||
# percentage (0-100)
|
||||
maxWeight:
|
||||
# canary increment step
|
||||
# percentage (0-100)
|
||||
stepWeight:
|
||||
# promotion increment step
|
||||
# percentage (0-100)
|
||||
stepWeightPromotion:
|
||||
# total number of iterations
|
||||
# used for A/B Testing and Blue/Green
|
||||
iterations:
|
||||
# threshold of primary pods that need to be available to consider it ready
|
||||
# before starting rollout. this is optional and the default is 100
|
||||
# percentage (0-100)
|
||||
primaryReadyThreshold: 100
|
||||
# threshold of canary pods that need to be available to consider it ready
|
||||
# before starting rollout. this is optional and the default is 100
|
||||
# percentage (0-100)
|
||||
canaryReadyThreshold: 100
|
||||
# canary match conditions
|
||||
# used for A/B Testing
|
||||
match:
|
||||
- # HTTP header
|
||||
# key performance indicators
|
||||
metrics:
|
||||
- # metric check
|
||||
# alerting
|
||||
alerts:
|
||||
- # alert provider
|
||||
# external checks
|
||||
webhooks:
|
||||
- # hook
|
||||
```
|
||||
|
||||
The canary analysis runs periodically until it reaches the maximum traffic weight or the number of iterations.
|
||||
On each run, Flagger calls the webhooks, checks the metrics and if the failed checks threshold is reached,
|
||||
stops the analysis and rolls back the canary.
|
||||
If alerting is configured, Flagger will post the analysis result using the alert providers.
|
||||
|
|
@ -0,0 +1,617 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 3
|
||||
title: Metrics Analysis
|
||||
---
|
||||
|
||||
As part of the analysis process, Flagger can validate service level objectives
|
||||
(SLOs) like availability, error rate percentage, average response time and any other objective
|
||||
based on app specific metrics.
|
||||
If a drop in performance is noticed during the SLOs analysis,
|
||||
the release will be automatically rolled back with minimum impact to end-users.
|
||||
|
||||
## Builtin metrics
|
||||
|
||||
Flagger comes with two builtin metric checks: HTTP request success rate and duration.
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: request-success-rate
|
||||
interval: 1m
|
||||
# minimum req success rate (non 5xx responses)
|
||||
# percentage (0-100)
|
||||
thresholdRange:
|
||||
min: 99
|
||||
- name: request-duration
|
||||
interval: 1m
|
||||
# maximum req duration P99
|
||||
# milliseconds
|
||||
thresholdRange:
|
||||
max: 500
|
||||
```
|
||||
|
||||
For each metric you can specify a range of accepted values with `thresholdRange` and
|
||||
the window size or the time series with `interval`.
|
||||
The builtin checks are available for every service mesh / ingress controlle
|
||||
and are implemented with [Prometheus queries](../faq.md#metrics).
|
||||
|
||||
## Custom metrics
|
||||
|
||||
The canary analysis can be extended with custom metric checks.
|
||||
Using a `MetricTemplate` custom resource,
|
||||
you configure Flagger to connect to a metric provider and run a query that returns a `float64` value.
|
||||
The query result is used to validate the canary based on the specified threshold range.
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: my-metric
|
||||
spec:
|
||||
provider:
|
||||
type: # can be prometheus, datadog, etc
|
||||
address: # API URL
|
||||
insecureSkipVerify: # if set to true, disables the TLS cert validation
|
||||
secretRef:
|
||||
name: # name of the secret containing the API credentials
|
||||
query: # metric query
|
||||
```
|
||||
|
||||
The following variables are available in query templates:
|
||||
|
||||
* `name` (canary.metadata.name)
|
||||
* `namespace` (canary.metadata.namespace)
|
||||
* `target` (canary.spec.targetRef.name)
|
||||
* `service` (canary.spec.service.name)
|
||||
* `ingress` (canary.spec.ingresRef.name)
|
||||
* `interval` (canary.spec.analysis.metrics[].interval)
|
||||
|
||||
A canary analysis metric can reference a template with `templateRef`:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "my metric"
|
||||
templateRef:
|
||||
name: my-metric
|
||||
# namespace is optional
|
||||
# when not specified, the canary namespace will be used
|
||||
namespace: flagger
|
||||
# accepted values
|
||||
thresholdRange:
|
||||
min: 10
|
||||
max: 1000
|
||||
# metric query time window
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
## Prometheus
|
||||
|
||||
You can create custom metric checks targeting a Prometheus server by
|
||||
setting the provider type to `prometheus` and writing the query in PromQL.
|
||||
|
||||
Prometheus template example:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: not-found-percentage
|
||||
namespace: istio-system
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://prometheus.istio-system:9090
|
||||
query: |
|
||||
100 - sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="destination",
|
||||
destination_workload_namespace="{{ namespace }}",
|
||||
destination_workload="{{ target }}",
|
||||
response_code!="404"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
istio_requests_total{
|
||||
reporter="destination",
|
||||
destination_workload_namespace="{{ namespace }}",
|
||||
destination_workload="{{ target }}"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
) * 100
|
||||
```
|
||||
|
||||
Reference the template in the canary analysis:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "404s percentage"
|
||||
templateRef:
|
||||
name: not-found-percentage
|
||||
namespace: istio-system
|
||||
thresholdRange:
|
||||
max: 5
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
The above configuration validates the canary by checking if the HTTP 404 req/sec percentage
|
||||
is below 5 percent of the total traffic. If the 404s rate reaches the 5% threshold, then the canary fails.
|
||||
|
||||
Prometheus gRPC error rate example:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: grpc-error-rate-percentage
|
||||
namespace: flagger
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://flagger-prometheus.flagger-system:9090
|
||||
query: |
|
||||
100 - sum(
|
||||
rate(
|
||||
grpc_server_handled_total{
|
||||
grpc_code!="OK",
|
||||
kubernetes_namespace="{{ namespace }}",
|
||||
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
grpc_server_started_total{
|
||||
kubernetes_namespace="{{ namespace }}",
|
||||
kubernetes_pod_name=~"{{ target }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"
|
||||
}[{{ interval }}]
|
||||
)
|
||||
) * 100
|
||||
```
|
||||
|
||||
The above template is for gRPC services instrumented with
|
||||
[go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus).
|
||||
|
||||
## Prometheus authentication
|
||||
|
||||
If your Prometheus API requires basic authentication, you can create a secret in the same namespace
|
||||
as the `MetricTemplate` with the basic-auth credentials:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: prom-basic-auth
|
||||
namespace: flagger
|
||||
data:
|
||||
username: your-user
|
||||
password: your-password
|
||||
```
|
||||
|
||||
Then reference the secret in the `MetricTemplate`:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: my-metric
|
||||
namespace: flagger
|
||||
spec:
|
||||
provider:
|
||||
type: prometheus
|
||||
address: http://prometheus.monitoring:9090
|
||||
secretRef:
|
||||
name: prom-basic-auth
|
||||
```
|
||||
|
||||
## Datadog
|
||||
|
||||
You can create custom metric checks using the Datadog provider.
|
||||
|
||||
Create a secret with your Datadog API credentials:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: datadog
|
||||
namespace: istio-system
|
||||
data:
|
||||
datadog_api_key: your-datadog-api-key
|
||||
datadog_application_key: your-datadog-application-key
|
||||
```
|
||||
|
||||
Datadog template example:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: not-found-percentage
|
||||
namespace: istio-system
|
||||
spec:
|
||||
provider:
|
||||
type: datadog
|
||||
address: https://api.datadoghq.com
|
||||
secretRef:
|
||||
name: datadog
|
||||
query: |
|
||||
100 - (
|
||||
sum:istio.mesh.request.count{
|
||||
reporter:destination,
|
||||
destination_workload_namespace:{{ namespace }},
|
||||
destination_workload:{{ target }},
|
||||
!response_code:404
|
||||
}.as_count()
|
||||
/
|
||||
sum:istio.mesh.request.count{
|
||||
reporter:destination,
|
||||
destination_workload_namespace:{{ namespace }},
|
||||
destination_workload:{{ target }}
|
||||
}.as_count()
|
||||
) * 100
|
||||
```
|
||||
|
||||
Reference the template in the canary analysis:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "404s percentage"
|
||||
templateRef:
|
||||
name: not-found-percentage
|
||||
namespace: istio-system
|
||||
thresholdRange:
|
||||
max: 5
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
## Amazon CloudWatch
|
||||
|
||||
You can create custom metric checks using the CloudWatch metrics provider.
|
||||
|
||||
CloudWatch template example:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1alpha1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: cloudwatch-error-rate
|
||||
spec:
|
||||
provider:
|
||||
type: cloudwatch
|
||||
region: ap-northeast-1 # specify the region of your metrics
|
||||
query: |
|
||||
[
|
||||
{
|
||||
"Id": "e1",
|
||||
"Expression": "m1 / m2",
|
||||
"Label": "ErrorRate"
|
||||
},
|
||||
{
|
||||
"Id": "m1",
|
||||
"MetricStat": {
|
||||
"Metric": {
|
||||
"Namespace": "MyKubernetesCluster",
|
||||
"MetricName": "ErrorCount",
|
||||
"Dimensions": [
|
||||
{
|
||||
"Name": "appName",
|
||||
"Value": "{{ name }}.{{ namespace }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
"Period": 60,
|
||||
"Stat": "Sum",
|
||||
"Unit": "Count"
|
||||
},
|
||||
"ReturnData": false
|
||||
},
|
||||
{
|
||||
"Id": "m2",
|
||||
"MetricStat": {
|
||||
"Metric": {
|
||||
"Namespace": "MyKubernetesCluster",
|
||||
"MetricName": "RequestCount",
|
||||
"Dimensions": [
|
||||
{
|
||||
"Name": "appName",
|
||||
"Value": "{{ name }}.{{ namespace }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
"Period": 60,
|
||||
"Stat": "Sum",
|
||||
"Unit": "Count"
|
||||
},
|
||||
"ReturnData": false
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
The query format documentation can be found
|
||||
[here](https://aws.amazon.com/premiumsupport/knowledge-center/cloudwatch-getmetricdata-api/).
|
||||
|
||||
Reference the template in the canary analysis:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "app error rate"
|
||||
templateRef:
|
||||
name: cloudwatch-error-rate
|
||||
thresholdRange:
|
||||
max: 0.1
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
**Note** that Flagger need AWS IAM permission to perform `cloudwatch:GetMetricData` to use this provider.
|
||||
|
||||
## New Relic
|
||||
|
||||
You can create custom metric checks using the New Relic provider.
|
||||
|
||||
Create a secret with your New Relic Insights credentials:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: newrelic
|
||||
namespace: istio-system
|
||||
data:
|
||||
newrelic_account_id: your-account-id
|
||||
newrelic_query_key: your-insights-query-key
|
||||
```
|
||||
|
||||
New Relic template example:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: newrelic-error-rate
|
||||
namespace: ingress-nginx
|
||||
spec:
|
||||
provider:
|
||||
type: newrelic
|
||||
secretRef:
|
||||
name: newrelic
|
||||
query: |
|
||||
SELECT
|
||||
filter(sum(nginx_ingress_controller_requests), WHERE status >= '500') /
|
||||
sum(nginx_ingress_controller_requests) * 100
|
||||
FROM Metric
|
||||
WHERE metricName = 'nginx_ingress_controller_requests'
|
||||
AND ingress = '{{ ingress }}' AND namespace = '{{ namespace }}'
|
||||
```
|
||||
|
||||
Reference the template in the canary analysis:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "error rate"
|
||||
templateRef:
|
||||
name: newrelic-error-rate
|
||||
namespace: ingress-nginx
|
||||
thresholdRange:
|
||||
max: 5
|
||||
interval: 1m
|
||||
```
|
||||
|
||||
## Graphite
|
||||
|
||||
You can create custom metric checks using the Graphite provider.
|
||||
|
||||
Graphite template example:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: graphite-request-success-rate
|
||||
spec:
|
||||
provider:
|
||||
type: graphite
|
||||
address: http://graphite.monitoring
|
||||
query: |
|
||||
target=summarize(
|
||||
asPercent(
|
||||
sumSeries(
|
||||
stats.timers.httpServerRequests.app.{{target}}.exception.*.method.*.outcome.{CLIENT_ERROR,INFORMATIONAL,REDIRECTION,SUCCESS}.status.*.uri.*.count
|
||||
),
|
||||
sumSeries(
|
||||
stats.timers.httpServerRequests.app.{{target}}.exception.*.method.*.outcome.*.status.*.uri.*.count
|
||||
)
|
||||
),
|
||||
{{interval}},
|
||||
'avg'
|
||||
)
|
||||
```
|
||||
|
||||
Reference the template in the canary analysis:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "success rate"
|
||||
templateRef:
|
||||
name: graphite-request-success-rate
|
||||
thresholdRange:
|
||||
min: 90
|
||||
interval: 1min
|
||||
```
|
||||
|
||||
## Graphite authentication
|
||||
|
||||
If your Graphite API requires basic authentication, you can create a secret in the same namespace
|
||||
as the `MetricTemplate` with the basic-auth credentials:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: graphite-basic-auth
|
||||
namespace: flagger
|
||||
data:
|
||||
username: your-user
|
||||
password: your-password
|
||||
```
|
||||
|
||||
Then, reference the secret in the `MetricTemplate`:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: my-metric
|
||||
namespace: flagger
|
||||
spec:
|
||||
provider:
|
||||
type: graphite
|
||||
address: http://graphite.monitoring
|
||||
secretRef:
|
||||
name: graphite-basic-auth
|
||||
```
|
||||
|
||||
## Google Cloud Monitoring (Stackdriver)
|
||||
|
||||
Enable Workload Identity on your cluster, create a service account key that has read access to the
|
||||
Cloud Monitoring API and then create an IAM policy binding between the GCP service account and the Flagger
|
||||
service account on Kubernetes. You can take a look at this [guide](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity)
|
||||
|
||||
Annotate the flagger service account
|
||||
```shell script
|
||||
kubectl annotate serviceaccount flagger \
|
||||
--namespace <namespace> \
|
||||
iam.gke.io/gcp-service-account=<gcp-serviceaccount-name>@<project-id>.iam.gserviceaccount.com
|
||||
```
|
||||
|
||||
Alternatively, you can download the json keys and add it to your secret with the key `serviceAccountKey` (This method is not recommended).
|
||||
|
||||
Create a secret that contains your project-id (and, if workload identity is not enabled on your cluster,
|
||||
your [service account json](https://cloud.google.com/docs/authentication/production#create_service_account)).
|
||||
|
||||
```
|
||||
kubectl create secret generic gcloud-sa --from-literal=project=<project-id>
|
||||
```
|
||||
|
||||
Then reference the secret in the metric template.
|
||||
Note: The particular MQL query used here works if [Istio is installed on GKE](https://cloud.google.com/istio/docs/istio-on-gke/installing).
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: bytes-sent
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
type: stackdriver
|
||||
secretRef:
|
||||
name: gcloud-sa
|
||||
query: |
|
||||
fetch k8s_container
|
||||
| metric 'istio.io/service/server/response_latencies'
|
||||
| filter
|
||||
(metric.destination_service_name == '{{ service }}-canary'
|
||||
&& metric.destination_service_namespace == '{{ namespace }}')
|
||||
| align delta(1m)
|
||||
| every 1m
|
||||
| group_by [],
|
||||
[value_response_latencies_percentile:
|
||||
percentile(value.response_latencies, 99)]
|
||||
```
|
||||
|
||||
The reference for the query language can be found [here](https://cloud.google.com/monitoring/mql/reference)
|
||||
|
||||
## InfluxDB
|
||||
|
||||
The InfluxDB provider uses the [flux](https://docs.influxdata.com/influxdb/v2.0/query-data/get-started/) query language.
|
||||
|
||||
Create a secret that contains your authentication token that can be found in the InfluxDB UI.
|
||||
|
||||
```
|
||||
kubectl create secret generic influx-token --from-literal=token=<token>
|
||||
```
|
||||
|
||||
Then reference the secret in the metric template.
|
||||
|
||||
Note: The particular MQL query used here works if [Istio is installed on GKE](https://cloud.google.com/istio/docs/istio-on-gke/installing).
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: not-found
|
||||
namespace: test
|
||||
spec:
|
||||
provider:
|
||||
type: influxdb
|
||||
secretRef:
|
||||
name: influx-token
|
||||
query: |
|
||||
from(bucket: "default")
|
||||
|> range(start: -2h)
|
||||
|> filter(fn: (r) => r["_measurement"] == "istio_requests_total")
|
||||
|> filter(fn: (r) => r[" destination_workload_namespace"] == "{{ namespace }}")
|
||||
|> filter(fn: (r) => r["destination_workload"] == "{{ target }}")
|
||||
|> filter(fn: (r) => r["response_code"] == "500")
|
||||
|> count()
|
||||
|> yield(name: "count")
|
||||
```
|
||||
|
||||
## Dynatrace
|
||||
|
||||
You can create custom metric checks using the Dynatrace provider.
|
||||
|
||||
Create a secret with your Dynatrace token:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: dynatrace
|
||||
namespace: istio-system
|
||||
data:
|
||||
dynatrace_token: ZHQwYz...
|
||||
```
|
||||
|
||||
Dynatrace metric template example:
|
||||
|
||||
```yaml
|
||||
apiVersion: flagger.app/v1beta1
|
||||
kind: MetricTemplate
|
||||
metadata:
|
||||
name: response-time-95pct
|
||||
namespace: istio-system
|
||||
spec:
|
||||
provider:
|
||||
type: dynatrace
|
||||
address: https://xxxxxxxx.live.dynatrace.com
|
||||
secretRef:
|
||||
name: dynatrace
|
||||
query: |
|
||||
builtin:service.response.time:filter(eq(dt.entity.service,SERVICE-ABCDEFG0123456789)):percentile(95)
|
||||
```
|
||||
|
||||
Reference the template in the canary analysis:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
metrics:
|
||||
- name: "response-time-95pct"
|
||||
templateRef:
|
||||
name: response-time-95pct
|
||||
namespace: istio-system
|
||||
thresholdRange:
|
||||
max: 1000
|
||||
interval: 1m
|
||||
```
|
|
@ -0,0 +1,128 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 5
|
||||
title: Monitoring
|
||||
---
|
||||
|
||||
## Grafana
|
||||
|
||||
Flagger comes with a Grafana dashboard made for canary analysis. Install Grafana with Helm:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger-grafana flagger/grafana \
|
||||
--set url=http://prometheus:9090
|
||||
```
|
||||
|
||||
The dashboard shows the RED and USE metrics for the primary and canary workloads:
|
||||
|
||||

|
||||
|
||||
## Logging
|
||||
|
||||
The canary errors and latency spikes have been recorded as Kubernetes events and logged by Flagger in json format:
|
||||
|
||||
```text
|
||||
kubectl -n istio-system logs deployment/flagger --tail=100 | jq .msg
|
||||
|
||||
Starting canary deployment for podinfo.test
|
||||
Advance podinfo.test canary weight 5
|
||||
Advance podinfo.test canary weight 10
|
||||
Advance podinfo.test canary weight 15
|
||||
Advance podinfo.test canary weight 20
|
||||
Advance podinfo.test canary weight 25
|
||||
Advance podinfo.test canary weight 30
|
||||
Advance podinfo.test canary weight 35
|
||||
Halt podinfo.test advancement success rate 98.69% < 99%
|
||||
Advance podinfo.test canary weight 40
|
||||
Halt podinfo.test advancement request duration 1.515s > 500ms
|
||||
Advance podinfo.test canary weight 45
|
||||
Advance podinfo.test canary weight 50
|
||||
Copying podinfo.test template spec to podinfo-primary.test
|
||||
Halt podinfo-primary.test advancement waiting for rollout to finish: 1 old replicas are pending termination
|
||||
Scaling down podinfo.test
|
||||
Promotion completed! podinfo.test
|
||||
```
|
||||
|
||||
## Event Webhook
|
||||
|
||||
Flagger can be configured to send event payloads to a specified webhook:
|
||||
|
||||
```bash
|
||||
helm upgrade -i flagger flagger/flagger \
|
||||
--set eventWebhook=https://example.com/flagger-canary-event-webhook
|
||||
```
|
||||
|
||||
The environment variable _EVENT\_WEBHOOK\_URL_ can be used for activating the event-webhook, too.
|
||||
This is handy for using a secret to store a sensible value that could contain api keys for example.
|
||||
|
||||
When configured, every action that Flagger takes during a canary deployment
|
||||
will be sent as JSON via an HTTP POST request. The JSON payload has the following schema:
|
||||
|
||||
```javascript
|
||||
{
|
||||
"name": "string (canary name)",
|
||||
"namespace": "string (canary namespace)",
|
||||
"phase": "string (canary phase)",
|
||||
"metadata": {
|
||||
"eventMessage": "string (canary event message)",
|
||||
"eventType": "string (canary event type)",
|
||||
"timestamp": "string (unix timestamp ms)"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Example:
|
||||
|
||||
```javascript
|
||||
{
|
||||
"name": "podinfo",
|
||||
"namespace": "default",
|
||||
"phase": "Progressing",
|
||||
"metadata": {
|
||||
"eventMessage": "New revision detected! Scaling up podinfo.default",
|
||||
"eventType": "Normal",
|
||||
"timestamp": "1578607635167"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The event webhook can be overwritten at canary level with:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "send to Slack"
|
||||
type: event
|
||||
url: http://event-recevier.notifications/slack
|
||||
```
|
||||
|
||||
## Metrics
|
||||
|
||||
Flagger exposes Prometheus metrics that can be used to determine
|
||||
the canary analysis status and the destination weight values:
|
||||
|
||||
```bash
|
||||
# Flagger version and mesh provider gauge
|
||||
flagger_info{version="0.10.0", mesh_provider="istio"} 1
|
||||
|
||||
# Canaries total gauge
|
||||
flagger_canary_total{namespace="test"} 1
|
||||
|
||||
# Canary promotion last known status gauge
|
||||
# 0 - running, 1 - successful, 2 - failed
|
||||
flagger_canary_status{name="podinfo" namespace="test"} 1
|
||||
|
||||
# Canary traffic weight gauge
|
||||
flagger_canary_weight{workload="podinfo-primary" namespace="test"} 95
|
||||
flagger_canary_weight{workload="podinfo" namespace="test"} 5
|
||||
|
||||
# Seconds spent performing canary analysis histogram
|
||||
flagger_canary_duration_seconds_bucket{name="podinfo",namespace="test",le="10"} 6
|
||||
flagger_canary_duration_seconds_bucket{name="podinfo",namespace="test",le="+Inf"} 6
|
||||
flagger_canary_duration_seconds_sum{name="podinfo",namespace="test"} 17.3561329
|
||||
flagger_canary_duration_seconds_count{name="podinfo",namespace="test"} 6
|
||||
|
||||
# Last canary metric analysis result per different metrics
|
||||
flagger_canary_metric_analysis{metric="podinfo-http-successful-rate",name="podinfo",namespace="test"} 1
|
||||
flagger_canary_metric_analysis{metric="podinfo-custom-metric",name="podinfo",namespace="test"} 0.918223108974359
|
||||
```
|
|
@ -0,0 +1,560 @@
|
|||
---
|
||||
type: docs
|
||||
weight: 3
|
||||
title: Webhooks
|
||||
---
|
||||
|
||||
The canary analysis can be extended with webhooks.
|
||||
Flagger will call each webhook URL and determine from the response status code
|
||||
(HTTP 2xx) if the canary is failing or not.
|
||||
|
||||
There are several types of hooks:
|
||||
|
||||
* **confirm-rollout** hooks are executed before scaling up the canary deployment and can be used for manual approval.
|
||||
The rollout is paused until the hook returns a successful HTTP status code.
|
||||
|
||||
* **pre-rollout** hooks are executed before routing traffic to canary.
|
||||
The canary advancement is paused if a pre-rollout hook fails and if the number of failures reach the
|
||||
threshold the canary will be rollback.
|
||||
|
||||
* **rollout** hooks are executed during the analysis on each iteration before the metric checks.
|
||||
If a rollout hook call fails the canary advancement is paused and eventfully rolled back.
|
||||
|
||||
* **confirm-traffic-increase** hooks are executed right before the weight on the canary is increased. The canary
|
||||
advancement is paused until this hook returns HTTP 200.
|
||||
|
||||
* **confirm-promotion** hooks are executed before the promotion step.
|
||||
The canary promotion is paused until the hooks return HTTP 200.
|
||||
While the promotion is paused, Flagger will continue to run the metrics checks and rollout hooks.
|
||||
|
||||
* **post-rollout** hooks are executed after the canary has been promoted or rolled back.
|
||||
If a post rollout hook fails the error is logged.
|
||||
|
||||
* **rollback** hooks are executed while a canary deployment is in either Progressing or Waiting status.
|
||||
This provides the ability to rollback during analysis or while waiting for a confirmation. If a rollback hook
|
||||
returns a successful HTTP status code, Flagger will stop the analysis and mark the canary release as failed.
|
||||
|
||||
* **event** hooks are executed every time Flagger emits a Kubernetes event. When configured,
|
||||
every action that Flagger takes during a canary deployment will be sent as JSON via an HTTP POST request.
|
||||
|
||||
Spec:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "start gate"
|
||||
type: confirm-rollout
|
||||
url: http://flagger-loadtester.test/gate/approve
|
||||
- name: "helm test"
|
||||
type: pre-rollout
|
||||
url: http://flagger-helmtester.flagger/
|
||||
timeout: 3m
|
||||
metadata:
|
||||
type: "helmv3"
|
||||
cmd: "test podinfo -n test"
|
||||
- name: "load test"
|
||||
type: rollout
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 15s
|
||||
metadata:
|
||||
cmd: "hey -z 1m -q 5 -c 2 http://podinfo-canary.test:9898/"
|
||||
- name: "traffic increase gate"
|
||||
type: confirm-traffic-increase
|
||||
url: http://flagger-loadtester.test/gate/approve
|
||||
- name: "promotion gate"
|
||||
type: confirm-promotion
|
||||
url: http://flagger-loadtester.test/gate/approve
|
||||
- name: "notify"
|
||||
type: post-rollout
|
||||
url: http://telegram.bot:8080/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
some: "message"
|
||||
- name: "rollback gate"
|
||||
type: rollback
|
||||
url: http://flagger-loadtester.test/rollback/check
|
||||
- name: "send to Slack"
|
||||
type: event
|
||||
url: http://event-recevier.notifications/slack
|
||||
metadata:
|
||||
environment: "test"
|
||||
cluster: "flagger-test"
|
||||
```
|
||||
|
||||
> **Note** that the sum of all rollout webhooks timeouts should be lower than the analysis interval.
|
||||
|
||||
Webhook payload (HTTP POST):
|
||||
|
||||
```javascript
|
||||
{
|
||||
"name": "podinfo",
|
||||
"namespace": "test",
|
||||
"phase": "Progressing",
|
||||
"metadata": {
|
||||
"test": "all",
|
||||
"token": "16688eb5e9f289f1991c"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Response status codes:
|
||||
|
||||
* 200-202 - advance canary by increasing the traffic weight
|
||||
* timeout or non-2xx - halt advancement and increment failed checks
|
||||
|
||||
On a non-2xx response Flagger will include the response body (if any) in the failed checks log and Kubernetes events.
|
||||
|
||||
Event payload (HTTP POST):
|
||||
|
||||
```javascript
|
||||
{
|
||||
"name": "string (canary name)",
|
||||
"namespace": "string (canary namespace)",
|
||||
"phase": "string (canary phase)",
|
||||
"metadata": {
|
||||
"eventMessage": "string (canary event message)",
|
||||
"eventType": "string (canary event type)",
|
||||
"timestamp": "string (unix timestamp ms)"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The event receiver can create alerts based on the received phase
|
||||
(possible values: `Initialized`, `Waiting`, `Progressing`, `Promoting`, `Finalising`, `Succeeded` or `Failed`).
|
||||
|
||||
## Load Testing
|
||||
|
||||
For workloads that are not receiving constant traffic Flagger can be configured with a webhook,
|
||||
that when called, will start a load test for the target workload.
|
||||
If the target workload doesn't receive any traffic during the canary analysis,
|
||||
Flagger metric checks will fail with "no values found for metric request-success-rate".
|
||||
|
||||
Flagger comes with a load testing service based on [rakyll/hey](https://github.com/rakyll/hey)
|
||||
that generates traffic during analysis when configured as a webhook.
|
||||
|
||||

|
||||
|
||||
First you need to deploy the load test runner in a namespace with sidecar injection enabled:
|
||||
|
||||
```bash
|
||||
kubectl apply -k https://github.com/fluxcd/flagger//kustomize/tester?ref=main
|
||||
```
|
||||
|
||||
Or by using Helm:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
helm upgrade -i flagger-loadtester flagger/loadtester \
|
||||
--namespace=test \
|
||||
--set cmd.timeout=1h \
|
||||
--set cmd.namespaceRegexp=''
|
||||
```
|
||||
|
||||
When deployed the load tester API will be available at `http://flagger-loadtester.test/`.
|
||||
|
||||
Now you can add webhooks to the canary analysis spec:
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
- name: load-test-get
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
|
||||
- name: load-test-post
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "hey -z 1m -q 10 -c 2 -m POST -d '{test: 2}' http://podinfo-canary.test:9898/echo"
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the webhooks and the load tester will
|
||||
run the `hey` commands in the background, if they are not already running.
|
||||
This will ensure that during the analysis, the `podinfo-canary.test`
|
||||
service will receive a steady stream of GET and POST requests.
|
||||
|
||||
If your workload is exposed outside the mesh you can point `hey` to the public URL and use HTTP2.
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
- name: load-test-get
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "hey -z 1m -q 10 -c 2 -h2 https://podinfo.example.com/"
|
||||
```
|
||||
|
||||
For gRPC services you can use [bojand/ghz](https://github.com/bojand/ghz) which is a similar tool to Hey but for gRPC:
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
- name: grpc-load-test
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "ghz -z 1m -q 10 -c 2 --insecure podinfo.test:9898"
|
||||
```
|
||||
|
||||
`ghz` uses reflection to identify which gRPC method to call.
|
||||
If you do not wish to enable reflection for your gRPC service you can implement a standardized
|
||||
health check from the [grpc-proto](https://github.com/grpc/grpc-proto) library.
|
||||
To use this [health check schema](https://github.com/grpc/grpc-proto/blob/master/grpc/health/v1/health.proto)
|
||||
without reflection you can pass a parameter to `ghz` like this
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
- name: grpc-load-test-no-reflection
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
type: cmd
|
||||
cmd: "ghz --insecure --proto=/tmp/ghz/health.proto --call=grpc.health.v1.Health/Check podinfo.test:9898"
|
||||
```
|
||||
|
||||
The load tester can run arbitrary commands as long as the binary is present in the container image.
|
||||
For example if you want to replace `hey` with another CLI, you can create your own Docker image:
|
||||
|
||||
```text
|
||||
FROM weaveworks/flagger-loadtester:<VER>
|
||||
|
||||
RUN curl -Lo /usr/local/bin/my-cli https://github.com/user/repo/releases/download/ver/my-cli \
|
||||
&& chmod +x /usr/local/bin/my-cli
|
||||
```
|
||||
|
||||
## Load Testing Delegation
|
||||
|
||||
The load tester can also forward testing tasks to external tools,
|
||||
by now [nGrinder](https://github.com/naver/ngrinder) is supported.
|
||||
|
||||
To use this feature, add a load test task of type 'ngrinder' to the canary analysis spec:
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
- name: load-test-post
|
||||
url: http://flagger-loadtester.test/
|
||||
timeout: 5s
|
||||
metadata:
|
||||
# type of this load test task, cmd or ngrinder
|
||||
type: ngrinder
|
||||
# base url of your nGrinder controller server
|
||||
server: http://ngrinder-server:port
|
||||
# id of the test to clone from, the test must have been defined.
|
||||
clone: 100
|
||||
# user name and base64 encoded password to authenticate against the nGrinder server
|
||||
username: admin
|
||||
passwd: YWRtaW4=
|
||||
# the interval between between nGrinder test status polling, default to 1s
|
||||
pollInterval: 5s
|
||||
```
|
||||
|
||||
When the canary analysis starts, the load tester will initiate a
|
||||
[clone_and_start request](https://github.com/naver/ngrinder/wiki/REST-API-PerfTest)
|
||||
to the nGrinder server and start a new performance test. the load tester will periodically
|
||||
poll the nGrinder server for the status of the test,
|
||||
and prevent duplicate requests from being sent in subsequent analysis loops.
|
||||
|
||||
### K6 Load Tester
|
||||
|
||||
You can also delegate load testing to a third-party webhook. An example of this is the [`k6 webhook`](https://github.com/grafana/flagger-k6-webhook). This webhook uses [`k6`](https://k6.io/), a very featureful load tester, to run load or smoke tests on canaries. For all features available, see the source repository.
|
||||
|
||||
Here's an example integrating this webhook as a `pre-rollout` step, to load test a service before any traffic is sent to it:
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
- name: k6-load-test
|
||||
timeout: 5m
|
||||
type: pre-rollout
|
||||
url: http://k6-loadtester.flagger/launch-test
|
||||
metadata:
|
||||
script: |
|
||||
import http from 'k6/http';
|
||||
import { sleep } from 'k6';
|
||||
export const options = {
|
||||
vus: 2,
|
||||
duration: '30s',
|
||||
thresholds: {
|
||||
http_req_duration: ['p(95)<50']
|
||||
},
|
||||
ext: {
|
||||
loadimpact: {
|
||||
name: '<cluster>/<your_service>',
|
||||
projectID: <project id>,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
export default function () {
|
||||
http.get('http://<your_service>-canary.<namespace>:80/');
|
||||
sleep(0.10);
|
||||
}
|
||||
```
|
||||
|
||||
## Integration Testing
|
||||
|
||||
Flagger comes with a testing service that can run Helm tests, Bats tests or Concord tests when configured as a webhook.
|
||||
|
||||
Deploy the Helm test runner in the `kube-system` namespace using the `tiller` service account:
|
||||
|
||||
```bash
|
||||
helm repo add flagger https://flagger.app
|
||||
|
||||
helm upgrade -i flagger-helmtester flagger/loadtester \
|
||||
--namespace=kube-system \
|
||||
--set serviceAccountName=tiller
|
||||
```
|
||||
|
||||
When deployed the Helm tester API will be available at `http://flagger-helmtester.kube-system/`.
|
||||
|
||||
Now you can add pre-rollout webhooks to the canary analysis spec:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "smoke test"
|
||||
type: pre-rollout
|
||||
url: http://flagger-helmtester.kube-system/
|
||||
timeout: 3m
|
||||
metadata:
|
||||
type: "helm"
|
||||
cmd: "test {{ .Release.Name }} --cleanup"
|
||||
```
|
||||
|
||||
When the canary analysis starts, Flagger will call the pre-rollout webhooks before routing traffic to the canary.
|
||||
If the helm test fails, Flagger will retry until the analysis threshold is reached and the canary is rolled back.
|
||||
|
||||
If you are using Helm v3,
|
||||
you'll have to create a dedicated service account and add the release namespace to the test command:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "smoke test"
|
||||
type: pre-rollout
|
||||
url: http://flagger-helmtester.kube-system/
|
||||
timeout: 3m
|
||||
metadata:
|
||||
type: "helmv3"
|
||||
cmd: "test {{ .Release.Name }} --timeout 3m -n {{ .Release.Namespace }}"
|
||||
```
|
||||
|
||||
If the test hangs or logs error messages hinting to insufficient permissions it can be related to RBAC,
|
||||
check the [Troubleshooting](webhooks.md#Troubleshooting) section for an example configuration.
|
||||
|
||||
As an alternative to Helm you can use the
|
||||
[Bash Automated Testing System](https://github.com/bats-core/bats-core) to run your tests.
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "acceptance tests"
|
||||
type: pre-rollout
|
||||
url: http://flagger-batstester.default/
|
||||
timeout: 5m
|
||||
metadata:
|
||||
type: "bash"
|
||||
cmd: "bats /tests/acceptance.bats"
|
||||
```
|
||||
|
||||
Note that you should create a ConfigMap with your Bats tests and mount it inside the tester container.
|
||||
|
||||
You can also configure the test runner to start a [Concord](https://concord.walmartlabs.com/) process.
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "concord integration test"
|
||||
type: pre-rollout
|
||||
url: http://flagger-concordtester.default/
|
||||
timeout: 60s
|
||||
metadata:
|
||||
type: "concord"
|
||||
org: "your-concord-org"
|
||||
project: "your-concord-project"
|
||||
repo: "your-concord-repo"
|
||||
entrypoint: "your-concord-entrypoint"
|
||||
apiKeyPath: "/tmp/concord-api-key"
|
||||
endpoint: "https://canary-endpoint/"
|
||||
pollInterval: "5"
|
||||
pollTimeout: "60"
|
||||
```
|
||||
|
||||
`org`, `project`, `repo` and `entrypoint` represents where your test process runs in Concord.
|
||||
In order to authenticate to Concord, you need to set `apiKeyPath`
|
||||
to a path of a file containing a valid Concord API key on the `flagger-helmtester` container.
|
||||
This can be done via mounting a Kubernetes secret in the tester's Deployment.
|
||||
`pollInterval` represents the interval in seconds the web-hook will call Concord
|
||||
to see if the process has finished (Default is 5s). `pollTimeout` represents the time in seconds
|
||||
the web-hook will try to call Concord before timing out (Default is 30s).
|
||||
|
||||
## Manual Gating
|
||||
|
||||
For manual approval of a canary deployment you can use the `confirm-rollout` and `confirm-promotion` webhooks.
|
||||
The confirmation rollout hooks are executed before the pre-rollout hooks. For manually approving traffic weight increase,
|
||||
you can use the `confirm-traffic-increase` webhook.
|
||||
Flagger will halt the canary traffic shifting and analysis until the confirm webhook returns HTTP status 200.
|
||||
|
||||
For manual rollback of a canary deployment you can use the `rollback` webhook.
|
||||
The rollback hook will be called during the analysis and confirmation states.
|
||||
If a rollback webhook returns a successful HTTP status code,
|
||||
Flagger will shift all traffic back to the primary instance and fail the canary.
|
||||
|
||||
Manual gating with Flagger's tester:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "gate"
|
||||
type: confirm-rollout
|
||||
url: http://flagger-loadtester.test/gate/halt
|
||||
```
|
||||
|
||||
The `/gate/halt` returns HTTP 403 thus blocking the rollout.
|
||||
|
||||
If you have notifications enabled, Flagger will post a message to
|
||||
Slack or MS Teams if a canary rollout is waiting for approval.
|
||||
|
||||
The notifications can be disabled with:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "gate"
|
||||
type: confirm-rollout
|
||||
url: http://flagger-loadtester.test/gate/halt
|
||||
muteAlert: true
|
||||
```
|
||||
|
||||
Change the URL to `/gate/approve` to start the canary analysis:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "gate"
|
||||
type: confirm-rollout
|
||||
url: http://flagger-loadtester.test/gate/approve
|
||||
```
|
||||
|
||||
Manual gating can be driven with Flagger's tester API. Set the confirmation URL to `/gate/check`:
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "ask for confirmation"
|
||||
type: confirm-rollout
|
||||
url: http://flagger-loadtester.test/gate/check
|
||||
```
|
||||
|
||||
By default the gate is closed, you can start or resume the canary rollout with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xxxx-xxxx sh
|
||||
|
||||
curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/open
|
||||
```
|
||||
|
||||
You can pause the rollout at any time with:
|
||||
|
||||
```bash
|
||||
curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/gate/close
|
||||
```
|
||||
|
||||
If a canary analysis is paused the status will change to waiting:
|
||||
|
||||
```bash
|
||||
kubectl get canary/podinfo
|
||||
|
||||
NAME STATUS WEIGHT
|
||||
podinfo Waiting 0
|
||||
```
|
||||
|
||||
The `confirm-promotion` hook type can be used to manually approve the canary promotion.
|
||||
While the promotion is paused, Flagger will continue to run the metrics checks and load tests.
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "promotion gate"
|
||||
type: confirm-promotion
|
||||
url: http://flagger-loadtester.test/gate/halt
|
||||
```
|
||||
|
||||
The `rollback` hook type can be used to manually rollback the canary promotion.
|
||||
As with gating, rollbacks can be driven with Flagger's tester API by setting the rollback URL to `/rollback/check`
|
||||
|
||||
```yaml
|
||||
analysis:
|
||||
webhooks:
|
||||
- name: "rollback"
|
||||
type: rollback
|
||||
url: http://flagger-loadtester.test/rollback/check
|
||||
```
|
||||
|
||||
By default, rollback is closed, you can rollback a canary rollout with:
|
||||
|
||||
```bash
|
||||
kubectl -n test exec -it flagger-loadtester-xxxx-xxxx sh
|
||||
|
||||
curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/rollback/open
|
||||
```
|
||||
|
||||
You can close the rollback with:
|
||||
|
||||
```bash
|
||||
curl -d '{"name": "podinfo","namespace":"test"}' http://localhost:8080/rollback/close
|
||||
```
|
||||
|
||||
If you have notifications enabled, Flagger will post a message to Slack or MS Teams if a canary has been rolled back.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Manually check if helm test is running
|
||||
|
||||
To debug in depth any issues with helm tests, you can execute commands on the flagger-loadtester pod.
|
||||
|
||||
```bash
|
||||
kubectl exec -it deploy/flagger-loadtester -- bash
|
||||
helmv3 test <release> -n <namespace> --debug
|
||||
```
|
||||
|
||||
### Helm tests hang during canary deployment
|
||||
|
||||
If test execution hangs or displays insufficient permissions, check your RBAC settings.
|
||||
|
||||
```yaml
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: helm-smoke-tester
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get", "watch", "list", "update"]
|
||||
# choose the permission based on helm test type (Pod or Job)
|
||||
- apiGroups: [""]
|
||||
resources: ["pods", "pods/log"]
|
||||
verbs: ["create", "list", "delete", "watch"]
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["jobs", "jobs/log"]
|
||||
verbs: ["create", "list", "delete", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: helm-smoke-tester
|
||||
# Don't forget to update accordingly
|
||||
namespace: namespace-of-the-tested-release
|
||||
subjects:
|
||||
- kind: User
|
||||
name: system:serviceaccount:linkerd:default
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
roleRef:
|
||||
kind: ClusterRole
|
||||
name: helm-smoke-tester
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
```
|
|
@ -133,7 +133,8 @@ class Repo():
|
|||
shutil.copyfile(
|
||||
os.path.join(self.dest, entry[0]),
|
||||
out_file)
|
||||
docs = entry[1].startswith('docs/') or entry[1].startswith('legacy/')
|
||||
docs = entry[1].startswith('docs/') or entry[1].startswith('legacy/') or \
|
||||
entry[1].startswith('flagger/')
|
||||
title = None
|
||||
weight = None
|
||||
if len(entry) == 4:
|
||||
|
|
|
@ -1,13 +1,30 @@
|
|||
{{ $cover := and (.HasShortcode "blocks/cover") (not .Site.Params.ui.navbar_translucent_over_cover_disable) }}
|
||||
<nav class="js-navbar-scroll navbar-expand-lg navbar navbar-dark {{ if $cover}} td-navbar-cover {{ end }}flex-row td-navbar">
|
||||
<a class="navbar-brand" href="{{ .Site.Home.RelPermalink }}">
|
||||
{{ if in .Permalink "/flagger" }}
|
||||
<span class="navbar-logo">{{ if .Site.Params.ui.navbar_logo }}{{ with resources.Get "icons/flagger.svg" }}{{ ( . | minify).Content | safeHTML }}{{ end }}{{ end }}</span><span class="font-weight-bold">Flagger</span>
|
||||
{{ else }}
|
||||
<span class="navbar-logo">{{ if .Site.Params.ui.navbar_logo }}{{ with resources.Get "icons/logo.svg" }}{{ ( . | minify).Content | safeHTML }}{{ end }}{{ end }}</span><span class="font-weight-bold">{{ .Site.Title }}</span>
|
||||
{{ end }}
|
||||
</a>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarContent" aria-controls="navbarContent" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse justify-content-end" id="navbarContent">
|
||||
<ul class="navbar-nav mt-2 p-3 p-lg-0 mt-lg-0">
|
||||
{{ if in .Permalink "/flagger" }}
|
||||
<li class="nav-item mr-4 mb-2 mb-lg-0">
|
||||
<a class="nav-link active" href="/flagger"><span class="active">Docs</span></a>
|
||||
</li>
|
||||
{{ else if in .Permalink "/legacy" }}
|
||||
<li class="nav-item mr-4 mb-2 mb-lg-0">
|
||||
<a class="nav-link active" href="/legacy"><span class="active">Docs</span></a>
|
||||
</li>
|
||||
{{ else }}
|
||||
<li class="nav-item mr-4 mb-2 mb-lg-0">
|
||||
<a class="nav-link active" href="/docs"><span class="active">Docs</span></a>
|
||||
</li>
|
||||
{{ end }}
|
||||
{{ $p := . }}
|
||||
{{ range .Site.Menus.main }}
|
||||
{{ $active := or ($p.IsMenuCurrent "main" .) ($p.HasMenuCurrent "main" .) }}
|
||||
|
|
|
@ -25,7 +25,9 @@
|
|||
</div>
|
||||
<div class="row">
|
||||
<a class="btn btn-lg btn-{{ $b1_color }}" href="{{ $b1_url }}" role="button">{{ $b1_caption }}</a>
|
||||
{{ if $b2_url }}
|
||||
<a class="btn btn-lg btn-{{ $b2_color }}" href="{{ $b2_url }}" role="button">{{ $b2_caption }}</a>
|
||||
{{ end }}
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
|
@ -1,13 +1,30 @@
|
|||
/docs/components/helm/controller /docs/components/helm 301!
|
||||
/docs/components/image/controller /docs/components/image 301!
|
||||
/docs/components/kustomize/controller /docs/components/kustomize 301!
|
||||
/docs/components/notification/controller /docs/components/notification 301!
|
||||
/docs/components/source/controller /docs/components/source 301!
|
||||
/docs/guides/installation /docs/installation 301!
|
||||
/contributing /docs/contributing 301!
|
||||
/docs/contributing/flux2 /docs/contributing/flux 301!
|
||||
/legacy/helm-operator/guides/upgrade-to-ga /legacy/helm-operator/how-to/upgrade-to-ga 301!
|
||||
/legacy/helm-operator/guides/upgrade-to-beta /legacy/helm-operator/how-to/upgrade-to-beta 301!
|
||||
/docs/roadmap /roadmap 301!
|
||||
/integrations /ecosystem 301!
|
||||
/docs/contributing/docs/some-background https://github.com/fluxcd/website#readme 301!
|
||||
/docs/components/helm/controller /docs/components/helm 301!
|
||||
/docs/components/image/controller /docs/components/image 301!
|
||||
/docs/components/kustomize/controller /docs/components/kustomize 301!
|
||||
/docs/components/notification/controller /docs/components/notification 301!
|
||||
/docs/components/source/controller /docs/components/source 301!
|
||||
/docs/guides/installation /docs/installation 301!
|
||||
/contributing /docs/contributing 301!
|
||||
/docs/contributing/flux2 /docs/contributing/flux 301!
|
||||
/legacy/helm-operator/guides/upgrade-to-ga /legacy/helm-operator/how-to/upgrade-to-ga 301!
|
||||
/legacy/helm-operator/guides/upgrade-to-beta /legacy/helm-operator/how-to/upgrade-to-beta 301!
|
||||
/docs/roadmap /roadmap 301!
|
||||
/integrations /ecosystem 301!
|
||||
/docs/contributing/docs/some-background https://github.com/fluxcd/website#readme 301!
|
||||
|
||||
|
||||
/flagger/how-it-works /flagger/usage/how-it-works 301!
|
||||
/flagger/usage/progressive-delivery /flagger/tutorials/istio-progressive-delivery 301!
|
||||
/flagger/usage/ab-testing /flagger/tutorials/istio-ab-testing 301!
|
||||
/flagger/usage/blue-green /flagger/tutorials/kubernetes-blue-green 301!
|
||||
/flagger/usage/appmesh-progressive-delivery /flagger/tutorials/appmesh-progressive-delivery 301!
|
||||
/flagger/usage/linkerd-progressive-delivery /flagger/tutorials/linkerd-progressive-delivery 301!
|
||||
/flagger/usage/contour-progressive-delivery /flagger/tutorials/contour-progressive-delivery 301!
|
||||
/flagger/usage/gloo-progressive-delivery /flagger/tutorials/gloo-progressive-delivery 301!
|
||||
/flagger/usage/nginx-progressive-delivery /flagger/tutorials/nginx-progressive-delivery 301!
|
||||
/flagger/usage/skipper-progressive-delivery /flagger/tutorials/skipper-progressive-delivery 301!
|
||||
/flagger/usage/crossover-progressive-delivery /flagger/tutorials/crossover-progressive-delivery 301!
|
||||
/flagger/usage/traefik-progressive-delivery /flagger/tutorials/traefik-progressive-delivery 301!
|
||||
/flagger/usage/osm-progressive-delivery /flagger/tutorials/osm-progressive-delivery 301!
|
||||
/flagger/usage/kuma-progressive-delivery /flagger/tutorials/kuma-progressive-delivery 301!
|
||||
/flagger/usage/gatewayapi-progressive-delivery /flagger/tutorials/gatewayapi-progressive-delivery 301!
|
||||
|
|
After Width: | Height: | Size: 36 KiB |
After Width: | Height: | Size: 35 KiB |
After Width: | Height: | Size: 40 KiB |
After Width: | Height: | Size: 48 KiB |
After Width: | Height: | Size: 33 KiB |
After Width: | Height: | Size: 34 KiB |
After Width: | Height: | Size: 38 KiB |
After Width: | Height: | Size: 39 KiB |
After Width: | Height: | Size: 39 KiB |
After Width: | Height: | Size: 40 KiB |
After Width: | Height: | Size: 38 KiB |
After Width: | Height: | Size: 38 KiB |
After Width: | Height: | Size: 196 KiB |
After Width: | Height: | Size: 41 KiB |
After Width: | Height: | Size: 121 KiB |
After Width: | Height: | Size: 41 KiB |
After Width: | Height: | Size: 29 KiB |
After Width: | Height: | Size: 38 KiB |
After Width: | Height: | Size: 39 KiB |
After Width: | Height: | Size: 124 KiB |
After Width: | Height: | Size: 51 KiB |
After Width: | Height: | Size: 45 KiB |
After Width: | Height: | Size: 40 KiB |
After Width: | Height: | Size: 205 KiB |
After Width: | Height: | Size: 523 KiB |
After Width: | Height: | Size: 349 KiB |
After Width: | Height: | Size: 497 KiB |
After Width: | Height: | Size: 131 KiB |
After Width: | Height: | Size: 133 KiB |
After Width: | Height: | Size: 523 KiB |
After Width: | Height: | Size: 22 KiB |
After Width: | Height: | Size: 30 KiB |
After Width: | Height: | Size: 440 KiB |
After Width: | Height: | Size: 78 KiB |
After Width: | Height: | Size: 113 KiB |