Compare commits

...

2 Commits

Author SHA1 Message Date
Udit Gaurav a9e5f39dba
[ Cherry-pick for 2.4.0 ] (#473)
* Refactor/experiment contributing (#470)

* docs: add instructions for building litmus-sdk binary

Non Linux AMD64 users will need to build the binary for their target
platform.

Signed-off-by: Nic Johnson <nicjohnson145@hotmail.com>

* docs: update generated code & docs to aid experiment contribution

It wasn't very clear what generated code needed to be kept, and what
generated code needed to be replaced with experiment-specific code.
Attempt to make that more clear by expanding README & adding grep-able
tags inside generated code.

Signed-off-by: Nic Johnson <nicjohnson145@hotmail.com>

* fix issue-3350 (#468)

Signed-off-by: Andrew Hu <andrew.hu@hcl.com>

Co-authored-by: Udit Gaurav <35391335+uditgaurav@users.noreply.github.com>

* Remove the stress process on timeout without failure (#472)

Signed-off-by: udit <udit@chaosnative.com>

* update image tag

Signed-off-by: udit <udit@chaosnative.com>

Co-authored-by: Nic Johnson <nicjohnson145@users.noreply.github.com>
Co-authored-by: Andrew Hu <93282581+andrewhu-hcl@users.noreply.github.com>
2021-12-15 17:53:44 +05:30
Shubham Chaudhary c4fb546094
[cherrypick for 2.3.0] (#467)
* corrected spelling for received (#463)

Signed-off-by: neelanjan00 <neelanjan@chaosnative.com>

* fix(helper): removing job-name label from the helper pod (#466)

Signed-off-by: shubham chaudhary <shubham@chaosnative.com>

* fix(image-tag): updating harden-alpine image tag

Signed-off-by: shubham chaudhary <shubham@chaosnative.com>

Co-authored-by: Neelanjan Manna <neelanjan@chaosnative.com>
2021-11-15 15:19:09 +05:30
8 changed files with 88 additions and 33 deletions

View File

@ -24,7 +24,7 @@ RUN apk --update add \
# Packaging stage
# Image source: https://github.com/litmuschaos/test-tools/blob/master/custom/hardened-alpine/experiment/Dockerfile
# The base image is non-root (have litmus user) with default litmus directory.
FROM litmuschaos/experiment-alpine
FROM litmuschaos/experiment-alpine:2.4.0
LABEL maintainer="LitmusChaos"

View File

@ -6,6 +6,7 @@ import (
"os/exec"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
@ -19,6 +20,8 @@ import (
"github.com/litmuschaos/litmus-go/pkg/utils/common"
"github.com/litmuschaos/litmus-go/pkg/utils/retry"
"github.com/pkg/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@ -149,29 +152,50 @@ func drainNode(experimentsDetails *experimentTypes.ExperimentDetails, clients cl
// uncordonNode uncordon the application node
func uncordonNode(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error {
log.Infof("[Recover]: Uncordon the %v node", experimentsDetails.TargetNode)
targetNodes := strings.Split(experimentsDetails.TargetNode, ",")
for _, targetNode := range targetNodes {
command := exec.Command("kubectl", "uncordon", experimentsDetails.TargetNode)
var out, stderr bytes.Buffer
command.Stdout = &out
command.Stderr = &stderr
if err := command.Run(); err != nil {
log.Infof("Error String: %v", stderr.String())
return errors.Errorf("unable to uncordon the %v node, err: %v", experimentsDetails.TargetNode, err)
//Check node exist before uncordon the node
_, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, metav1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
log.Infof("[Info]: The %v node is no longer exist, skip uncordon the node", targetNode)
common.SetTargets(targetNode, "noLongerExist", "node", chaosDetails)
continue
} else {
return errors.Errorf("unable to get the %v node, err: %v", targetNode, err)
}
}
log.Infof("[Recover]: Uncordon the %v node", targetNode)
command := exec.Command("kubectl", "uncordon", targetNode)
var out, stderr bytes.Buffer
command.Stdout = &out
command.Stderr = &stderr
if err := command.Run(); err != nil {
log.Infof("Error String: %v", stderr.String())
return errors.Errorf("unable to uncordon the %v node, err: %v", targetNode, err)
}
common.SetTargets(targetNode, "reverted", "node", chaosDetails)
}
common.SetTargets(experimentsDetails.TargetNode, "reverted", "node", chaosDetails)
return retry.
Times(uint(experimentsDetails.Timeout / experimentsDetails.Delay)).
Wait(time.Duration(experimentsDetails.Delay) * time.Second).
Try(func(attempt uint) error {
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(experimentsDetails.TargetNode, v1.GetOptions{})
if err != nil {
return err
}
if nodeSpec.Spec.Unschedulable {
return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode)
targetNodes := strings.Split(experimentsDetails.TargetNode, ",")
for _, targetNode := range targetNodes {
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, v1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
continue
} else {
return err
}
}
if nodeSpec.Spec.Unschedulable {
return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode)
}
}
return nil
})

View File

@ -88,7 +88,7 @@ func experimentExecution(experimentsDetails *experimentTypes.ExperimentDetails,
// injectChaosInSerialMode stressed the storage of all target application in serial mode (one by one)
func injectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetails, targetPodList corev1.PodList, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
// creating err channel to recieve the error from the go routine
// creating err channel to receive the error from the go routine
stressErr := make(chan error)
// run the probes during chaos
@ -128,7 +128,7 @@ func injectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetai
endTime = time.After(timeDelay)
select {
case err := <-stressErr:
// skipping the execution, if recieved any error other than 137, while executing stress command and marked result as fail
// skipping the execution, if received any error other than 137, while executing stress command and marked result as fail
// it will ignore the error code 137(oom kill), it will skip further execution and marked the result as pass
// oom kill occurs if stor to be stressed exceed than the resource limit for the target container
if err != nil {
@ -160,7 +160,7 @@ func injectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetai
// injectChaosInParallelMode stressed the storage of all target application in parallel mode (all at once)
func injectChaosInParallelMode(experimentsDetails *experimentTypes.ExperimentDetails, targetPodList corev1.PodList, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
// creating err channel to recieve the error from the go routine
// creating err channel to receive the error from the go routine
stressErr := make(chan error)
// run the probes during chaos
@ -200,7 +200,7 @@ loop:
endTime = time.After(timeDelay)
select {
case err := <-stressErr:
// skipping the execution, if recieved any error other than 137, while executing stress command and marked result as fail
// skipping the execution, if received any error other than 137, while executing stress command and marked result as fail
// it will ignore the error code 137(oom kill), it will skip further execution and marked the result as pass
// oom kill occurs if stor to be stressed exceed than the resource limit for the target container
if err != nil {

View File

@ -176,13 +176,15 @@ func prepareStressChaos(experimentsDetails *experimentTypes.ExperimentDetails, c
select {
case <-timeout:
// the stress process gets timeout before completion
log.Infof("[Timeout] Stress output: %v", buf.String())
log.Info("[Cleanup]: Killing the stress process")
terminateProcess(cmd.Process.Pid)
log.Infof("[Chaos] The stress process is not yet completed after the chaos duration of %vs", experimentsDetails.ChaosDuration+30)
log.Info("[Timeout]: Killing the stress process")
if err = terminateProcess(cmd.Process.Pid); err != nil {
return err
}
if err = result.AnnotateChaosResult(resultDetails.Name, chaosDetails.ChaosNamespace, "reverted", "pod", experimentsDetails.TargetPods); err != nil {
return err
}
return errors.Errorf("the stress process is timeout after %vs", experimentsDetails.ChaosDuration+30)
return nil
case err := <-done:
if err != nil {
err, ok := err.(*exec.ExitError)

View File

@ -34,6 +34,12 @@ The *generate_experiment.go* script is a simple way to bootstrap your experiment
$ cd litmus-go/contribute/developer-guide
```
- Build litmus-sdk
```
go build -o ./litmus-sdk ./bin/main.go
```
- Populate the `attributes.yaml` with details of the chaos experiment (or chart). Use the [attributes.yaml.sample](/contribute/developer-guide/attributes.yaml.sample) as reference.
As an example, let us consider an experiment to kill one of the replicas of a nginx deployment. The attributes.yaml can be constructed like this:
@ -158,17 +164,29 @@ The *generate_experiment.go* script is a simple way to bootstrap your experiment
drwxr-xr-x 2 shubham shubham 4096 Jun 10 22:41 icons/
```
- Proceed with construction of business logic inside the `sample-exec-chaos.go` file, by making
the appropriate modifications listed below to achieve the desired effect:
- variables
- entry & exit criteria checks for the experiment
- helper utils in either [pkg](/pkg/) or new [base chaos libraries](/chaoslib)
- Proceed with construction of business logic, by making the appropriate modifications listed below
to achieve the desired effect:
- Pre-Chaos Checks: Additional experiment-specific checks to run before chaos. Checks should be
added at the `@TODO: user PRE-CHAOS-CHECK` marker in the
`experiments/<category>/<name>/experiment/<name>.go` file
- The chaoslib is created at `chaoslib/litmus/sample-exec-chaos/lib/sample-exec-chaos.go` path. It contains some pre-defined steps which runs the `ChaosInject` command (explicitly provided as an ENV var in the experiment CR). Which will induce chaos in the target application. It will wait for the given chaos duration and finally runs the `ChaosKill` command (also provided as an ENV var) for cleanup purposes. Update this chaoslib to achieve the desired effect based on the use-case or reuse the other existing chaoslib.
- Inject Chaos: The heart of your experiment, actually enact the chaos. By default, the generated
code will call out to the generated library. However, if your experiment simply makes use of
existing libraries, modify the chaos injection at the `@TODO: user INVOKE-CHAOSLIB` marker in the
`experiments/<category>/<name>/experiment/<name>.go` file
- Create an experiment README explaining, briefly, the *what*, *why* & *how* of the experiment to aid users of this experiment.
- Library Modifications: This is where the low level chaos execution code should live. Populate
the `runChaos`, `experimentExecution`, and `injectChaos` functions as appropriate in the
`chaosLib/litmus/<name>/lib/<name>.go` file.
- Post-Chaos Checks: Additional experiment-specific checks to run after chaos. Checks should be
added at the `@TODO: user POST-CHAOS-CHECK` marker in the
`experiments/<category>/<name>/experiment/<name>.go` file
- Create an experiment README explaining, briefly, the *what*, *why* & *how* of the experiment to aid users of this experiment. This README
should live at `experiments/<category>/<name>/README.md`
### Steps to Test Experiment

View File

@ -16,6 +16,7 @@ import (
"github.com/sirupsen/logrus"
)
// Experiment contains steps to inject chaos
func Experiment(clients clients.ClientSets){
@ -69,6 +70,7 @@ func Experiment(clients clients.ClientSets){
// Calling AbortWatcher go routine, it will continuously watch for the abort signal and generate the required events and result
go common.AbortWatcher(experimentsDetails.ExperimentName, clients, &resultDetails, &chaosDetails, &eventsDetails)
// @TODO: user PRE-CHAOS-CHECK
// ADD A PRE-CHAOS CHECK OF YOUR CHOICE HERE
// POD STATUS CHECKS FOR THE APPLICATION UNDER TEST AND AUXILIARY APPLICATIONS ARE ADDED BY DEFAULT
@ -121,6 +123,7 @@ func Experiment(clients clients.ClientSets){
// INVOKE THE CHAOSLIB OF YOUR CHOICE HERE, WHICH WILL CONTAIN
// THE BUSINESS LOGIC OF THE ACTUAL CHAOS
// IT CAN BE A NEW CHAOSLIB YOU HAVE CREATED SPECIALLY FOR THIS EXPERIMENT OR ANY EXISTING ONE
// @TODO: user INVOKE-CHAOSLIB
// Including the litmus lib
switch experimentsDetails.ChaosLib {
@ -138,6 +141,7 @@ func Experiment(clients clients.ClientSets){
return
}
// @TODO: user POST-CHAOS-CHECK
// ADD A POST-CHAOS CHECK OF YOUR CHOICE HERE
// POD STATUS CHECKS FOR THE APPLICATION UNDER TEST AND AUXILIARY APPLICATIONS ARE ADDED BY DEFAULT

View File

@ -10,6 +10,7 @@ import (
"github.com/pkg/errors"
logrus "github.com/sirupsen/logrus"
apiv1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@ -26,7 +27,12 @@ func CheckNodeStatus(nodes string, timeout, delay int, clients clients.ClientSet
for index := range targetNodes {
node, err := clients.KubeClient.CoreV1().Nodes().Get(targetNodes[index], metav1.GetOptions{})
if err != nil {
return err
if apierrors.IsNotFound(err) {
log.Infof("[Info]: The %v node is not exist", targetNodes[index])
continue
} else {
return err
}
}
nodeList.Items = append(nodeList.Items, *node)
}

View File

@ -96,6 +96,7 @@ func SetHelperData(chaosDetails *types.ChaosDetails, clients clients.ClientSets)
// Get Labels
labels := pod.ObjectMeta.Labels
delete(labels, "controller-uid")
delete(labels, "job-name")
chaosDetails.Labels = labels
// Get Resource Requirements