Compare commits
2 Commits
Author | SHA1 | Date |
---|---|---|
|
a9e5f39dba | |
|
c4fb546094 |
|
@ -24,7 +24,7 @@ RUN apk --update add \
|
|||
# Packaging stage
|
||||
# Image source: https://github.com/litmuschaos/test-tools/blob/master/custom/hardened-alpine/experiment/Dockerfile
|
||||
# The base image is non-root (have litmus user) with default litmus directory.
|
||||
FROM litmuschaos/experiment-alpine
|
||||
FROM litmuschaos/experiment-alpine:2.4.0
|
||||
|
||||
LABEL maintainer="LitmusChaos"
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ import (
|
|||
"os/exec"
|
||||
"os/signal"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
|
@ -19,6 +20,8 @@ import (
|
|||
"github.com/litmuschaos/litmus-go/pkg/utils/common"
|
||||
"github.com/litmuschaos/litmus-go/pkg/utils/retry"
|
||||
"github.com/pkg/errors"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
)
|
||||
|
||||
|
@ -149,29 +152,50 @@ func drainNode(experimentsDetails *experimentTypes.ExperimentDetails, clients cl
|
|||
// uncordonNode uncordon the application node
|
||||
func uncordonNode(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error {
|
||||
|
||||
log.Infof("[Recover]: Uncordon the %v node", experimentsDetails.TargetNode)
|
||||
targetNodes := strings.Split(experimentsDetails.TargetNode, ",")
|
||||
for _, targetNode := range targetNodes {
|
||||
|
||||
command := exec.Command("kubectl", "uncordon", experimentsDetails.TargetNode)
|
||||
var out, stderr bytes.Buffer
|
||||
command.Stdout = &out
|
||||
command.Stderr = &stderr
|
||||
if err := command.Run(); err != nil {
|
||||
log.Infof("Error String: %v", stderr.String())
|
||||
return errors.Errorf("unable to uncordon the %v node, err: %v", experimentsDetails.TargetNode, err)
|
||||
//Check node exist before uncordon the node
|
||||
_, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
log.Infof("[Info]: The %v node is no longer exist, skip uncordon the node", targetNode)
|
||||
common.SetTargets(targetNode, "noLongerExist", "node", chaosDetails)
|
||||
continue
|
||||
} else {
|
||||
return errors.Errorf("unable to get the %v node, err: %v", targetNode, err)
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("[Recover]: Uncordon the %v node", targetNode)
|
||||
command := exec.Command("kubectl", "uncordon", targetNode)
|
||||
var out, stderr bytes.Buffer
|
||||
command.Stdout = &out
|
||||
command.Stderr = &stderr
|
||||
if err := command.Run(); err != nil {
|
||||
log.Infof("Error String: %v", stderr.String())
|
||||
return errors.Errorf("unable to uncordon the %v node, err: %v", targetNode, err)
|
||||
}
|
||||
common.SetTargets(targetNode, "reverted", "node", chaosDetails)
|
||||
}
|
||||
|
||||
common.SetTargets(experimentsDetails.TargetNode, "reverted", "node", chaosDetails)
|
||||
|
||||
return retry.
|
||||
Times(uint(experimentsDetails.Timeout / experimentsDetails.Delay)).
|
||||
Wait(time.Duration(experimentsDetails.Delay) * time.Second).
|
||||
Try(func(attempt uint) error {
|
||||
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(experimentsDetails.TargetNode, v1.GetOptions{})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if nodeSpec.Spec.Unschedulable {
|
||||
return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode)
|
||||
targetNodes := strings.Split(experimentsDetails.TargetNode, ",")
|
||||
for _, targetNode := range targetNodes {
|
||||
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, v1.GetOptions{})
|
||||
if err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
continue
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if nodeSpec.Spec.Unschedulable {
|
||||
return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
|
|
@ -88,7 +88,7 @@ func experimentExecution(experimentsDetails *experimentTypes.ExperimentDetails,
|
|||
|
||||
// injectChaosInSerialMode stressed the storage of all target application in serial mode (one by one)
|
||||
func injectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetails, targetPodList corev1.PodList, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
|
||||
// creating err channel to recieve the error from the go routine
|
||||
// creating err channel to receive the error from the go routine
|
||||
stressErr := make(chan error)
|
||||
|
||||
// run the probes during chaos
|
||||
|
@ -128,7 +128,7 @@ func injectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetai
|
|||
endTime = time.After(timeDelay)
|
||||
select {
|
||||
case err := <-stressErr:
|
||||
// skipping the execution, if recieved any error other than 137, while executing stress command and marked result as fail
|
||||
// skipping the execution, if received any error other than 137, while executing stress command and marked result as fail
|
||||
// it will ignore the error code 137(oom kill), it will skip further execution and marked the result as pass
|
||||
// oom kill occurs if stor to be stressed exceed than the resource limit for the target container
|
||||
if err != nil {
|
||||
|
@ -160,7 +160,7 @@ func injectChaosInSerialMode(experimentsDetails *experimentTypes.ExperimentDetai
|
|||
|
||||
// injectChaosInParallelMode stressed the storage of all target application in parallel mode (all at once)
|
||||
func injectChaosInParallelMode(experimentsDetails *experimentTypes.ExperimentDetails, targetPodList corev1.PodList, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
|
||||
// creating err channel to recieve the error from the go routine
|
||||
// creating err channel to receive the error from the go routine
|
||||
stressErr := make(chan error)
|
||||
|
||||
// run the probes during chaos
|
||||
|
@ -200,7 +200,7 @@ loop:
|
|||
endTime = time.After(timeDelay)
|
||||
select {
|
||||
case err := <-stressErr:
|
||||
// skipping the execution, if recieved any error other than 137, while executing stress command and marked result as fail
|
||||
// skipping the execution, if received any error other than 137, while executing stress command and marked result as fail
|
||||
// it will ignore the error code 137(oom kill), it will skip further execution and marked the result as pass
|
||||
// oom kill occurs if stor to be stressed exceed than the resource limit for the target container
|
||||
if err != nil {
|
||||
|
|
|
@ -176,13 +176,15 @@ func prepareStressChaos(experimentsDetails *experimentTypes.ExperimentDetails, c
|
|||
select {
|
||||
case <-timeout:
|
||||
// the stress process gets timeout before completion
|
||||
log.Infof("[Timeout] Stress output: %v", buf.String())
|
||||
log.Info("[Cleanup]: Killing the stress process")
|
||||
terminateProcess(cmd.Process.Pid)
|
||||
log.Infof("[Chaos] The stress process is not yet completed after the chaos duration of %vs", experimentsDetails.ChaosDuration+30)
|
||||
log.Info("[Timeout]: Killing the stress process")
|
||||
if err = terminateProcess(cmd.Process.Pid); err != nil {
|
||||
return err
|
||||
}
|
||||
if err = result.AnnotateChaosResult(resultDetails.Name, chaosDetails.ChaosNamespace, "reverted", "pod", experimentsDetails.TargetPods); err != nil {
|
||||
return err
|
||||
}
|
||||
return errors.Errorf("the stress process is timeout after %vs", experimentsDetails.ChaosDuration+30)
|
||||
return nil
|
||||
case err := <-done:
|
||||
if err != nil {
|
||||
err, ok := err.(*exec.ExitError)
|
||||
|
|
|
@ -34,6 +34,12 @@ The *generate_experiment.go* script is a simple way to bootstrap your experiment
|
|||
$ cd litmus-go/contribute/developer-guide
|
||||
```
|
||||
|
||||
- Build litmus-sdk
|
||||
|
||||
```
|
||||
go build -o ./litmus-sdk ./bin/main.go
|
||||
```
|
||||
|
||||
- Populate the `attributes.yaml` with details of the chaos experiment (or chart). Use the [attributes.yaml.sample](/contribute/developer-guide/attributes.yaml.sample) as reference.
|
||||
|
||||
As an example, let us consider an experiment to kill one of the replicas of a nginx deployment. The attributes.yaml can be constructed like this:
|
||||
|
@ -158,17 +164,29 @@ The *generate_experiment.go* script is a simple way to bootstrap your experiment
|
|||
drwxr-xr-x 2 shubham shubham 4096 Jun 10 22:41 icons/
|
||||
```
|
||||
|
||||
- Proceed with construction of business logic inside the `sample-exec-chaos.go` file, by making
|
||||
the appropriate modifications listed below to achieve the desired effect:
|
||||
|
||||
- variables
|
||||
- entry & exit criteria checks for the experiment
|
||||
- helper utils in either [pkg](/pkg/) or new [base chaos libraries](/chaoslib)
|
||||
- Proceed with construction of business logic, by making the appropriate modifications listed below
|
||||
to achieve the desired effect:
|
||||
|
||||
- Pre-Chaos Checks: Additional experiment-specific checks to run before chaos. Checks should be
|
||||
added at the `@TODO: user PRE-CHAOS-CHECK` marker in the
|
||||
`experiments/<category>/<name>/experiment/<name>.go` file
|
||||
|
||||
- The chaoslib is created at `chaoslib/litmus/sample-exec-chaos/lib/sample-exec-chaos.go` path. It contains some pre-defined steps which run the `ChaosInject` command (explicitly provided as an ENV var in the experiment CR), which will induce chaos in the target application. It will wait for the given chaos duration and finally run the `ChaosKill` command (also provided as an ENV var) for cleanup purposes. Update this chaoslib to achieve the desired effect based on the use-case or reuse the other existing chaoslib.
|
||||
- Inject Chaos: The heart of your experiment, actually enact the chaos. By default, the generated
|
||||
code will call out to the generated library. However, if your experiment simply makes use of
|
||||
existing libraries, modify the chaos injection at the `@TODO: user INVOKE-CHAOSLIB` marker in the
|
||||
`experiments/<category>/<name>/experiment/<name>.go` file
|
||||
|
||||
- Create an experiment README explaining, briefly, the *what*, *why* & *how* of the experiment to aid users of this experiment.
|
||||
- Library Modifications: This is where the low level chaos execution code should live. Populate
|
||||
the `runChaos`, `experimentExecution`, and `injectChaos` functions as appropriate in the
|
||||
`chaosLib/litmus/<name>/lib/<name>.go` file.
|
||||
|
||||
- Post-Chaos Checks: Additional experiment-specific checks to run after chaos. Checks should be
|
||||
added at the `@TODO: user POST-CHAOS-CHECK` marker in the
|
||||
`experiments/<category>/<name>/experiment/<name>.go` file
|
||||
|
||||
- Create an experiment README explaining, briefly, the *what*, *why* & *how* of the experiment to aid users of this experiment. This README
|
||||
should live at `experiments/<category>/<name>/README.md`
|
||||
|
||||
### Steps to Test Experiment
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@ import (
|
|||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
|
||||
// Experiment contains steps to inject chaos
|
||||
func Experiment(clients clients.ClientSets){
|
||||
|
||||
|
@ -69,6 +70,7 @@ func Experiment(clients clients.ClientSets){
|
|||
// Calling AbortWatcher go routine, it will continuously watch for the abort signal and generate the required events and result
|
||||
go common.AbortWatcher(experimentsDetails.ExperimentName, clients, &resultDetails, &chaosDetails, &eventsDetails)
|
||||
|
||||
// @TODO: user PRE-CHAOS-CHECK
|
||||
// ADD A PRE-CHAOS CHECK OF YOUR CHOICE HERE
|
||||
// POD STATUS CHECKS FOR THE APPLICATION UNDER TEST AND AUXILIARY APPLICATIONS ARE ADDED BY DEFAULT
|
||||
|
||||
|
@ -121,6 +123,7 @@ func Experiment(clients clients.ClientSets){
|
|||
// INVOKE THE CHAOSLIB OF YOUR CHOICE HERE, WHICH WILL CONTAIN
|
||||
// THE BUSINESS LOGIC OF THE ACTUAL CHAOS
|
||||
// IT CAN BE A NEW CHAOSLIB YOU HAVE CREATED SPECIALLY FOR THIS EXPERIMENT OR ANY EXISTING ONE
|
||||
// @TODO: user INVOKE-CHAOSLIB
|
||||
|
||||
// Including the litmus lib
|
||||
switch experimentsDetails.ChaosLib {
|
||||
|
@ -138,6 +141,7 @@ func Experiment(clients clients.ClientSets){
|
|||
return
|
||||
}
|
||||
|
||||
// @TODO: user POST-CHAOS-CHECK
|
||||
// ADD A POST-CHAOS CHECK OF YOUR CHOICE HERE
|
||||
// POD STATUS CHECKS FOR THE APPLICATION UNDER TEST AND AUXILIARY APPLICATIONS ARE ADDED BY DEFAULT
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@ import (
|
|||
"github.com/pkg/errors"
|
||||
logrus "github.com/sirupsen/logrus"
|
||||
apiv1 "k8s.io/api/core/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
)
|
||||
|
||||
|
@ -26,7 +27,12 @@ func CheckNodeStatus(nodes string, timeout, delay int, clients clients.ClientSet
|
|||
for index := range targetNodes {
|
||||
node, err := clients.KubeClient.CoreV1().Nodes().Get(targetNodes[index], metav1.GetOptions{})
|
||||
if err != nil {
|
||||
return err
|
||||
if apierrors.IsNotFound(err) {
|
||||
log.Infof("[Info]: The %v node is not exist", targetNodes[index])
|
||||
continue
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
}
|
||||
nodeList.Items = append(nodeList.Items, *node)
|
||||
}
|
||||
|
|
|
@ -96,6 +96,7 @@ func SetHelperData(chaosDetails *types.ChaosDetails, clients clients.ClientSets)
|
|||
// Get Labels
|
||||
labels := pod.ObjectMeta.Labels
|
||||
delete(labels, "controller-uid")
|
||||
delete(labels, "job-name")
|
||||
chaosDetails.Labels = labels
|
||||
|
||||
// Get Resource Requirements
|
||||
|
|
Loading…
Reference in New Issue