// dartboard/test/utils/helpers/helpers.go

package helpers

import (
	"context"
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/pkg/errors"

	"github.com/git-ival/dartboard/test/utils/grafanautils"
	"github.com/git-ival/dartboard/test/utils/imageutils"
	"github.com/git-ival/dartboard/test/utils/ranchermonitoring"
	"github.com/git-ival/dartboard/test/utils/rancherprofiling"
	gapi "github.com/grafana/grafana-api-golang-client"
	provV1 "github.com/rancher/rancher/pkg/apis/provisioning.cattle.io/v1"
	"github.com/rancher/shepherd/clients/rancher"
	mgmtV3 "github.com/rancher/shepherd/clients/rancher/generated/management/v3"
	"github.com/rancher/shepherd/extensions/clusters"
	"github.com/rancher/shepherd/extensions/kubeconfig"
	"github.com/rancher/shepherd/extensions/kubectl"
	"github.com/rancher/shepherd/pkg/session"
	"github.com/sirupsen/logrus"
	log "github.com/sirupsen/logrus"
	"github.com/stretchr/testify/require"
	corev1 "k8s.io/api/core/v1"
	k8sErrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	restclient "k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
	"sigs.k8s.io/yaml"
)

// ConfigMapGVR returns the GroupVersionResource for "configmaps" in the core
// v1 API group.
func ConfigMapGVR() schema.GroupVersionResource {
	coreGV := corev1.SchemeGroupVersion
	return coreGV.WithResource("configmaps")
}

// V1ClusterGVR returns the GroupVersionResource for "clusters" in the
// provisioning v1 API group.
func V1ClusterGVR() schema.GroupVersionResource {
	provisioningGV := provV1.SchemeGroupVersion
	return provisioningGV.WithResource("clusters")
}

// ScreenshotParams groups the settings used when capturing a web page as an
// image (or HTML copy).
type ScreenshotParams struct {
	// URL is the address of the page to capture.
	URL           string
	// ImageFilePath is the destination path of the captured output. In this
	// file it is built without an extension; the caller appends ".png" or
	// ".html" when writing.
	ImageFilePath string
	// WindowSize holds the two browser window dimensions.
	// NOTE(review): presumably {width, height} in pixels — confirm against imageutils.
	WindowSize    [2]int
	// Selector optionally names the page element to capture; PprofEndpoints
	// leaves it empty.
	Selector      string
	// Timeout bounds how long a capture may take.
	// NOTE(review): units are not visible here (presumably seconds) — confirm.
	Timeout       int
	// Cookies are cookie strings to send with the request.
	// NOTE(review): exact string format is defined by imageutils — confirm.
	Cookies       []string
}

// PprofEndpoints returns the pprof web UI views that can be captured, keyed by
// view name ("graph", "top", "flame", "peek", "source"). Only the URL of each
// entry is populated; every other ScreenshotParams field is left at its zero
// value.
//
// No Selector is set on purpose: chromedp can sometimes fail to find them. The
// selectors that were tried are listed next to each view for reference.
func PprofEndpoints() map[string]ScreenshotParams {
	uiRoot := "http://" + rancherprofiling.BasePprofAddress + "/ui/"

	// Each view is the pprof UI root plus a (possibly empty) sub-path.
	views := []struct {
		name string
		path string
	}{
		// Selector: "div#graph"
		{name: "graph", path: ""},
		// Selector: "table#toptable"
		{name: "top", path: "top"},
		// Not able to programmatically retrieve a screenshot of this view via
		// the browser, have to resort to html. Selector: "div#stack-chart"
		{name: "flame", path: "flamegraph"},
		// Selector: "div#content"
		{name: "peek", path: "peek"},
		// Selector: "div#content"
		{name: "source", path: "source"},
	}

	endpoints := make(map[string]ScreenshotParams, len(views))
	for _, view := range views {
		endpoints[view.name] = ScreenshotParams{URL: uiRoot + view.path}
	}
	return endpoints
}

// GetAllRancherLogs fetches the logs of the "rancher" container of pod podName
// in the "cattle-system" namespace of cluster clusterID, limited to entries
// emitted since the given time and with timestamps enabled. It returns
// whatever kubeconfig.GetPodLogsWithOpts returns.
func GetAllRancherLogs(r *rancher.Client, clusterID string, podName string, since metav1.Time) (string, error) {
	log.Infof("Collecting Rancher logs since: %s", since.String())

	opts := corev1.PodLogOptions{
		Container:  "rancher",
		Timestamps: true,
		SinceTime:  &since,
	}
	return kubeconfig.GetPodLogsWithOpts(r, clusterID, podName, "cattle-system", "", &opts)
}

// CreateCustomMonitoringDashboards creates one resource in the
// "cattle-dashboards" namespace of the "local" cluster for every regular file
// found directly inside configMapsDir. Each file is read as YAML, converted to
// JSON and submitted using the ConfigMap GroupVersionResource.
//
// Sub-directories are skipped, and a resource that already exists is logged
// and skipped. Any other failure aborts the test through require.NoError, so
// the returned error is always nil.
func CreateCustomMonitoringDashboards(t *testing.T, ts *session.Session, client *rancher.Client, configMapsDir string) error {
	files, err := os.ReadDir(configMapsDir)
	require.NoError(t, err)

	for _, file := range files {
		if file.IsDir() {
			continue
		}

		f, err := os.ReadFile(configMapsDir + "/" + file.Name())
		require.NoError(t, err)
		dashboardYAML, err := yaml.YAMLToJSON(f)
		require.NoError(t, err)

		_, err = kubectl.CreateUnstructured(ts, client, dashboardYAML, "local", "cattle-dashboards", ConfigMapGVR())
		if k8sErrors.ReasonForError(err) == metav1.StatusReasonAlreadyExists {
			// Log the file name rather than the os.DirEntry value itself.
			logrus.Infof("configmap already exists for %v, skipping", file.Name())
			continue
		}
		require.NoError(t, err)
	}
	return nil
}

// WriteMonitoringSnapshotsToPNGs creates a Grafana snapshot for every dashboard
// UID in dashboardUIDs over the [from, to] time range and saves a screenshot of
// each snapshot page to "<outputPath>/<prefix><uid><suffix>.png".
//
// For each dashboard with no file at "<configMapsDir>/<uid>", the dashboard's
// ConfigMap is fetched from the "cattle-dashboards" namespace of clusterID;
// a failed fetch aborts the test through require.NoError.
//
// A dashboard whose snapshot cannot be created is logged and skipped. A failed
// screenshot is logged and processing continues with the next dashboard.
//
// It returns the URLs of all snapshots that were created (whether or not the
// screenshot succeeded) and the most recent screenshot error, or nil if every
// screenshot was written.
func WriteMonitoringSnapshotsToPNGs(t *testing.T, ts *session.Session, client *rancher.Client, gapi *gapi.Client, from time.Time, to time.Time, outputPath, clusterID, configMapsDir, prefix, suffix string, dashboardUIDs []string) ([]string, error) {
	var snapshotURLs []string
	// lastErr is assigned explicitly so that no loop-scoped err can shadow the
	// value that is returned.
	var lastErr error

	for _, d := range dashboardUIDs {
		// Ensure ConfigMaps for each Dashboard exist
		configMapPath := configMapsDir + "/" + d
		// In case not all dashboards have custom configmaps in local dir, get from cluster.
		// os.Stat reports a missing path as os.ErrNotExist (never os.ErrExist).
		if _, statErr := os.Stat(configMapPath); errors.Is(statErr, os.ErrNotExist) {
			_, getErr := kubectl.GetUnstructured(ts, client, d, clusterID, "cattle-dashboards", ConfigMapGVR())
			require.NoError(t, getErr)
		}

		snapshotResponse, err := grafanautils.GetDashboardSnapshot(gapi, from, to, d, 9000, false)
		if err != nil {
			log.Warnf("Failed to retrieve dashboard snapshot for %v using time range from %v to %v. Skipping dashboard.", d, from, to)
			continue
		}

		var cookies []string
		imageutils.HTTPCookiesToSlice(gapi.Cookies(), &cookies)
		snapshotURL := "https://" + client.RancherConfig.Host + ranchermonitoring.GrafanaSnapshotRoute + snapshotResponse.Key
		snapshotURLs = append(snapshotURLs, snapshotURL)

		filePath := fmt.Sprintf("%s/%s%s%s.png", outputPath, prefix, d, suffix)
		if err := imageutils.URLScreenshotToPNG(snapshotURL, filePath, ranchermonitoring.PanelContentSelector, nil, 60, cookies...); err != nil {
			log.Warnf("Failed to write snapshotURL (%s) to file (%s): %v", snapshotURL, filePath, err)
			lastErr = err
		}
	}
	return snapshotURLs, lastErr
}

// LogV1ClusterProvisioningTime computes how long a cluster took to provision:
// the time between cluster.Created and the LastUpdateTime of the "Ready"
// condition on the matching provisioning cluster. The provisioning cluster is
// looked up by cluster.ID in the "fleet-default" namespace.
//
// When numClusters is non-nil, a "<numClusters> Clusters: <duration>" line is
// appended to "<outputPath>/provisioning-times.log".
//
// It returns an error if either timestamp cannot be parsed as RFC 3339, if the
// provisioning cluster cannot be fetched, or if it has no "Ready" condition.
//
// NOTE(review): t, ts, clusterID, namespace, configMapsDir and suffix are not
// used; in particular the lookup namespace is hard-coded rather than taken
// from the namespace parameter — confirm this is intended.
func LogV1ClusterProvisioningTime(t *testing.T, ts *session.Session, client *rancher.Client, cluster *mgmtV3.Cluster, numClusters *int, outputPath, clusterID, namespace, configMapsDir, suffix string) (time.Duration, error) {
	createdTime, err := time.Parse(time.RFC3339, cluster.Created)
	if err != nil {
		return 0, err
	}

	v1Cluster, _, err := clusters.GetProvisioningClusterByName(client, cluster.ID, "fleet-default")
	if err != nil {
		return 0, err
	}

	var readyTime time.Time
	foundReady := false
	for _, condition := range v1Cluster.Status.Conditions {
		if condition.Type != "Ready" {
			continue
		}
		readyTime, err = time.Parse(time.RFC3339, condition.LastUpdateTime)
		if err != nil {
			return 0, err
		}
		log.Infof("Cluster Created time is: %s", createdTime.Format(time.RFC3339))
		log.Infof("Cluster Ready time is: %s", condition.LastUpdateTime)
		foundReady = true
		break
	}
	if !foundReady {
		// Without a Ready timestamp readyTime would be the zero time and the
		// computed duration would be meaningless.
		return 0, fmt.Errorf("cluster %q has no Ready condition, cannot compute provisioning time", cluster.ID)
	}

	provisioningTimeDiff := readyTime.Sub(createdTime)
	if numClusters != nil {
		filePath := outputPath + "/provisioning-times.log"
		// Dereference the pointer: %d on the pointer itself prints an address.
		text := fmt.Sprintf("%d Clusters: %s\n", *numClusters, provisioningTimeDiff)
		WriteStringToFile(text, filePath)
	}
	log.Infof("Provisioning took: %s", provisioningTimeDiff)
	return provisioningTimeDiff, nil
}

// WriteSnapshotURLsToFiles runs WriteMonitoringSnapshotsToPNGs with the given
// arguments and then writes the returned snapshot URLs, one per line, to
// "<outputPath>/<prefix>snapshots<suffix>.txt" (created or truncated).
//
// All failures are logged rather than returned. If the file cannot be created
// nothing is written; a failed write of one URL does not stop the remaining
// URLs from being attempted.
func WriteSnapshotURLsToFiles(t *testing.T, ts *session.Session, client *rancher.Client, gapi *gapi.Client, from time.Time, to time.Time, outputPath, clusterID, configMapsDir, prefix, suffix string, dashboardUIDs []string) {
	snapshotURLs, err := WriteMonitoringSnapshotsToPNGs(t, ts, client, gapi, from, to, outputPath, clusterID, configMapsDir, prefix, suffix, dashboardUIDs)
	if err != nil {
		log.Infof("error writing snapshots to PNG: %v", err)
	}

	filePath := outputPath + "/" + prefix + "snapshots" + suffix + ".txt"
	f, err := os.Create(filePath)
	if err != nil {
		log.Infof("error creating file with path (%s): %v", filePath, err)
		// There is no usable file handle, so there is nothing to write to.
		return
	}
	defer func() {
		if err := f.Close(); err != nil {
			log.Infof("error closing file (%s): %v", filePath, err)
		}
	}()

	for _, url := range snapshotURLs {
		if _, err := f.WriteString(url + "\n"); err != nil {
			log.Infof("error writing bytes to file (%s): %v", filePath, err)
		}
	}
}

// WriteStringToFile appends s to the file at dest, creating the file with mode
// 0644 if it does not exist. Failures to open, write or close the file are
// logged as warnings; nothing is returned to the caller.
func WriteStringToFile(s string, dest string) {
	const openFlags = os.O_APPEND | os.O_CREATE | os.O_WRONLY

	file, openErr := os.OpenFile(dest, openFlags, 0644)
	if openErr != nil {
		log.Warnf("error creating file (%s): %v", dest, openErr)
		return
	}

	log.Info("Writing to file at: ", dest)
	if _, writeErr := file.WriteString(s); writeErr != nil {
		log.Warnf("error writing to file (%s): %v", dest, writeErr)
	}

	// Close even after a failed write so the descriptor is released.
	if closeErr := file.Close(); closeErr != nil {
		log.Warnf("error closing file (%s): %v", file.Name(), closeErr)
	}
}

// CollectProfileScreenshots starts a server for the profile at profilePath via
// rancherprofiling.StartServeProfile and captures every view listed by
// PprofEndpoints. The "flame" view is saved as "<imagePath>-flame.html"; every
// other view is saved as a screenshot at "<imagePath>-<view>.png". The server
// process is killed when the function returns.
//
// A view that cannot be reached or captured is logged and skipped so the
// remaining views are still attempted. The first error encountered is
// returned, or nil when every view was captured.
func CollectProfileScreenshots(profilePath, imagePath string) error {
	profCmd := rancherprofiling.StartServeProfile(profilePath)
	defer func() {
		if err := profCmd.Process.Kill(); err != nil {
			log.Errorf("CollectProfileScreenshots: Error killing command (%s): %v", profCmd.Args, err)
		} else {
			log.Infof("CollectProfileScreenshots: Successfully killed command (%s)", profCmd.Args)
		}
	}()

	// Map iteration order is random, so keeping only the last iteration's
	// error would let a later success hide an earlier failure. Remember the
	// first failure instead.
	var firstErr error
	recordErr := func(err error) {
		if firstErr == nil {
			firstErr = err
		}
	}

	for k, endpoint := range PprofEndpoints() {
		endpoint.ImageFilePath = fmt.Sprintf("%s-%s", imagePath, k)
		endpoint.WindowSize = [2]int{2560, 1440}
		// NOTE(review): Timeout is set here but the screenshot call below is
		// given the literal 25 — confirm which value is intended.
		endpoint.Timeout = 30

		// Make sure the view is being served before trying to capture it.
		if _, err := imageutils.GetURLWithRetry(endpoint.URL, 5); err != nil {
			log.Errorf("CollectProfileScreenshots: Failed to get URL (%s), skipping URL: %v", endpoint.URL, err)
			recordErr(err)
			continue
		}

		if k == "flame" {
			if err := imageutils.LocalHTMLFromURL(endpoint.URL, endpoint.ImageFilePath+".html"); err != nil {
				log.Errorf("CollectProfileScreenshots: Failed to get flamegraph html: %v", err)
				recordErr(err)
			}
			continue
		}

		if err := imageutils.URLScreenshotToPNG(endpoint.URL, endpoint.ImageFilePath+".png", endpoint.Selector, &endpoint.WindowSize, 25, ""); err != nil {
			log.Errorf("CollectProfileScreenshots: Failed to get image of url (%s): %v", endpoint.URL, err)
			recordErr(err)
		}
	}
	return firstErr
}

// CollectRancherMetricsAndArtifacts gathers diagnostics for every running
// Rancher pod (label "app=rancher" in "cattle-system" of clusterID) and then
// captures the monitoring dashboards:
//
//  1. For each pod, a memory profile and a CPU profile are stored under
//     "<outputPath>/pprof/" together with screenshots of their pprof views.
//  2. For each pod whose profiles were collected, the logs of its "rancher"
//     container since start are fetched in a background goroutine and stored
//     under outputPath.
//  3. Once all log collections finish, Grafana snapshots of dashboardUIDs for
//     the [start, end] range are written as PNGs.
//
// A pod whose memory or CPU profile cannot be fetched is logged and skipped
// (including its logs). Failing to list the pods aborts the test through
// require.NoError. If the log collections do not finish within 30 minutes of
// the function being entered, a warning is logged and the function returns
// without writing the dashboard snapshots.
//
// NOTE(review): pod logs are requested from the "local" cluster rather than
// clusterID — confirm this is intended.
func CollectRancherMetricsAndArtifacts(t *testing.T, ts *session.Session, client *rancher.Client, gapi *gapi.Client,
	restClient *restclient.Config, clientConfig *clientcmd.ClientConfig, outputPath, prefix, clusterID, configMapsDir string,
	numClusters int, start metav1.Time, end metav1.Time, dashboardUIDs []string) {
	log.Infof("Collecting metrics and other artifacts at %d Clusters", numClusters)
	podNames, err := kubeconfig.GetPodNames(client, clusterID, "cattle-system", &metav1.ListOptions{
		LabelSelector: "app=rancher",
		FieldSelector: "status.phase=Running",
	})
	require.NoError(t, err)
	log.Info("Pod Names: ", podNames)
	clustersSuffix := fmt.Sprintf("-%d-clusters", numClusters)

	// Create a context with timeout
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
	defer cancel()
	// Buffered so that log goroutines can always signal completion without
	// blocking, even after this function has stopped waiting for them.
	done := make(chan string, len(podNames))
	pprofBaseDir := outputPath + "/pprof/"

	// The directory is the same for every pod, so create it once.
	if err := os.MkdirAll(pprofBaseDir, 0755); err != nil {
		log.Errorf("Failed to create pprof file directory: %v", err)
	}

	// Number of log goroutines actually started. Pods skipped because of a
	// profile failure never signal on done, so waiting for len(podNames)
	// signals would block until the timeout.
	startedLogCollectors := 0

	for _, podName := range podNames {
		memProfilePath := fmt.Sprintf("%s%s%s-%s%s", pprofBaseDir, prefix, rancherprofiling.MemProfileFileName, podName, clustersSuffix+rancherprofiling.ProfileFileExtension)
		log.Infof("Getting mem profile for pod %s, storing at %s", podName, memProfilePath)
		err = rancherprofiling.GetRancherMemProfile(*restClient, *clientConfig, podName, memProfilePath)
		if err != nil {
			log.Errorf("Failed to get Rancher memory profile: %v", err)
			continue
		}
		memProfileImagePath := fmt.Sprintf("%s%s%s-%s%s", pprofBaseDir, prefix, rancherprofiling.MemProfileFileName, podName, clustersSuffix)
		log.Infof("Getting mem profile screenshots for %s, storing at %s", podName, pprofBaseDir)
		// Screenshot failures are logged inside CollectProfileScreenshots and
		// must not stop the remaining collection steps.
		CollectProfileScreenshots(memProfilePath, memProfileImagePath)

		cpuProfilePath := fmt.Sprintf("%s%s%s-%s%s", pprofBaseDir, prefix, rancherprofiling.CPUProfileFileName, podName, clustersSuffix+rancherprofiling.ProfileFileExtension)
		log.Infof("Getting cpu profile for pod %s, storing at %s", podName, cpuProfilePath)
		err = rancherprofiling.GetRancherCPUProfile(*restClient, *clientConfig, podName, cpuProfilePath)
		if err != nil {
			log.Errorf("Failed to get Rancher CPU profile: %v", err)
			continue
		}
		cpuProfileImagePath := fmt.Sprintf("%s%s%s-%s%s", pprofBaseDir, prefix, rancherprofiling.CPUProfileFileName, podName, clustersSuffix)
		log.Infof("Getting cpu profile screenshots for %s, storing at %s", podName, pprofBaseDir)
		CollectProfileScreenshots(cpuProfilePath, cpuProfileImagePath)

		// Get rancher pod logs
		logFileDest := fmt.Sprintf("%s/%s%s%s-%s", outputPath, prefix, podName, clustersSuffix, start.String()+".log")
		podLogOptions := &corev1.PodLogOptions{
			Container: "rancher",
			SinceTime: &start,
		}
		startedLogCollectors++
		go func(ctx context.Context, podName, logFileDest string, podLogOptions *corev1.PodLogOptions) {
			defer func() { done <- podName }() // Signal completion of goroutine
			log.Info("Getting pod logs")
			_, err := kubeconfig.GetPodLogsWithContext(ctx, client, "local", podName, "cattle-system", "", logFileDest, true, podLogOptions)
			if err != nil {
				log.Warnf("error getting pod logs for pod (%s): %v", podName, err)
			}
		}(ctx, podName, logFileDest, podLogOptions)
	}

	// Wait for all started goroutines to complete or timeout
	for i := 0; i < startedLogCollectors; i++ {
		select {
		case podName := <-done:
			log.Infof("Completed getting pod logs for %s", podName)
		case <-ctx.Done():
			log.Warn("Timeout waiting for podlog goroutines to complete")
			return
		}
	}

	// Prefer a freshly set-up Grafana client, but fall back to the one passed
	// in if that fails, so the snapshot step is not handed a client from a
	// failed setup call.
	grafanaClient := gapi
	_, newGAPI, err := ranchermonitoring.SetupClients(client.RancherConfig.Host, client.RancherConfig.AdminToken, client.RancherConfig.AdminPassword)
	if err != nil {
		log.Errorf("Could not re-setup prometheus and/or grafana clients: %v", err)
	} else {
		grafanaClient = newGAPI
	}

	if _, err := WriteMonitoringSnapshotsToPNGs(t, ts, client, grafanaClient, start.Time, end.Time, outputPath, clusterID, configMapsDir, prefix, clustersSuffix, dashboardUIDs); err != nil {
		log.Warnf("error writing monitoring snapshots to PNG: %v", err)
	}
}