elemental-operator/cmd/support/main.go

629 lines
18 KiB
Go

/*
Copyright © 2022 - 2025 SUSE LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"archive/tar"
"bufio"
"bytes"
"compress/gzip"
"encoding/json"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"
"github.com/spf13/cobra"
"github.com/spf13/viper"
"sigs.k8s.io/yaml" // See: https://github.com/rancher/system-agent/blob/main/pkg/config/config.go
systemagent "github.com/rancher/elemental-operator/internal/system-agent"
"github.com/rancher/elemental-operator/pkg/log"
"github.com/rancher/elemental-operator/pkg/version"
)
const (
k3sKubeConfig = "/etc/rancher/k3s/k3s.yaml"
k3sKubectl = "/usr/local/bin/kubectl"
rkeKubeConfig = "/etc/rancher/rke2/rke2.yaml"
rkeConfigPath = "/etc/rancher/rke2/config.yaml.d/50-rancher.yaml"
rkeDataDirDefault = "/var/lib/rancher/rke2"
rkeKubectl = "/bin/kubectl"
elementalAgentPlanDir = "/var/lib/elemental/agent/applied/"
rancherAgentPlanDirDefault = "/var/lib/rancher/agent/applied/"
rancherAgentConf = "/etc/rancher/agent/config.yaml"
elementalAgentConf = "/etc/rancher/elemental/agent/config.yaml"
osRelease = "/etc/os-release"
hostnameFile = "/etc/hostname"
resolvConf = "/etc/resolv.conf"
oemDir = "/oem/"
systemOEMDir = "/system/oem"
)
type RKEConfig struct {
DataDir string `json:"data-dir,omitempty" yaml:"data-dir,omitempty"`
}
func getServices() []string {
return []string{
"elemental-system-agent",
"rancher-system-agent",
"k3s",
"rke2",
"cos-setup-boot",
"cos-setup-fs",
"cos-setup-initramfs",
"cos-setup-network",
"cos-setup-reconcile",
"cos-setup-rootfs",
"cos-immutable-rootfs",
"elemental-setup-boot",
"elemental-setup-fs",
"elemental-setup-initramfs",
"elemental-setup-network",
"elemental-setup-reconcile",
"elemental-setup-rootfs",
"elemental-rootfs",
"elemental",
"elemental-register",
"elemental-register-install",
"elemental-register-reset",
"NetworkManager",
}
}
var (
hostName = "unknown"
kubectlconfig = ""
kubectl = ""
)
func main() {
cmd := &cobra.Command{
Use: "elemental-support",
Short: "Gathers logs about the running system",
Long: "elemental-support tries to gather as much info as possible with logs about the running system for troubleshooting purposes",
RunE: func(_ *cobra.Command, _ []string) error {
if viper.GetBool("debug") {
log.EnableDebugLogging()
}
if viper.GetBool("version") {
log.Infof("Support version %s, commit %s, commit date %s", version.Version, version.Commit, version.CommitDate)
return nil
}
log.Infof("Support version %s, commit %s, commit date %s", version.Version, version.Commit, version.CommitDate)
return run()
},
}
cmd.PersistentFlags().BoolP("debug", "d", false, "Enable debug logging")
cmd.PersistentFlags().BoolP("version", "v", false, "print version and exit")
_ = viper.BindPFlag("debug", cmd.PersistentFlags().Lookup("debug"))
_ = viper.BindPFlag("version", cmd.PersistentFlags().Lookup("version"))
if err := cmd.Execute(); err != nil {
log.Fatalln(err)
}
}
func run() (err error) {
tempDir, err := os.MkdirTemp("", "")
if err != nil {
return err
}
defer func(path string) {
_ = os.RemoveAll(path)
}(tempDir)
// Support version
log.Info("Getting elemental-support version")
_ = os.WriteFile(
fmt.Sprintf("%s/elemental-support.version", tempDir),
[]byte(fmt.Sprintf("Support version %s, commit %s, commit date %s", version.Version, version.Commit, version.CommitDate)),
os.ModePerm)
// copy a bit of system information
for _, f := range []string{osRelease, resolvConf, hostnameFile} {
log.Infof("Copying %s", f)
copyFile(f, tempDir)
}
log.Infof("Copying %s", elementalAgentConf)
copyFileWithAltName(elementalAgentConf, tempDir, "elemental-agent-config.yaml")
log.Infof("Copying %s", rancherAgentConf)
copyFileWithAltName(rancherAgentConf, tempDir, "rancher-agent-config.yaml")
rancherAgentPlanDir := getSystemAgentAppliedDir(rancherAgentConf)
// TODO: Flag to skip certain files? They could have passwords set in them so maybe we need to search and replace
// any sensitive fields?
for _, f := range []string{elementalAgentPlanDir, rancherAgentPlanDir, systemOEMDir, oemDir} {
log.Infof("Copying dir %s", f)
// Full dest is the /tmp dir + the full real path of the source, so we store the paths as they are in the node
fullDest := filepath.Join(tempDir, f)
copyFilesInDir(f, fullDest)
}
for _, service := range getServices() {
log.Infof("Getting service log %s", service)
getServiceLog(service, tempDir)
}
// get binaries versions
log.Infof("Getting elemental-cli version")
cmd := exec.Command("elemental", "version")
out, err := cmd.CombinedOutput()
if err != nil {
log.Warningf("Could not get elemental version: %s", out)
}
_ = os.WriteFile(fmt.Sprintf("%s/elemental.version", tempDir), out, os.ModePerm)
// Should probably have a version command
log.Infof("Getting elemental-operator version")
cmd = exec.Command("elemental-operator", "operator", "--namespace", "fake", "--operator-image", "whatever")
// Don't check errors here, we expect it to fail because we pass fake stuff until we get a version command
out, _ = cmd.CombinedOutput()
_ = os.WriteFile(fmt.Sprintf("%s/elemental-operator.version", tempDir), out, os.ModePerm)
// Should probably have a version command
log.Infof("Getting elemental-register version")
cmd = exec.Command("elemental-register", "/tmp/nope")
// Don't check errors here, we expect it to fail because we pass fake stuff until we get a version command
out, _ = cmd.CombinedOutput()
_ = os.WriteFile(fmt.Sprintf("%s/elemental-register.version", tempDir), out, os.ModePerm)
// Check if we have a kubeconfig before starting
kubectlconfig, _ = getKubeConfig()
kubectl, _ = getKubectl()
if kubectlconfig != "" && kubectl != "" {
// get k8s info
for _, crd := range []string{"pods", "secrets", "nodes", "services", "deployments", "plans", "apps", "jobs"} {
log.Infof("Getting k8s info for %s", crd)
getK8sResource(crd, tempDir)
describeK8sResource(crd, tempDir)
}
// get k8s logs
for _, namespace := range []string{"cattle-system", "kube-system", "ingress-nginx", "calico-system", "cattle-fleet-system"} {
log.Infof("Getting k8s logs for namespace %s", namespace)
getK8sPodsLogs(namespace, tempDir)
}
} else {
log.Warningf("No kubeconfig available, skipping getting k8s items")
}
// All done, pack it up into a nice gzip file
hostName, err = os.Hostname()
if err != nil {
log.Warningf("Could not get hostname: %s", err)
}
finalFileName := fmt.Sprintf("%s-%s.tar.gz", hostName, time.Now().Format(time.RFC3339))
finalFileName = strings.Replace(finalFileName, ":", "", -1)
file, err := os.Create(finalFileName)
defer func(file *os.File) {
_ = file.Close()
}(file)
log.Infof("Creating final file")
var buf bytes.Buffer
err = compress(tempDir, &buf)
if err != nil {
log.Errorf("Failed to compress: %s", err)
return err
}
_, err = io.Copy(file, &buf)
if err != nil {
log.Errorf("Failed to copy: %s", err)
return err
}
log.Infof("All done. File created at %s", finalFileName)
return nil
}
func compress(src string, buf io.Writer) error {
zr := gzip.NewWriter(buf)
tw := tar.NewWriter(zr)
fi, err := os.Stat(src)
if err != nil {
return err
}
mode := fi.Mode()
if mode.IsRegular() {
header, err := tar.FileInfoHeader(fi, src)
if err != nil {
return err
}
if err := tw.WriteHeader(header); err != nil {
return err
}
data, err := os.Open(src)
if err != nil {
return err
}
if _, err := io.Copy(tw, data); err != nil {
return err
}
} else if mode.IsDir() {
_ = filepath.Walk(src, func(file string, fi os.FileInfo, _ error) error {
// generate tar header
var header, _ = tar.FileInfoHeader(fi, file)
header.Name = strings.Replace(filepath.ToSlash(file), "/tmp/", "elemental-support-", 1)
// write header
if err := tw.WriteHeader(header); err != nil {
return err
}
// if not a dir, write file content
if !fi.IsDir() {
data, err := os.Open(file)
if err != nil {
return err
}
if _, err := io.Copy(tw, data); err != nil {
return err
}
}
return nil
})
} else {
return fmt.Errorf("error: file type not supported")
}
// produce tar
if err := tw.Close(); err != nil {
return err
}
// produce gzip
if err := zr.Close(); err != nil {
return err
}
//
return nil
}
func getKubeConfig() (string, error) {
// First use it the env var if available
kubectlconfig = os.Getenv("KUBECONFIG")
if kubectlconfig == "" {
if existsNoWarn(k3sKubeConfig) {
log.Infof("Found k3s kubeconfig at %s", k3sKubeConfig)
kubectlconfig = k3sKubeConfig
}
if existsNoWarn(rkeKubeConfig) {
log.Infof("Found rke kubeconfig at %s", rkeKubeConfig)
kubectlconfig = rkeKubeConfig
}
// This should not happen as far as I understand
if existsNoWarn(k3sKubeConfig) && existsNoWarn(rkeKubeConfig) {
return "", errors.New("both kubeconfig exists for k3s and rke2, maybe the deployment is wrong")
}
}
return kubectlconfig, nil
}
func getK8sResource(resource string, dest string) {
var err error
cmd := exec.Command(
kubectl,
fmt.Sprintf("--kubeconfig=%s", kubectlconfig),
"get",
resource,
"--all-namespaces",
"-o",
"wide",
"--show-labels",
)
out, err := cmd.CombinedOutput()
if err != nil {
log.Warningf("Failed to get %s: %s", resource, err)
}
// We still want to write the output if the resource was not found or another issue occured as that can shed info on whats going on
_ = os.WriteFile(fmt.Sprintf("%s/%s-resource.log", dest, resource), out, os.ModePerm)
}
func describeK8sResource(resource string, dest string) {
var err error
cmd := exec.Command(
kubectl,
fmt.Sprintf("--kubeconfig=%s", kubectlconfig),
"describe",
resource,
"--all-namespaces",
)
out, err := cmd.CombinedOutput()
if err != nil {
log.Warningf("Failed to get %s", resource)
}
// We still want to write the output if the resource was not found or another issue occured as that can shed info on whats going on
_ = os.WriteFile(fmt.Sprintf("%s/%s-describe.log", dest, resource), out, os.ModePerm)
}
// getK8sPodsLogs will get ALL the logs for ALL the pods in a given namespace
func getK8sPodsLogs(namespace, dest string) {
var err error
var podNames []string
command := exec.Command(
kubectl,
fmt.Sprintf("--kubeconfig=%s", kubectlconfig),
"get",
"pods",
"-n",
namespace,
"-o",
"jsonpath='{.items[*].metadata.name}'",
)
out, err := command.CombinedOutput()
if err != nil {
log.Warningf("Failed to get pod names in namespace %s: %s", namespace, err)
}
cleanedOut := strings.Replace(string(out), "'", "", -1)
if len(cleanedOut) == 0 {
log.Warningf("No pods in namespace %s", namespace)
return
}
podNames = strings.Split(cleanedOut, " ")
for _, pod := range podNames {
log.Debugf("Getting logs for pod %s", pod)
if pod == "" {
continue
}
command := exec.Command(
kubectl,
fmt.Sprintf("--kubeconfig=%s", kubectlconfig),
"logs",
pod,
"-n",
namespace,
)
out, err := command.CombinedOutput()
if err != nil {
log.Warningf("Failed to get %s on namespace %s: %s", pod, namespace, err)
}
// We still want to write the output if the resource was not found or another issue occurred as that can shed info on what's going on
_ = os.WriteFile(fmt.Sprintf("%s/%s-%s-logs.log", dest, namespace, pod), out, os.ModePerm)
}
}
// copyFile copies a files from source to a destination path, keeping the name. Ignore any errors, we dont care
func copyFile(sourceFile string, destpath string) {
copyFileWithAltName(sourceFile, destpath, "")
}
// copyFile copies a files from source to a destination path, altering the name. Ignore any errors, we dont care
func copyFileWithAltName(sourceFile string, destPath string, altName string) {
if exists(sourceFile) {
stat, err := os.Stat(sourceFile)
if err != nil {
log.Warningf("File %s does not exists", sourceFile)
return
}
content, err := os.ReadFile(sourceFile)
if err != nil {
log.Warningf("Could not read file contents for %s", sourceFile)
return
}
destName := fmt.Sprintf("%s/%s", destPath, stat.Name())
if altName != "" {
destName = fmt.Sprintf("%s/%s", destPath, altName)
}
err = os.WriteFile(destName, content, os.ModePerm)
if err != nil {
log.Warningf("Could not write file %s", destName)
return
}
}
}
// exists checks for a path and returns a bool on its existence
func exists(path string) bool {
_, err := os.Stat(path)
if err != nil {
log.Warningf("path %s does not exist", path)
}
return err == nil
}
// existsNoWarn checks for a path and returns a bool on its existence. Does not raise warnings
func existsNoWarn(path string) bool {
_, err := os.Stat(path)
return err == nil
}
func serviceExists(service string) bool {
var out bytes.Buffer
cmd := exec.Command("systemctl", "list-units", "--full", "-all")
cmd.Stdout = &out
err := cmd.Run()
if err != nil {
log.Warning("Failed to list systemctl units")
return false
}
scan := bufio.NewScanner(&out)
for scan.Scan() {
if strings.Contains(scan.Text(), fmt.Sprintf("%s.service", service)) {
return true
}
}
return false
}
func getServiceLog(service string, dest string) {
var args []string
if serviceExists(service) {
args = append(args, "-u")
} else {
args = append(args, "-t")
}
args = append(args, service)
args = append(args, "-o")
args = append(args, "short-iso")
cmd := exec.Command("journalctl", args...)
out, err := cmd.CombinedOutput()
if err == nil {
_ = os.WriteFile(fmt.Sprintf("%s/%s.log", dest, service), out, os.ModePerm)
} else {
log.Warningf("Failed to get log for service %s", service)
}
}
// redactPasswords removes any occurrences of a passwd or password set in yaml/cloud-config files usually
func redactPasswords(input []byte) []byte {
passwd := regexp.MustCompile(`(passwd)([\s=\t\:]{1,3})(.*)`)
password := regexp.MustCompile(`(password)([\s=\t\:]{1,3})(.*)`)
redacted := passwd.ReplaceAll(input, []byte("$1$2*****"))
redacted = password.ReplaceAll(redacted, []byte("$1$2*****"))
return redacted
}
// copyFilesInDir copies all files in the source dir to destination path, keeping the name. Follows dirs inside dirs.
func copyFilesInDir(src, dest string) {
info, err := os.Lstat(src)
if err != nil {
log.Warningf("Error opening %s", src)
return
}
switch {
case info.IsDir():
copyDir(src, dest)
default:
copySingleFile(src, dest)
}
}
func copySingleFile(src string, dest string) {
log.Debugf("Copying %s into %s", src, dest)
original, err := os.ReadFile(src)
if err != nil {
log.Warningf("Failed to read: %s", err)
return
}
redacted := redactPasswords(original)
err = os.MkdirAll(filepath.Dir(dest), os.ModeDir|os.ModePerm)
if err != nil {
log.Warningf("Failed to create dir: %s", err)
return
}
err = os.WriteFile(dest, redacted, os.ModePerm)
if err != nil {
log.Warningf("Failed to write: %s", err)
return
}
}
func copyDir(srcdir string, destdir string) {
contents, err := os.ReadDir(srcdir)
if err != nil {
return
}
for _, content := range contents {
cs, cd := filepath.Join(srcdir, content.Name()), filepath.Join(destdir, content.Name())
copyFilesInDir(cs, cd)
}
}
func getKubectl() (string, error) {
if existsNoWarn(k3sKubectl) {
log.Infof("Found k3s kubectl at %s", k3sKubectl)
return k3sKubectl, nil
}
rkeKubectlPath := getRKEKubectlPath()
if existsNoWarn(rkeKubectlPath) {
log.Infof("Found rke kubectl at %s", rkeKubectlPath)
return rkeKubectlPath, nil
}
return "", errors.New("Cant find kubectl")
}
// getRKEKubectlPath tries to locate the RKE2 installed kubectl, parsing the RKE2 config.
// Note that this config has .yaml extension but is a JSON file.
// We try to parse both.
// See: https://github.com/rancher/rancher/issues/45865
func getRKEKubectlPath() string {
if existsNoWarn(rkeConfigPath) {
log.Infof("Found RKE config: %s", rkeConfigPath)
rkeConfigBytes, err := os.ReadFile(rkeConfigPath)
if err != nil {
log.Errorf("Could not read file '%s': %s", rkeConfigPath, err.Error())
return filepath.Join(rkeDataDirDefault, rkeKubectl)
}
rkeConfig := RKEConfig{}
err = json.Unmarshal(rkeConfigBytes, &rkeConfig)
if err != nil {
log.Errorf("Could not unmarshal JSON '%s': %s", rkeConfigPath, err.Error())
}
if len(rkeConfig.DataDir) > 0 {
kubectlPath := filepath.Join(rkeConfig.DataDir, rkeKubectl)
log.Infof("Determined kubectl location: %s", kubectlPath)
return kubectlPath
}
// If JSON didn't work, try YAML
err = yaml.Unmarshal(rkeConfigBytes, &rkeConfig)
if err != nil {
log.Errorf("Could not unmarshal YAML '%s': %s", rkeConfigPath, err.Error())
}
if len(rkeConfig.DataDir) > 0 {
kubectlPath := filepath.Join(rkeConfig.DataDir, rkeKubectl)
log.Infof("Determined kubectl location: %s", kubectlPath)
return kubectlPath
}
}
return filepath.Join(rkeDataDirDefault, rkeKubectl)
}
// getSystemAgentAppliedDir tries to locate the Rancher System Agent applied plans dir,
// parsing the system agent config.
func getSystemAgentAppliedDir(systemAgentConfigPath string) string {
if existsNoWarn(systemAgentConfigPath) {
log.Infof("Found Rancher System Agent config: %s", systemAgentConfigPath)
configBytes, err := os.ReadFile(systemAgentConfigPath)
if err != nil {
log.Errorf("Could not read file '%s': %s", systemAgentConfigPath, err.Error())
return rancherAgentPlanDirDefault
}
agentConfig := systemagent.AgentConfig{}
err = yaml.Unmarshal(configBytes, &agentConfig)
if err != nil {
log.Errorf("Could not unmarshal file '%s': %s", systemAgentConfigPath, err.Error())
return rancherAgentPlanDirDefault
}
log.Infof("Determined Rancher System Agent Applied Dir to be: %s", agentConfig.AppliedPlanDir)
return agentConfig.AppliedPlanDir
}
return rancherAgentPlanDirDefault
}