Modify simulate proxy to expose prometheus metrics (#576)

The simulate-proxy script pushes metrics to the telemetry service. This PR modifies the script to expose metrics to a prometheus endpoint. This functionality creates a server that randomly generates response_total, request_totals, response_duration_ms and response_latency_ms. The server reads pod information from a k8s cluster and picks a random namespace to use for all exposed metrics. Tested out these changes with a locally running prometheus server. I also ran the docker-compose.yml to make sure metrics were being recorded by the prometheus docker container. fixes #498 Signed-off-by: Dennis Adjei-Baah <dennis@buoyant.io>
2018-03-21 16:40:12 -07:00 · 2018-03-21 16:40:12 -07:00 · b90668a0b5
parent 1c9ce4d118
commit b90668a0b5
4 changed files with 279 additions and 179 deletions
--- a/.prometheus.dev.yml
+++ b/.prometheus.dev.yml
@ -31,3 +31,10 @@ scrape_configs:
  - job_name: 'destination'
    static_configs:
      - targets: ['destination:9999']
+
+  - job_name: 'conduit'
+    static_configs:
+      - targets:
+        - 'simulate-proxy:9000'
+        - 'simulate-proxy:9001'
+        - 'simulate-proxy:9002'
--- a/BUILD.md
+++ b/BUILD.md
@ -167,11 +167,11 @@ traffic to the docker-compose environment:
 ```bash
 # confirm you are connected to Kubernetes
 kubectl version
-
-# simulate traffic
-bin/go-run controller/script/simulate-proxy --kubeconfig ~/.kube/config --addr $DOCKER_IP:8086 --max-pods 10 --sleep 10ms
 ```

+Note that the Kubernetes cluster your system is configured to talk to must not be referenced via 
+`localhost` in your Kubernetes config file, as `simulate-proxy` will not be able to connect to it.
+ This includes Kubernetes on Docker For Mac.
 ### Testing

 ```bash
--- a/controller/script/simulate-proxy/main.go
+++ b/controller/script/simulate-proxy/main.go
@ -1,20 +1,19 @@
 package main

 import (
-	"context"
 	"flag"
+	"fmt"
 	"math"
 	"math/rand"
 	"net/http"
-	"strconv"
+	"os"
+	"os/signal"
 	"strings"
 	"time"

-	"github.com/runconduit/conduit/controller/api/proxy"
-	common "github.com/runconduit/conduit/controller/gen/common"
-	pb "github.com/runconduit/conduit/controller/gen/proxy/telemetry"
+	prom "github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"github.com/runconduit/conduit/controller/k8s"
-	"github.com/runconduit/conduit/controller/util"
 	log "github.com/sirupsen/logrus"
 	"google.golang.org/grpc/codes"
 	"k8s.io/api/core/v1"
@ -22,9 +21,27 @@ import (
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 )

-/* A simple script for posting simulated telemetry data to the proxy api */
+/* A simple script for exposing simulated prometheus metrics */
+
+type simulatedProxy struct {
+	podOwner       string
+	deploymentName string
+	sleep          time.Duration
+	namespace      string
+	deployments    []string
+	registerer     *prom.Registry
+	*proxyMetricCollectors
+}
+
+type proxyMetricCollectors struct {
+	requestTotals     *prom.CounterVec
+	responseTotals    *prom.CounterVec
+	requestDurationMs *prom.HistogramVec
+	responseLatencyMs *prom.HistogramVec
+}

 var (
+	labels            = generatePromLabels()
 	grpcResponseCodes = []codes.Code{
 		codes.OK,
 		codes.PermissionDenied,
@ -93,23 +110,20 @@ var (
 		http.StatusNetworkAuthenticationRequired,
 	}

-	ports = []uint32{3333, 6262}
-
 	// latencyBucketBounds holds the maximum value (inclusive, in tenths of a
 	// millisecond) that may be counted in a given histogram bucket.
-
 	// These values are one order of magnitude greater than the controller's
 	// Prometheus buckets, because the proxy will reports latencies in tenths
 	// of a millisecond rather than whole milliseconds.
-	latencyBucketBounds = [26]uint32{
+	latencyBucketBounds = []float64{
 		// prometheus.LinearBuckets(1, 1, 5),
 		10, 20, 30, 40, 50,
 		// prometheus.LinearBuckets(10, 10, 5),
-		100, 200, 300, 400, 50,
+		100, 200, 300, 400, 500,
 		// prometheus.LinearBuckets(100, 100, 5),
 		1000, 2000, 3000, 4000, 5000,
 		// prometheus.LinearBuckets(1000, 1000, 5),
-		10000, 20000, 30000, 40000, 5000,
+		10000, 20000, 30000, 40000, 50000,
 		// prometheus.LinearBuckets(10000, 10000, 5),
 		100000, 200000, 300000, 400000, 500000,
 		// Prometheus implicitly creates a max bucket for everything that
@ -119,8 +133,122 @@ var (
 	}
 )

-func randomPort() uint32 {
-	return ports[rand.Intn(len(ports))]
+// generateProxyTraffic randomly creates metrics under the guise of a single conduit proxy routing traffic.
+// metrics are generated for each proxyMetricCollector.
+func (s *simulatedProxy) generateProxyTraffic() {
+
+	for {
+		inboundRandomCount := int(rand.Float64() * 10)
+		outboundRandomCount := int(rand.Float64() * 10)
+		deployment := getRandomDeployment(s.deployments, map[string]struct{}{s.podOwner: {}})
+
+		//split the deployment name into ["namespace", "deployment"]
+		destinationDeploymentName := strings.Split(deployment, "/")[1]
+
+		s.requestTotals.
+			With(overrideDefaultLabels(s.newConduitLabel(destinationDeploymentName, false))).
+			Add(float64(inboundRandomCount))
+
+		s.responseTotals.
+			With(overrideDefaultLabels(s.newConduitLabel(destinationDeploymentName, true))).
+			Add(float64(outboundRandomCount))
+
+		observeHistogramVec(
+			randomLatencies(randomCount()),
+			s.responseLatencyMs,
+			overrideDefaultLabels(s.newConduitLabel(destinationDeploymentName, true)))
+
+		observeHistogramVec(
+			randomLatencies(randomCount()),
+			s.requestDurationMs,
+			overrideDefaultLabels(s.newConduitLabel(destinationDeploymentName, false)))
+
+		time.Sleep(s.sleep)
+	}
+}
+
+// newConduitLabel creates a label map to be used for metric generation.
+func (s *simulatedProxy) newConduitLabel(destinationPod string, isResponseLabel bool) prom.Labels {
+	labelMap := prom.Labels{
+		"direction":  randomRequestDirection(),
+		"deployment": s.deploymentName,
+		"authority":  "world.greeting:7778",
+		"namespace":  s.namespace,
+	}
+	if labelMap["direction"] == "outbound" {
+		labelMap["dst_deployment"] = destinationPod
+	}
+	if isResponseLabel {
+		if rand.Intn(2) == 0 {
+			labelMap["grpc_status_code"] = fmt.Sprintf("%d", randomGrpcResponseCode())
+		} else {
+			labelMap["status_code"] = fmt.Sprintf("%d", randomHttpResponseCode())
+		}
+	}
+
+	return labelMap
+}
+
+// observeHistogramVec uses a latencyBuckets slice which holds an array of numbers that indicate
+// how many observations will be added to a each bucket. latencyBuckets and latencyBucketBounds
+// both are of the same array length. ObserveHistogramVec selects a latencyBucketBound based on a position
+// in the latencyBucket and then makes an observation within the selected bucket.
+func observeHistogramVec(latencyBuckets []uint32, latencies *prom.HistogramVec, latencyLabels prom.Labels) {
+	for bucketNum, count := range latencyBuckets {
+		latencyMs := float64(latencyBucketBounds[bucketNum]) / 10
+		for i := uint32(0); i < count; i++ {
+			latencies.With(latencyLabels).Observe(latencyMs)
+		}
+	}
+}
+
+func randomRequestDirection() string {
+	if rand.Intn(2) == 0 {
+		return "inbound"
+	}
+	return "outbound"
+}
+
+func generatePromLabels() []string {
+	kubeResourceTypes := []string{
+		"job",
+		"replica_set",
+		"deployment",
+		"daemon_set",
+		"replication_controller",
+		"namespace",
+	}
+	constantLabels := []string{
+		"direction",
+		"authority",
+		"status_code",
+		"grpc_status_code",
+	}
+
+	destinationLabels := make([]string, len(kubeResourceTypes))
+
+	for i, label := range kubeResourceTypes {
+		destinationLabels[i] = fmt.Sprintf("dst_%s", label)
+	}
+	return append(append(constantLabels, kubeResourceTypes...), destinationLabels...)
+}
+
+// overrideDefaultLabels combines two maps of the same size with the keys
+// map1 values take precedence during the union
+func overrideDefaultLabels(map1 map[string]string) map[string]string {
+	map2 := generateLabelMap(labels)
+	for k := range map2 {
+		map2[k] = map1[k]
+	}
+	return map2
+}
+
+func generateLabelMap(labels []string) map[string]string {
+	labelMap := make(map[string]string, len(labels))
+	for _, label := range labels {
+		labelMap[label] = ""
+	}
+	return labelMap
 }

 func randomCount() uint32 {
@ -138,30 +266,6 @@ func randomLatencies(count uint32) []uint32 {
 	return latencies
 }

-func randomGrpcEos(count uint32) (eos []*pb.EosScope) {
-	grpcResponseCodes := make(map[uint32]uint32)
-	for i := uint32(0); i < count; i++ {
-		grpcResponseCodes[randomGrpcResponseCode()] += 1
-	}
-	for code, streamCount := range grpcResponseCodes {
-		eos = append(eos, &pb.EosScope{
-			Ctx:     &pb.EosCtx{End: &pb.EosCtx_GrpcStatusCode{GrpcStatusCode: code}},
-			Streams: streamCount,
-		})
-	}
-	return
-}
-
-func randomH2Eos(count uint32) (eos []*pb.EosScope) {
-	for i := uint32(0); i < count; i++ {
-		eos = append(eos, &pb.EosScope{
-			Ctx:     &pb.EosCtx{End: &pb.EosCtx_Other{Other: true}},
-			Streams: uint32(rand.Int31()),
-		})
-	}
-	return
-}
-
 func randomGrpcResponseCode() uint32 {
 	return uint32(grpcResponseCodes[rand.Intn(len(grpcResponseCodes))])
 }
@ -170,44 +274,94 @@ func randomHttpResponseCode() uint32 {
 	return uint32(httpResponseCodes[rand.Intn(len(httpResponseCodes))])
 }

-func stringToIp(str string) *common.IPAddress {
-	octets := make([]uint8, 0)
-	for _, num := range strings.Split(str, ".") {
-		oct, _ := strconv.Atoi(num)
-		octets = append(octets, uint8(oct))
-	}
-	return util.IPV4(octets[0], octets[1], octets[2], octets[3])
-}
-
 func podIndexFunc(obj interface{}) ([]string, error) {
 	return nil, nil
 }

-func randomPod(pods []*v1.Pod, prvPodIp *common.IPAddress) *common.IPAddress {
-	var podIp *common.IPAddress
-	for {
-		if podIp != nil {
-			break
-		}
+func getRandomDeployment(deployments []string, excludeDeployments map[string]struct{}) string {
+	filteredDeployments := make([]string, 0)

-		randomPod := pods[rand.Intn(len(pods))]
-		if strings.HasPrefix(randomPod.GetNamespace(), "kube-") {
-			continue // skip pods in the kube-* namespaces
-		}
-		podIp = stringToIp(randomPod.Status.PodIP)
-		if prvPodIp != nil && podIp.GetIpv4() == prvPodIp.GetIpv4() {
-			podIp = nil
+	for _, deployment := range deployments {
+		if _, ok := excludeDeployments[deployment]; !ok {
+			filteredDeployments = append(filteredDeployments, deployment)
 		}
 	}
-	return podIp
+	return filteredDeployments[rand.Intn(len(filteredDeployments))]
+
+}
+
+func newSimulatedProxy(podOwner string, deployments []string, sleep *time.Duration) *simulatedProxy {
+	podOwnerComponents := strings.Split(podOwner, "/")
+	name := podOwnerComponents[1]
+	namespace := podOwnerComponents[0]
+
+	proxy := simulatedProxy{
+		podOwner:       podOwner,
+		sleep:          *sleep,
+		deployments:    deployments,
+		namespace:      namespace,
+		deploymentName: name,
+		registerer:     prom.NewRegistry(),
+		proxyMetricCollectors: &proxyMetricCollectors{
+			requestTotals: prom.NewCounterVec(
+				prom.CounterOpts{
+					Name: "request_total",
+					Help: "A counter of the number of requests the proxy has received",
+				}, labels),
+			responseTotals: prom.NewCounterVec(
+				prom.CounterOpts{
+					Name: "response_total",
+					Help: "A counter of the number of responses the proxy has received.",
+				}, labels),
+			requestDurationMs: prom.NewHistogramVec(
+				prom.HistogramOpts{
+					Name:    "request_duration_ms",
+					Help:    "A histogram of the duration of a response",
+					Buckets: latencyBucketBounds,
+				}, labels),
+			responseLatencyMs: prom.NewHistogramVec(
+				prom.HistogramOpts{
+					Name:    "response_latency_ms",
+					Help:    "A histogram of the total latency of a response.",
+					Buckets: latencyBucketBounds,
+				}, labels),
+		},
+	}
+
+	proxy.registerer.MustRegister(
+		proxy.requestTotals,
+		proxy.responseTotals,
+		proxy.requestDurationMs,
+		proxy.responseLatencyMs,
+	)
+	return &proxy
+}
+
+func getDeployments(podList []*v1.Pod, deploys *k8s.ReplicaSetStore, maxPods *int) []string {
+	allPods := make([]*v1.Pod, 0)
+	deploymentSet := make(map[string]struct{})
+	for _, pod := range podList {
+		if pod.Status.PodIP != "" && !strings.HasPrefix(pod.GetNamespace(), "kube-") && (*maxPods == 0 || len(allPods) < *maxPods) {
+			allPods = append(allPods, pod)
+			deploymentName, err := deploys.GetDeploymentForPod(pod)
+			if err != nil {
+				log.Fatal(err.Error())
+			}
+			deploymentSet[deploymentName] = struct{}{}
+		}
+	}
+
+	deployments := make([]string, 0)
+	for deployment := range deploymentSet {
+		deployments = append(deployments, deployment)
+	}
+	return deployments
 }

 func main() {
 	rand.Seed(time.Now().UnixNano())
-
-	addr := flag.String("addr", ":8086", "address of proxy api")
-	requestCount := flag.Int("requests", 0, "number of api requests to make (default: infinite)")
 	sleep := flag.Duration("sleep", time.Second, "time to sleep between requests")
+	metricsAddrs := flag.String("metric-addrs", ":9000,:9001,:9002", "range of network addresses to serve prometheus metrics")
 	maxPods := flag.Int("max-pods", 0, "total number of pods to simulate (default unlimited)")
 	kubeConfigPath := flag.String("kubeconfig", "", "path to kube config - required")
 	flag.Parse()
@ -217,12 +371,6 @@ func main() {
 		return
 	}

-	client, conn, err := proxy.NewTelemetryClient(*addr)
-	if err != nil {
-		log.Fatal(err.Error())
-	}
-	defer conn.Close()
-
 	clientSet, err := k8s.NewClientSet(*kubeConfigPath)
 	if err != nil {
 		log.Fatal(err.Error())
@ -233,120 +381,49 @@ func main() {
 		log.Fatal(err.Error())
 	}

+	deploys, err := k8s.NewReplicaSetStore(clientSet)
+	if err != nil {
+		log.Fatal(err.Error())
+	}
+
 	err = pods.Run()
 	if err != nil {
 		log.Fatal(err.Error())
 	}

+	err = deploys.Run()
+	if err != nil {
+		log.Fatal(err.Error())
+	}
+
 	podList, err := pods.List()
 	if err != nil {
 		log.Fatal(err.Error())
 	}

-	allPods := make([]*v1.Pod, 0)
-	for _, pod := range podList {
-		if pod.Status.PodIP != "" && (*maxPods == 0 || len(allPods) < *maxPods) {
-			allPods = append(allPods, pod)
-		}
-	}
-
-	for i := 0; (*requestCount == 0) || (i < *requestCount); i++ {
-		count := randomCount()
-		sourceIp := randomPod(allPods, nil)
-		targetIp := randomPod(allPods, sourceIp)
-
-		req := &pb.ReportRequest{
-			Process: &pb.Process{
-				ScheduledInstance:  "hello-1mfa0",
-				ScheduledNamespace: "people",
-			},
-			ClientTransports: []*pb.ClientTransport{
-				// TCP
-				&pb.ClientTransport{
-					TargetAddr: &common.TcpAddress{
-						Ip:   targetIp,
-						Port: randomPort(),
-					},
-					Connects: count,
-					Disconnects: []*pb.TransportSummary{
-						&pb.TransportSummary{
-							DurationMs: uint64(randomCount()),
-							BytesSent:  uint64(randomCount()),
-						},
-					},
-					Protocol: common.Protocol_TCP,
-				},
-			},
-			ServerTransports: []*pb.ServerTransport{
-				// TCP
-				&pb.ServerTransport{
-					SourceIp: sourceIp,
-					Connects: count,
-					Disconnects: []*pb.TransportSummary{
-						&pb.TransportSummary{
-							DurationMs: uint64(randomCount()),
-							BytesSent:  uint64(randomCount()),
-						},
-					},
-					Protocol: common.Protocol_TCP,
-				},
-			},
-			Proxy: pb.ReportRequest_INBOUND,
-			Requests: []*pb.RequestScope{
-
-				// gRPC
-				&pb.RequestScope{
-					Ctx: &pb.RequestCtx{
-						SourceIp: sourceIp,
-						TargetAddr: &common.TcpAddress{
-							Ip:   targetIp,
-							Port: randomPort(),
-						},
-						Authority: "world.greeting:7778",
-					},
-					Count: count,
-					Responses: []*pb.ResponseScope{
-						&pb.ResponseScope{
-							Ctx: &pb.ResponseCtx{
-								HttpStatusCode: http.StatusOK,
-							},
-							ResponseLatencyCounts: randomLatencies(count),
-							Ends: randomGrpcEos(count),
-						},
-					},
-				},
-
-				// HTTP/2
-				&pb.RequestScope{
-					Ctx: &pb.RequestCtx{
-						SourceIp: sourceIp,
-						TargetAddr: &common.TcpAddress{
-							Ip:   targetIp,
-							Port: randomPort(),
-						},
-						Authority: "world.greeting:7778",
-					},
-					Count: count,
-					Responses: []*pb.ResponseScope{
-						&pb.ResponseScope{
-							Ctx: &pb.ResponseCtx{
-								HttpStatusCode: randomHttpResponseCode(),
-							},
-							ResponseLatencyCounts: randomLatencies(count),
-							Ends: randomH2Eos(count),
-						},
-					},
-				},
-			},
-
-			HistogramBucketBoundsTenthMs: latencyBucketBounds[:],
-		}
-
-		_, err = client.Report(context.Background(), req)
-		if err != nil {
-			log.Fatal(err.Error())
-		}
-
-		time.Sleep(*sleep)
+	deployments := getDeployments(podList, deploys, maxPods)
+
+	stopCh := make(chan os.Signal)
+	signal.Notify(stopCh, os.Interrupt, os.Kill)
+
+	excludedDeployments := map[string]struct{}{}
+
+	for _, addr := range strings.Split(*metricsAddrs, ",") {
+		randomPodOwner := getRandomDeployment(deployments, excludedDeployments)
+		excludedDeployments[randomPodOwner] = struct{}{}
+
+		go func(address string, podOwner string, deployments []string) {
+
+			proxy := newSimulatedProxy(podOwner, deployments, sleep)
+			server := &http.Server{
+				Addr:    address,
+				Handler: promhttp.HandlerFor(proxy.registerer, promhttp.HandlerOpts{}),
+			}
+			log.Infof("serving scrapable metrics on %s", address)
+			go server.ListenAndServe()
+			go proxy.generateProxyTraffic()
+
+		}(addr, randomPodOwner, deployments)
 	}
+	<-stopCh
 }
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -110,3 +110,19 @@ services:
    command:
    - --config.file=/etc/prometheus/prometheus.yml
    - --storage.tsdb.retention=6h
+
+  simulate-proxy:
+    image: golang:1.10.0-alpine3.7
+    ports:
+    - 9000-9002:9000-9002
+    volumes:
+    - .:/go/src/github.com/runconduit/conduit
+    - ~/.kube/config:/kubeconfig:ro
+    command:
+    - go
+    - run
+    - /go/src/github.com/runconduit/conduit/controller/script/simulate-proxy/main.go
+    - --metric-addrs=0.0.0.0:9000,0.0.0.0:9001,0.0.0.0:9002
+    - --kubeconfig=/kubeconfig
+    - --max-pods=10
+    - --sleep=3s