diff --git a/cmd/k8s-netperf/k8s-netperf.go b/cmd/k8s-netperf/k8s-netperf.go index 44fcbeb..84789a3 100644 --- a/cmd/k8s-netperf/k8s-netperf.go +++ b/cmd/k8s-netperf/k8s-netperf.go @@ -257,8 +257,14 @@ var rootCmd = &cobra.Command{ if len(npr.ClientNodeInfo.NodeName) > 0 && len(npr.ServerNodeInfo.NodeName) > 0 { sr.Results[i].ClientMetrics, _ = metrics.QueryNodeCPU(npr.ClientNodeInfo, pcon, npr.StartTime, npr.EndTime) sr.Results[i].ServerMetrics, _ = metrics.QueryNodeCPU(npr.ServerNodeInfo, pcon, npr.StartTime, npr.EndTime) + metrics.VSwitchCPU(npr.ClientNodeInfo, pcon, npr.StartTime, npr.EndTime, &sr.Results[i].ClientMetrics) + metrics.VSwitchMem(npr.ClientNodeInfo, pcon, npr.StartTime, npr.EndTime, &sr.Results[i].ClientMetrics) + metrics.VSwitchCPU(npr.ServerNodeInfo, pcon, npr.StartTime, npr.EndTime, &sr.Results[i].ServerMetrics) + metrics.VSwitchMem(npr.ServerNodeInfo, pcon, npr.StartTime, npr.EndTime, &sr.Results[i].ServerMetrics) sr.Results[i].ClientPodCPU, _ = metrics.TopPodCPU(npr.ClientNodeInfo, pcon, npr.StartTime, npr.EndTime) sr.Results[i].ServerPodCPU, _ = metrics.TopPodCPU(npr.ServerNodeInfo, pcon, npr.StartTime, npr.EndTime) + sr.Results[i].ClientPodMem, _ = metrics.TopPodMem(npr.ClientNodeInfo, pcon, npr.StartTime, npr.EndTime) + sr.Results[i].ServerPodMem, _ = metrics.TopPodMem(npr.ServerNodeInfo, pcon, npr.StartTime, npr.EndTime) } } } @@ -319,6 +325,7 @@ var rootCmd = &cobra.Command{ if showMetrics { result.ShowNodeCPU(sr) result.ShowPodCPU(sr) + result.ShowPodMem(sr) } } else { err = archive.WriteJSONResult(sr) diff --git a/pkg/archive/archive.go b/pkg/archive/archive.go index 4fc19d2..ac88b68 100644 --- a/pkg/archive/archive.go +++ b/pkg/archive/archive.go @@ -19,36 +19,42 @@ const ltcyMetric = "usec" // Doc struct of the JSON document to be indexed type Doc struct { - UUID string `json:"uuid"` - Timestamp time.Time `json:"timestamp"` - HostNetwork bool `json:"hostNetwork"` - Driver string `json:"driver"` - Parallelism int `json:"parallelism"` 
- Profile string `json:"profile"` - Duration int `json:"duration"` - Service bool `json:"service"` - Local bool `json:"local"` - Virt bool `json:"virt"` - AcrossAZ bool `json:"acrossAZ"` - Samples int `json:"samples"` - Messagesize int `json:"messageSize"` - Burst int `json:"burst"` - Throughput float64 `json:"throughput"` - Latency float64 `json:"latency"` - TputMetric string `json:"tputMetric"` - LtcyMetric string `json:"ltcyMetric"` - TCPRetransmit float64 `json:"tcpRetransmits"` - UDPLossPercent float64 `json:"udpLossPercent"` - ToolVersion string `json:"toolVersion"` - ToolGitCommit string `json:"toolGitCommit"` - Metadata result.Metadata `json:"metadata"` - ServerNodeCPU metrics.NodeCPU `json:"serverCPU"` - ServerPodCPU []metrics.PodCPU `json:"serverPods"` - ClientNodeCPU metrics.NodeCPU `json:"clientCPU"` - ClientPodCPU []metrics.PodCPU `json:"clientPods"` - Confidence []float64 `json:"confidence"` - ServerNodeInfo metrics.NodeInfo `json:"serverNodeInfo"` - ClientNodeInfo metrics.NodeInfo `json:"clientNodeInfo"` + UUID string `json:"uuid"` + Timestamp time.Time `json:"timestamp"` + HostNetwork bool `json:"hostNetwork"` + Driver string `json:"driver"` + Parallelism int `json:"parallelism"` + Profile string `json:"profile"` + Duration int `json:"duration"` + Service bool `json:"service"` + Local bool `json:"local"` + Virt bool `json:"virt"` + AcrossAZ bool `json:"acrossAZ"` + Samples int `json:"samples"` + Messagesize int `json:"messageSize"` + Burst int `json:"burst"` + Throughput float64 `json:"throughput"` + Latency float64 `json:"latency"` + TputMetric string `json:"tputMetric"` + LtcyMetric string `json:"ltcyMetric"` + TCPRetransmit float64 `json:"tcpRetransmits"` + UDPLossPercent float64 `json:"udpLossPercent"` + ToolVersion string `json:"toolVersion"` + ToolGitCommit string `json:"toolGitCommit"` + Metadata result.Metadata `json:"metadata"` + ServerNodeCPU metrics.NodeCPU `json:"serverCPU"` + ServerPodCPU []metrics.PodCPU `json:"serverPods"` + 
ServerPodMem []metrics.PodMem `json:"serverPodsMem"` + ClientNodeCPU metrics.NodeCPU `json:"clientCPU"` + ClientPodCPU []metrics.PodCPU `json:"clientPods"` + ClientPodMem []metrics.PodMem `json:"clientPodsMem"` + Confidence []float64 `json:"confidence"` + ServerNodeInfo metrics.NodeInfo `json:"serverNodeInfo"` + ClientNodeInfo metrics.NodeInfo `json:"clientNodeInfo"` + ServerVSwitchCPU float64 `json:"serverVswitchCpu"` + ServerVSwitchMem float64 `json:"serverVswitchMem"` + ClientVSwitchCPU float64 `json:"clientVswitchCpu"` + ClientVSwitchMem float64 `json:"clientVswitchMem"` } // Connect returns a client connected to the desired cluster. @@ -89,31 +95,37 @@ func BuildDocs(sr result.ScenarioResults, uuid string) ([]interface{}, error) { } c := []float64{lo, hi} d := Doc{ - UUID: uuid, - Timestamp: time, - ToolVersion: sr.Version, - ToolGitCommit: sr.GitCommit, - Driver: r.Driver, - HostNetwork: r.HostNetwork, - Parallelism: r.Parallelism, - Profile: r.Profile, - Duration: r.Duration, - Virt: sr.Virt, - Samples: r.Samples, - Service: r.Service, - Messagesize: r.MessageSize, - Burst: r.Burst, - TputMetric: r.Metric, - LtcyMetric: ltcyMetric, - ServerNodeCPU: r.ServerMetrics, - ClientNodeCPU: r.ClientMetrics, - ServerPodCPU: r.ServerPodCPU.Results, - ClientPodCPU: r.ClientPodCPU.Results, - Metadata: sr.Metadata, - AcrossAZ: r.AcrossAZ, - Confidence: c, - ClientNodeInfo: r.ClientNodeInfo, - ServerNodeInfo: r.ServerNodeInfo, + UUID: uuid, + Timestamp: time, + ToolVersion: sr.Version, + ToolGitCommit: sr.GitCommit, + Driver: r.Driver, + HostNetwork: r.HostNetwork, + Parallelism: r.Parallelism, + Profile: r.Profile, + Duration: r.Duration, + Virt: sr.Virt, + Samples: r.Samples, + Service: r.Service, + Messagesize: r.MessageSize, + Burst: r.Burst, + TputMetric: r.Metric, + LtcyMetric: ltcyMetric, + ServerNodeCPU: r.ServerMetrics, + ClientNodeCPU: r.ClientMetrics, + ServerPodCPU: r.ServerPodCPU.Results, + ServerPodMem: r.ServerPodMem.MemResults, + ClientPodMem: 
r.ClientPodMem.MemResults, + ClientPodCPU: r.ClientPodCPU.Results, + ClientVSwitchCPU: r.ClientMetrics.VSwitchCPU, + ClientVSwitchMem: r.ClientMetrics.VSwitchMem, + ServerVSwitchCPU: r.ServerMetrics.VSwitchCPU, + ServerVSwitchMem: r.ServerMetrics.VSwitchMem, + Metadata: sr.Metadata, + AcrossAZ: r.AcrossAZ, + Confidence: c, + ClientNodeInfo: r.ClientNodeInfo, + ServerNodeInfo: r.ServerNodeInfo, } UDPLossPercent, e := result.Average(r.LossSummary) if e != nil { @@ -189,7 +201,7 @@ func commonCsvDataFields(row result.Data) []string { } // Writes all the mertics to the archive. -func writeArchive(cpuarchive, podarchive *csv.Writer, role string, row result.Data, podResults []metrics.PodCPU) error { +func writeArchive(vswitch, cpuarchive, podarchive, podmemarchive *csv.Writer, role string, row result.Data, podResults []metrics.PodCPU, podMem []metrics.PodMem) error { roleFieldData := []string{role} for _, pod := range podResults { if err := podarchive.Write(append(append(roleFieldData, @@ -200,11 +212,26 @@ func writeArchive(cpuarchive, podarchive *csv.Writer, role string, row result.Da return fmt.Errorf("failed to write archive to file") } } + for _, pod := range podMem { + if err := podmemarchive.Write(append(append(roleFieldData, + commonCsvDataFields(row)...), + pod.Name, + fmt.Sprintf("%f", pod.Value), + )); err != nil { + return fmt.Errorf("failed to write archive to file") + } + } cpu := row.ClientMetrics if role == "Server" { cpu = row.ServerMetrics } + if err := vswitch.Write(append(append(roleFieldData, + commonCsvDataFields(row)...), + fmt.Sprintf("%f", cpu.VSwitchCPU), + fmt.Sprintf("%f", cpu.VSwitchMem))); err != nil { + return fmt.Errorf("failed to write archive to file") + } if err := cpuarchive.Write(append(append(roleFieldData, commonCsvDataFields(row)...), fmt.Sprintf("%f", cpu.Idle), @@ -223,6 +250,17 @@ func writeArchive(cpuarchive, podarchive *csv.Writer, role string, row result.Da // WritePromCSVResult writes the prom data in CSV format func 
WritePromCSVResult(r result.ScenarioResults) error { d := time.Now().Unix() + + vswitchfp, err := os.Create(fmt.Sprintf("vswitch-result-%d.csv", d)) + if err != nil { + return fmt.Errorf("failed to open vswitch archive file") + } + defer vswitchfp.Close() + podmemfp, err := os.Create(fmt.Sprintf("podmem-result-%d.csv", d)) + if err != nil { + return fmt.Errorf("failed to open pod mem archive file") + } + defer podmemfp.Close() podfp, err := os.Create(fmt.Sprintf("podcpu-result-%d.csv", d)) if err != nil { return fmt.Errorf("failed to open pod cpu archive file") } @@ -233,10 +271,15 @@ func WritePromCSVResult(r result.ScenarioResults) error { return fmt.Errorf("failed to open cpu archive file") } defer cpufp.Close() + vswitch := csv.NewWriter(vswitchfp) + defer vswitch.Flush() cpuarchive := csv.NewWriter(cpufp) defer cpuarchive.Flush() podarchive := csv.NewWriter(podfp) defer podarchive.Flush() + podmemarchive := csv.NewWriter(podmemfp) + defer podmemarchive.Flush() + roleField := []string{"Role"} cpudata := append(append(roleField, commonCsvHeaderFields()...), @@ -253,20 +296,32 @@ func WritePromCSVResult(r result.ScenarioResults) error { "Pod Name", "Utilization", ) + vswitchdata := append(append(roleField, + commonCsvHeaderFields()...), + "CPU Utilization", + "Memory Utilization", + ) if err := cpuarchive.Write(cpudata); err != nil { return fmt.Errorf("failed to write cpu archive to file") } if err := podarchive.Write(poddata); err != nil { return fmt.Errorf("failed to write pod archive to file") } + if err := podmemarchive.Write(poddata); err != nil { + return fmt.Errorf("failed to write pod mem archive to file") + } + if err := vswitch.Write(vswitchdata); err != nil { + return fmt.Errorf("failed to write vswitch archive to file") + } for _, row := range r.Results { - if err := writeArchive(cpuarchive, podarchive, "Client", row, row.ClientPodCPU.Results); err != nil { + if err := writeArchive(vswitch, cpuarchive, podarchive, podmemarchive, "Client", row, 
row.ClientPodCPU.Results, row.ClientPodMem.MemResults); err != nil { return err } - if err := writeArchive(cpuarchive, podarchive, "Server", row, row.ServerPodCPU.Results); err != nil { + if err := writeArchive(vswitch, cpuarchive, podarchive, podmemarchive, "Server", row, row.ServerPodCPU.Results, row.ServerPodMem.MemResults); err != nil { return err } } + return nil } diff --git a/pkg/metrics/system.go b/pkg/metrics/system.go index 5c9cda6..f02559c 100644 --- a/pkg/metrics/system.go +++ b/pkg/metrics/system.go @@ -24,14 +24,16 @@ type NodeInfo struct { // NodeCPU stores CPU information for a specific Node type NodeCPU struct { - Idle float64 `json:"idleCPU"` - User float64 `json:"userCPU"` - Steal float64 `json:"stealCPU"` - System float64 `json:"systemCPU"` - Nice float64 `json:"niceCPU"` - Irq float64 `json:"irqCPU"` - Softirq float64 `json:"softCPU"` - Iowait float64 `json:"ioCPU"` + Idle float64 `json:"idleCPU"` + User float64 `json:"userCPU"` + Steal float64 `json:"stealCPU"` + System float64 `json:"systemCPU"` + Nice float64 `json:"niceCPU"` + Irq float64 `json:"irqCPU"` + Softirq float64 `json:"softCPU"` + Iowait float64 `json:"ioCPU"` + VSwitchCPU float64 `json:"vSwitchCPU"` + VSwitchMem float64 `json:"vSwitchMem"` } // PodCPU stores pod CPU @@ -40,9 +42,15 @@ type PodCPU struct { Value float64 `json:"cpuUsage"` } +type PodMem struct { + Name string `json:"podName"` + Value float64 `json:"memUsage"` +} + // PodValues is a collection of PodCPU type PodValues struct { - Results []PodCPU + Results []PodCPU + MemResults []PodMem } // PromConnect stores the prom information @@ -166,10 +174,10 @@ func QueryNodeCPU(node NodeInfo, conn PromConnect, start time.Time, end time.Tim return cpu, true } -// TopPodCPU will return the top 5 CPU consumers for a specific node +// TopPodCPU will return the top 10 CPU consumers for a specific node func TopPodCPU(node NodeInfo, conn PromConnect, start time.Time, end time.Time) (PodValues, bool) { var pods PodValues - query := 
fmt.Sprintf("topk(5,sum(irate(container_cpu_usage_seconds_total{name!=\"\",instance=~\"%s:.*\"}[2m]) * 100) by (pod, namespace, instance))", node.IP) + query := fmt.Sprintf("topk(10,sum(irate(container_cpu_usage_seconds_total{name!=\"\",instance=~\"%s:.*\"}[2m]) * 100) by (pod, namespace, instance))", node.IP) logging.Debugf("Prom Query : %s", query) val, err := conn.Client.QueryRange(query, start, end, time.Minute) if err != nil { @@ -187,6 +195,59 @@ func TopPodCPU(node NodeInfo, conn PromConnect, start time.Time, end time.Time) return pods, true } +// VSwitchCPU will return the vswitchd cpu usage for specific node +func VSwitchCPU(node NodeInfo, conn PromConnect, start time.Time, end time.Time, ndata *NodeCPU) bool { + query := fmt.Sprintf("irate(container_cpu_usage_seconds_total{id=~\"/system.slice/ovs-vswitchd.service\", node=~\"%s\"}[2m])*100", node.NodeName) + logging.Debugf("Prom Query : %s", query) + val, err := conn.Client.QueryRange(query, start, end, time.Minute) + if err != nil { + logging.Error("Issue querying Prometheus") + return false + } + v := val.(model.Matrix) + for _, s := range v { + ndata.VSwitchCPU = avg(s.Values) + } + return true +} + +// VSwitchMem will return the vswitchd memory usage for specific node +func VSwitchMem(node NodeInfo, conn PromConnect, start time.Time, end time.Time, ndata *NodeCPU) bool { + query := fmt.Sprintf("container_memory_rss{id=~\"/system.slice/ovs-vswitchd.service\", node=~\"%s\"}", node.NodeName) + logging.Debugf("Prom Query : %s", query) + val, err := conn.Client.QueryRange(query, start, end, time.Minute) + if err != nil { + logging.Error("Issue querying Prometheus") + return false + } + v := val.(model.Matrix) + for _, s := range v { + ndata.VSwitchMem = avg(s.Values) + } + return true +} + +// TopPodMem will return the top 10 Mem consumers for a specific node +func TopPodMem(node NodeInfo, conn PromConnect, start time.Time, end time.Time) (PodValues, bool) { + var pods PodValues + query := 
fmt.Sprintf("topk(10,sum(container_memory_rss{container!=\"POD\",name!=\"\",node=~\"%s\"}) by (pod, namespace, node))", node.NodeName) + logging.Debugf("Prom Query : %s", query) + val, err := conn.Client.QueryRange(query, start, end, time.Minute) + if err != nil { + logging.Error("Issue querying Prometheus") + return pods, false + } + v := val.(model.Matrix) + for _, s := range v { + p := PodMem{ + Name: string(s.Metric["pod"]), + Value: avg(s.Values), + } + pods.MemResults = append(pods.MemResults, p) + } + return pods, true +} + // Calculates average for the given data func avg(data []model.SamplePair) float64 { sum := 0.0 diff --git a/pkg/results/result.go b/pkg/results/result.go index 6b701ef..b2a7646 100644 --- a/pkg/results/result.go +++ b/pkg/results/result.go @@ -43,7 +43,9 @@ type Data struct { ClientMetrics metrics.NodeCPU ServerMetrics metrics.NodeCPU ClientPodCPU metrics.PodValues + ClientPodMem metrics.PodValues ServerPodCPU metrics.PodValues + ServerPodMem metrics.PodValues } // ScenarioResults each scenario could have multiple results @@ -201,6 +203,20 @@ func ShowPodCPU(s ScenarioResults) { table.Render() } +// ShowPodMem accepts ScenarioResults and presents to the user via stdout the Podmem info +func ShowPodMem(s ScenarioResults) { + table := initTable([]string{"Result Type", "Driver", "Role", "Scenario", "Parallelism", "Host Network", "Service", "Message Size", "Burst", "Same node", "Pod", "Utilization"}) + for _, r := range s.Results { + for _, pod := range r.ClientPodMem.MemResults { + table.Append([]string{"Pod Mem RSS Utilization", r.Driver, "Client", r.Profile, fmt.Sprintf("%d", r.Parallelism), fmt.Sprintf("%t", r.HostNetwork), fmt.Sprintf("%t", r.Service), fmt.Sprintf("%d", r.MessageSize), fmt.Sprintf("%d", r.Burst), fmt.Sprintf("%t", r.SameNode), fmt.Sprintf("%.20s", pod.Name), fmt.Sprintf("%f", pod.Value)}) + } + for _, pod := range r.ServerPodMem.MemResults { + table.Append([]string{"Pod Mem RSS Utilization", r.Driver, "Server", 
r.Profile, fmt.Sprintf("%d", r.Parallelism), fmt.Sprintf("%t", r.HostNetwork), fmt.Sprintf("%t", r.Service), fmt.Sprintf("%d", r.MessageSize), fmt.Sprintf("%d", r.Burst), fmt.Sprintf("%t", r.SameNode), fmt.Sprintf("%.20s", pod.Name), fmt.Sprintf("%f", pod.Value)}) + } + } + table.Render() +} + // ShowNodeCPU accepts ScenarioResults and presents to the user via stdout the NodeCPU info func ShowNodeCPU(s ScenarioResults) { table := initTable([]string{"Result Type", "Driver", "Role", "Scenario", "Parallelism", "Host Network", "Service", "Message Size", "Burst", "Same node", "Idle CPU", "User CPU", "System CPU", "Steal CPU", "IOWait CPU", "Nice CPU", "SoftIRQ CPU", "IRQ CPU"})