diff --git a/docs/top/prometheus.md b/docs/top/prometheus.md index cdb1d6814..e2bd204c0 100644 --- a/docs/top/prometheus.md +++ b/docs/top/prometheus.md @@ -32,8 +32,6 @@ $ kubectl apply -f kubernetes-artifacts/prometheus/gpu-exporter.yaml !!! note - * the prometheus and gpu-exporter components should be deployed in namespace ``kube-system``, and so that ``arena top job `` can work. - * if the your prometheus has been existed in cluster,please make sure the k8s service whose port is 9090 has the label `kubernetes.io/service-name=prometheus-server` 3\. You can check the GPU metrics by prometheus SQL request diff --git a/kubernetes-artifacts/prometheus/gpu-exporter.yaml b/kubernetes-artifacts/prometheus/gpu-exporter.yaml index 647d37278..e1614dbe4 100644 --- a/kubernetes-artifacts/prometheus/gpu-exporter.yaml +++ b/kubernetes-artifacts/prometheus/gpu-exporter.yaml @@ -21,17 +21,13 @@ spec: operator: Exists hostPID: true volumes: - - hostPath: - path: /var/run/docker.sock - type: FileOrCreate - name: docker-sock - hostPath: path: /run/containerd/containerd.sock - type: FileOrCreate + type: Socket name: containerd-sock containers: - name: node-gpu-exporter - image: registry.cn-hangzhou.aliyuncs.com/acs/gpu-prometheus-exporter:0.1-0e21b28 + image: registry.cn-hangzhou.aliyuncs.com/acs/gpu-prometheus-exporter:v1.0.1-b2c2f9b imagePullPolicy: Always ports: - containerPort: 9445 @@ -40,11 +36,9 @@ spec: memory: 30Mi cpu: 100m limits: - memory: 50Mi - cpu: 200m + memory: 2000Mi + cpu: 1000m volumeMounts: - - mountPath: /var/run/docker.sock - name: docker-sock - mountPath: /run/containerd/containerd.sock name: containerd-sock diff --git a/kubernetes-artifacts/prometheus/prometheus.yaml b/kubernetes-artifacts/prometheus/prometheus.yaml index c77923cc7..265d2afb6 100644 --- a/kubernetes-artifacts/prometheus/prometheus.yaml +++ b/kubernetes-artifacts/prometheus/prometheus.yaml @@ -7,13 +7,13 @@ data: storage-retention: 360h --- -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: prometheus namespace: arena-system rules: - - apiGroups: ["", "extensions", "apps"] + - apiGroups: [""] resources: - nodes - nodes/proxy @@ -32,7 +32,7 @@ metadata: name: prometheus namespace: arena-system --- -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: prometheus