This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
сервис_victoriametrics [2025/03/28 08:24] val [Метрики] |
сервис_victoriametrics [2025/03/29 09:32] (current) val [Уведомления] |
||
---|---|---|---|
Line 9: | Line 9: | ||
* [[https://docs.victoriametrics.com/guides/k8s-monitoring-via-vm-single/|Kubernetes monitoring via VictoriaMetrics Single]] | * [[https://docs.victoriametrics.com/guides/k8s-monitoring-via-vm-single/|Kubernetes monitoring via VictoriaMetrics Single]] | ||
+ | * [[https://docs.victoriametrics.com/scrape_config_examples/]] | ||
+ | * [[Система Kubernetes#kube-state-metrics]] | ||
<code> | <code> | ||
(venv1) server# ansible all -f 4 -m apt -a 'pkg=prometheus-node-exporter state=present update_cache=true' -i /root/kubespray/inventory/mycluster/hosts.yaml | (venv1) server# ansible all -f 4 -m apt -a 'pkg=prometheus-node-exporter state=present update_cache=true' -i /root/kubespray/inventory/mycluster/hosts.yaml | ||
Line 19: | Line 21: | ||
</code><code> | </code><code> | ||
... | ... | ||
+ | - job_name: kube-state-metrics | ||
+ | kubernetes_sd_configs: | ||
+ | - role: pod | ||
+ | relabel_configs: | ||
+ | - source_labels: [__meta_kubernetes_pod_container_name] | ||
+ | regex: kube-state-metrics | ||
+ | action: keep | ||
+ | - source_labels: [__meta_kubernetes_pod_container_port_number] | ||
+ | regex: "8080" | ||
+ | action: keep | ||
+ | |||
- job_name: node-exporter | - job_name: node-exporter | ||
static_configs: | static_configs: | ||
Line 43: | Line 56: | ||
kube1:~/vm# helm show values vm/victoria-metrics-alert > vm-alert-values.yaml | kube1:~/vm# helm show values vm/victoria-metrics-alert > vm-alert-values.yaml | ||
+ | |||
+ | $ wget -qO - https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/host-and-hardware/node-exporter.yml | sed 's/^/ /' | ||
+ | |||
+ | $ wget -qO - https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/kubernetes/kubestate-exporter.yml | sed 's/^/ /' | ||
kube1:~/vm# cat vm-alert-values.yaml | kube1:~/vm# cat vm-alert-values.yaml | ||
Line 62: | Line 79: | ||
# groups: [] | # groups: [] | ||
groups: | groups: | ||
- | - name: node_exporter_alerts | + | - name: NodeExporter |
... | ... | ||
- | - name: vm_k8s_alerts | + | - name: KubestateExporter |
rules: | rules: | ||
- | - alert: CriticalCPU | + | - alert: KubernetesContainerOomKiller |
- | expr: sum by (kubernetes_io_hostname) (rate (container_cpu_usage_seconds_total[1m])) / sum (machine_cpu_cores) * 100 > 40 | + | expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 |
- | for: 1m | + | for: 0m |
labels: | labels: | ||
- | severity: "critical" | + | severity: warning |
annotations: | annotations: | ||
- | summary: "CriticalCPU {{ $labels.instance }}" | + | summary: Kubernetes Container oom killer (instance {{ $labels.instance }}) |
- | + | description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled" | |
- | - alert: CriticalFS | + | |
- | expr: container_fs_usage_bytes{device=~"^/dev/[sv]d[a-z][1-9]$"} / container_fs_limit_bytes * 100 > 80 | + | |
- | for: 1m | + | |
- | labels: | + | |
- | severity: "critical" | + | |
- | annotations: | + | |
- | summary: "CriticalFS {{ $labels.instance }}" | + | |
- | + | ||
- | - alert: CriticalMEM | + | |
- | expr: sum by (kubernetes_io_hostname) (container_memory_working_set_bytes) / sum (machine_memory_bytes) * 100 > 80 | + | |
- | for: 1m | + | |
- | labels: | + | |
- | severity: "critical" | + | |
- | annotations: | + | |
- | summary: "CriticalMEM {{ $labels.instance }}" | + | |
... | ... | ||
alertmanager: | alertmanager: | ||
Line 169: | Line 170: | ||
Builder-> Filter: kubernetes.pod_name = my-debian | Builder-> Filter: kubernetes.pod_name = my-debian | ||
+ | </code> | ||
+ | |||
+ | ====== Черновик ====== | ||
+ | |||
+ | <code> | ||
+ | - alert: CriticalCPU | ||
+ | expr: sum by (kubernetes_io_hostname) (rate (container_cpu_usage_seconds_total[1m])) / sum (machine_cpu_cores) * 100 > 40 | ||
+ | for: 1m | ||
+ | labels: | ||
+ | severity: "critical" | ||
+ | annotations: | ||
+ | summary: "CriticalCPU {{ $labels.instance }}" | ||
+ | |||
+ | - alert: CriticalFS | ||
+ | expr: container_fs_usage_bytes{device=~"^/dev/[sv]d[a-z][1-9]$"} / container_fs_limit_bytes * 100 > 80 | ||
+ | for: 1m | ||
+ | labels: | ||
+ | severity: "critical" | ||
+ | annotations: | ||
+ | summary: "CriticalFS {{ $labels.instance }}" | ||
+ | |||
+ | - alert: CriticalMEM | ||
+ | expr: sum by (kubernetes_io_hostname) (container_memory_working_set_bytes) / sum (machine_memory_bytes) * 100 > 80 | ||
+ | for: 1m | ||
+ | labels: | ||
+ | severity: "critical" | ||
+ | annotations: | ||
+ | summary: "CriticalMEM {{ $labels.instance }}" | ||
</code> | </code> |