"""Rewrite the OOM alerts of application PrometheusRule resources.

Script path: scripts/prometheus_rule_oom.py

For every PrometheusRule outside the skipped platform namespaces, any existing
``PodOOMKilled`` / ``FrequentPodOOMKilled`` alert in the first rule group is
dropped and a freshly generated pair is appended, then the resource is
replaced through the Kubernetes custom-objects API.
"""

# Namespaces whose PrometheusRules are never touched.
_SKIPPED_NAMESPACES = frozenset({
    "monitoring", "logging", "kube-system", "istio-system", "kube-node-lease",
    "kube-public", "portieris", "telepresence", "twistlock", "velero", "wiz",
})

# Coordinates of the PrometheusRule custom resource.
_PROMETHEUS_RULE_CRD = {
    "group": "monitoring.coreos.com",
    "version": "v1",
    "plural": "prometheusrules",
}

# Names of the alerts this script owns (removed before being re-added).
_MANAGED_ALERTS = ("FrequentPodOOMKilled", "PodOOMKilled")

_RUNBOOK_URL = (
    "https://navihq.atlassian.net/wiki/spaces/IN/pages/279937094/"
    "Act+On+Pod+Alert"
)

# Prometheus template placeholder. Kept as a plain (non f-) string so the
# double braces reach Prometheus verbatim instead of being collapsed to
# single braces by f-string escaping.
_POD_TEMPLATE = "{{ $labels.pod }}"


def _build_oom_alerts(namespace, app_name, app_team):
    """Return the [FrequentPodOOMKilled, PodOOMKilled] alert rules for an app."""
    selector = f'namespace="{namespace}", container="{app_name}"'
    restarts = f"kube_pod_container_status_restarts_total{{{selector}}}"
    oom_killed = (
        "kube_pod_container_status_last_terminated_reason"
        f'{{{selector}, reason="OOMKilled"}} > 0'
    )
    where = f"Namespace: {namespace}, AppName: {app_name}; Pod: {_POD_TEMPLATE}"

    def labels(severity):
        # NOTE(review): the label values carry literal double quotes, as in
        # the original script; confirm that alert routing expects them.
        return {
            "severity": severity,
            "alertTeam": f'"{app_team}"',
            "appName": f'"{app_name}"',
        }

    frequent_oom_alert = {
        "alert": "FrequentPodOOMKilled",
        "annotations": {
            "description": (
                f"{where} is restarting multiple times because of OOMKilled"
            ),
            "summary": "High Pod Failures",
            "runbook": _RUNBOOK_URL,
        },
        "expr": (
            f"increase({restarts}[10m]) >= 2 AND ignoring(reason) {oom_killed}"
        ),
        "labels": labels("critical"),
    }
    pod_oom_killed_alert = {
        "alert": "PodOOMKilled",
        "annotations": {
            "description": f"{where} killed because of OOMKilled",
            "summary": "Pod OOMKilled",
            "runbook": _RUNBOOK_URL,
        },
        "expr": (
            f"{restarts} - {restarts} offset 5m >= 1 "
            f"AND ignoring(reason) {oom_killed}"
        ),
        "labels": labels("warning"),
    }
    return [frequent_oom_alert, pod_oom_killed_alert]


def modify_prometheus_rules(api_instance, namespaces):
    """Regenerate the OOM alerts of every PrometheusRule in *namespaces*.

    Args:
        api_instance: Object exposing ``list_namespaced_custom_object`` and
            ``replace_namespaced_custom_object`` (called with keyword
            arguments only), e.g. a kubernetes ``CustomObjectsApi``.
        namespaces: Iterable of namespace names to process. Names listed in
            the skip set are ignored.

    A rule is left untouched (no replace call) when it has no rule groups or
    when its metadata carries no ``Team`` label. Only the first rule group is
    rewritten. The PrometheusRule name is used as both the container name in
    the alert expressions and the ``appName`` label.
    """
    for namespace in namespaces:
        if namespace in _SKIPPED_NAMESPACES:
            print(f"Skipping namespace: {namespace}")
            continue

        rules = api_instance.list_namespaced_custom_object(
            namespace=namespace, **_PROMETHEUS_RULE_CRD
        )

        for rule in rules.get("items", []):
            app_name = rule["metadata"]["name"]
            print(f"Modifying PrometheusRule: {namespace} {app_name}")

            groups = rule.get("spec", {}).get("groups") or []
            if not groups:
                print(f"PrometheusRule: {namespace} {app_name} has no rules")
                continue

            app_team = (rule["metadata"].get("labels") or {}).get("Team")
            if app_team is None:
                print(
                    f"PrometheusRule: {namespace} {app_name} has no Team label"
                )
                continue

            first_group = groups[0]
            # Drop both managed alerts so re-running the script does not
            # accumulate duplicates. ``.get`` keeps recording rules, which
            # have a "record" key and no "alert" key.
            kept = [
                r for r in first_group.get("rules") or []
                if r.get("alert") not in _MANAGED_ALERTS
            ]
            first_group["rules"] = kept + _build_oom_alerts(
                namespace, app_name, app_team
            )

            # Update the PrometheusRule resource with the modified rules.
            api_instance.replace_namespaced_custom_object(
                namespace=namespace,
                name=app_name,
                body=rule,
                **_PROMETHEUS_RULE_CRD,
            )


def main():
    """Rewrite the OOM alerts in every namespace of the configured cluster."""
    # Imported here so the rewrite logic above stays importable on machines
    # that do not have the kubernetes client installed.
    from kubernetes import client, config

    # Load the Kubernetes configuration from default location
    config.load_kube_config()

    # Create the API clients
    api_inst = client.CustomObjectsApi()
    v1 = client.CoreV1Api()

    namespaces = [item.metadata.name for item in v1.list_namespace().items]
    modify_prometheus_rules(api_inst, namespaces)

    print("PrometheusRule modifications completed.")


if __name__ == "__main__":
    main()