from kubernetes import client, config

# Namespaces whose PrometheusRule resources are never touched by this script.
SKIPPED_NAMESPACES = frozenset(
    {
        "monitoring",
        "logging",
        "kube-system",
        "istio-system",
        "kube-node-lease",
        "kube-public",
        "portieris",
        "telepresence",
        "twistlock",
        "velero",
        "wiz",
    }
)

# Coordinates of the PrometheusRule custom resource.
PROMETHEUS_RULE_GROUP = "monitoring.coreos.com"
PROMETHEUS_RULE_VERSION = "v1"
PROMETHEUS_RULE_PLURAL = "prometheusrules"

# Alerts owned by this script. Any existing copy is dropped before the fresh
# definitions are appended, so running the script repeatedly is idempotent.
MANAGED_ALERT_NAMES = frozenset({"FrequentPodOOMKilled", "PodOOMKilled"})

RUNBOOK_URL = (
    "https://navihq.atlassian.net/wiki/spaces/IN/pages/279937094/Act+On+Pod+Alert"
)

KUBERNETES_CONTEXTS = (
    "nonprod.np.navi-tech.in",
    "aps1.prod.navi-tech.in",
    "aps1.np.navi-gi.in",
    "aps1.prod.navi-gi.in",
    "aps1.np.navi-sa.in",
    "aps1.prod.navi-sa.in",
    "aps1.np.navi-pay.in",
    "aps1.prod.navi-pay.in",
)


def _build_oom_alerts(namespace, app_name, app_team):
    """Return the FrequentPodOOMKilled and PodOOMKilled alert definitions.

    Args:
        namespace: Namespace used in the metric selectors and description.
        app_name: Used as the ``container`` selector and ``appName`` label.
        app_team: Value written to the ``alertTeam`` label.

    Returns:
        A two-element list of alert rule dicts, frequent alert first.
    """
    selector = f'namespace="{namespace}", container="{app_name}"'
    restarts = f"kube_pod_container_status_restarts_total{{{selector}}}"
    oom_terminated = (
        "kube_pod_container_status_last_terminated_reason"
        f'{{{selector}, reason="OOMKilled"}} > 0'
    )
    # Four braces in an f-string emit the literal "{{" / "}}" that Prometheus
    # needs to expand $labels.pod; two braces would collapse to a single one
    # and the template would be left as plain text in the notification.
    pod_ref = f"Namespace: {namespace}, AppName: {app_name}; Pod: {{{{ $labels.pod }}}}"

    frequent_oom_alert = {
        "alert": "FrequentPodOOMKilled",
        "annotations": {
            "description": (
                f"{pod_ref} is restarting multiple times because of OOMKilled"
            ),
            "summary": "High Pod Failures",
            "runbook": RUNBOOK_URL,
        },
        "expr": (
            f"increase({restarts}[10m]) >= 2 AND ignoring(reason) {oom_terminated}"
        ),
        "labels": {
            "severity": "critical",
            "alertTeam": f"{app_team}",
            "appName": f"{app_name}",
        },
    }
    pod_oom_killed_alert = {
        "alert": "PodOOMKilled",
        "annotations": {
            "description": f"{pod_ref} killed because of OOMKilled",
            "summary": "Pod OOMKilled",
            "runbook": RUNBOOK_URL,
        },
        "expr": (
            f"{restarts} - {restarts} offset 5m >= 1 "
            f"AND ignoring(reason) {oom_terminated}"
        ),
        "labels": {
            "severity": "warning",
            "alertTeam": f"{app_team}",
            "appName": f"{app_name}",
        },
    }
    return [frequent_oom_alert, pod_oom_killed_alert]


def _update_rule(api_instance, namespace, rule):
    """Refresh the managed OOM alerts on one PrometheusRule and persist it.

    Only the first rule group is modified. The resource is left untouched
    (nothing is sent to the API) when it has no groups or no ``Team`` label.
    """
    app_name = rule["metadata"]["name"]
    print(f"Modifying PrometheusRule: {namespace} {app_name}")

    groups = (rule.get("spec") or {}).get("groups") or []
    if not groups:
        print(f"PrometheusRule: {namespace} {app_name} has no rules")
        return

    labels = rule["metadata"].get("labels") or {}
    if "Team" not in labels:
        print(f"PrometheusRule: {namespace} {app_name} has no Team label")
        return

    first_group = groups[0]
    # Recording rules carry "record" instead of "alert", hence .get().
    kept_rules = [
        existing
        for existing in first_group.get("rules") or []
        if existing.get("alert") not in MANAGED_ALERT_NAMES
    ]
    first_group["rules"] = kept_rules + _build_oom_alerts(
        namespace, app_name, labels["Team"]
    )

    api_instance.replace_namespaced_custom_object(
        group=PROMETHEUS_RULE_GROUP,
        version=PROMETHEUS_RULE_VERSION,
        plural=PROMETHEUS_RULE_PLURAL,
        namespace=namespace,
        name=app_name,
        body=rule,
    )


def modify_prometheus_rules(api_instance, namespaces):
    """Install the OOM-kill alerts on every PrometheusRule in ``namespaces``.

    For each namespace not listed in ``SKIPPED_NAMESPACES``, every
    PrometheusRule is fetched, any existing FrequentPodOOMKilled /
    PodOOMKilled alert in its first group is replaced with a fresh
    definition, and the resource is written back.

    Args:
        api_instance: Object exposing ``list_namespaced_custom_object`` and
            ``replace_namespaced_custom_object`` (the script passes a
            ``client.CustomObjectsApi``).
        namespaces: Iterable of namespace names to process.

    Raises:
        Whatever the API calls raise; errors are not caught here.
    """
    for namespace in namespaces:
        if namespace in SKIPPED_NAMESPACES:
            print(f"Skipping namespace: {namespace}")
            continue

        rules = api_instance.list_namespaced_custom_object(
            group=PROMETHEUS_RULE_GROUP,
            version=PROMETHEUS_RULE_VERSION,
            plural=PROMETHEUS_RULE_PLURAL,
            namespace=namespace,
        )
        for rule in rules.get("items", []):
            _update_rule(api_instance, namespace, rule)


def main():
    """Apply the alert update to every context in ``KUBERNETES_CONTEXTS``."""
    for context in KUBERNETES_CONTEXTS:
        print(f"Modifying PrometheusRule resources for Kubernetes context: {context}")
        try:
            # Load the kubeconfig for this context before building clients.
            config.load_kube_config(context=context)
            custom_objects_api = client.CustomObjectsApi()
            core_api = client.CoreV1Api()
            namespace_names = [
                item.metadata.name for item in core_api.list_namespace().items
            ]
            modify_prometheus_rules(custom_objects_api, namespace_names)
            print(
                f"PrometheusRule modifications completed for Kubernetes context: {context}"
            )
            print("--------------------------------------------------------------")
        except Exception as e:
            # Top-level boundary: report the failure and move on to the next
            # context instead of aborting the whole run.
            print(f"Error occurred for Kubernetes context {context}: {str(e)}")


if __name__ == "__main__":
    main()