110 lines
5.3 KiB
Python
110 lines
5.3 KiB
Python
from kubernetes import client, config
|
|
|
|
|
|
def modify_prometheus_rules(api_instance, namespaces):
    """Install the OOM alerts on every PrometheusRule in the given namespaces.

    For each namespace that is not in the skip list, every PrometheusRule
    (``monitoring.coreos.com/v1``) is fetched, any previously installed
    ``PodOOMKilled`` / ``FrequentPodOOMKilled`` alerts are dropped from its
    first rule group, fresh copies of both alerts are appended to that group,
    and the resource is written back with a replace call.

    Rules without any group, or without a ``Team`` metadata label, are
    reported and left untouched.

    Args:
        api_instance: Object exposing ``list_namespaced_custom_object`` and
            ``replace_namespaced_custom_object`` (a Kubernetes
            ``CustomObjectsApi``).
        namespaces: Iterable of namespace names to process.
    """
    # Platform namespaces that must never be modified by this script.
    skipped_namespaces = frozenset({
        "monitoring", "logging", "kube-system", "istio-system", "kube-node-lease",
        "kube-public", "portieris", "telepresence", "twistlock", "velero", "wiz",
    })
    # Alerts owned by this script. Both are removed before re-adding so that
    # running the script repeatedly does not accumulate duplicates.
    managed_alerts = frozenset({"PodOOMKilled", "FrequentPodOOMKilled"})

    for namespace in namespaces:
        if namespace in skipped_namespaces:
            print(f"Skipping namespace: {namespace}")
            continue

        rules = api_instance.list_namespaced_custom_object(
            group="monitoring.coreos.com",
            version="v1",
            plural="prometheusrules",
            namespace=namespace,
        )

        for rule in rules.get("items", []):
            metadata = rule["metadata"]
            app_name = metadata["name"]
            print(f"Modifying PrometheusRule: {namespace} {app_name}")

            groups = (rule.get("spec") or {}).get("groups")
            if not groups:
                print(f"PrometheusRule: {namespace} {app_name} has no rules")
                continue

            labels = metadata.get("labels") or {}
            if "Team" not in labels:
                print(f"PrometheusRule: {namespace} {app_name} has no Team label")
                continue

            # Only the first group is managed. Recording rules have no
            # "alert" key, hence .get() instead of indexing.
            first_group = groups[0]
            kept_rules = [
                existing
                for existing in first_group.get("rules") or []
                if existing.get("alert") not in managed_alerts
            ]
            kept_rules.extend(_build_oom_alerts(namespace, app_name, labels["Team"]))
            first_group["rules"] = kept_rules

            # Write the modified resource back to the cluster.
            api_instance.replace_namespaced_custom_object(
                group="monitoring.coreos.com",
                version="v1",
                plural="prometheusrules",
                namespace=namespace,
                name=app_name,
                body=rule,
            )


def _build_oom_alerts(namespace, app_name, app_team):
    """Return the FrequentPodOOMKilled and PodOOMKilled alert rules for one app."""
    runbook = 'https://navihq.atlassian.net/wiki/spaces/IN/pages/279937094/Act+On+Pod+Alert'
    selector = f'namespace="{namespace}", container="{app_name}"'
    restarts = f'kube_pod_container_status_restarts_total{{{selector}}}'
    oom_killed = (
        'kube_pod_container_status_last_terminated_reason'
        f'{{{selector}, reason="OOMKilled"}} > 0'
    )
    # Four braces in an f-string emit the literal "{{ ... }}" that Prometheus
    # needs in order to expand the pod label inside the annotation.
    pod_template = f'Namespace: {namespace}, AppName: {app_name}; Pod: {{{{ $labels.pod }}}}'

    frequent_oom_alert = {
        "alert": 'FrequentPodOOMKilled',
        "annotations": {
            "description": f'{pod_template} is restarting multiple times because of OOMKilled',
            "summary": 'High Pod Failures',
            "runbook": runbook,
        },
        "expr": f'increase({restarts}[10m]) >= 2 AND ignoring(reason) {oom_killed}',
        "labels": {
            "severity": 'critical',
            "alertTeam": app_team,
            "appName": app_name,
        },
    }
    pod_oom_killed_alert = {
        "alert": 'PodOOMKilled',
        "annotations": {
            "description": f'{pod_template} killed because of OOMKilled',
            "summary": 'Pod OOMKilled',
            "runbook": runbook,
        },
        "expr": f'{restarts} - {restarts} offset 5m >= 1 AND ignoring(reason) {oom_killed}',
        "labels": {
            "severity": 'warning',
            "alertTeam": app_team,
            "appName": app_name,
        },
    }
    return [frequent_oom_alert, pod_oom_killed_alert]
|
|
|
|
|
|
if __name__ == "__main__":
    # kubeconfig contexts to process, one cluster per entry.
    target_contexts = (
        "nonprod.np.navi-tech.in",
        "aps1.prod.navi-tech.in",
        "aps1.np.navi-gi.in",
        "aps1.prod.navi-gi.in",
        "aps1.np.navi-sa.in",
        "aps1.prod.navi-sa.in",
        "aps1.np.navi-pay.in",
        "aps1.prod.navi-pay.in",
    )

    for context_name in target_contexts:
        print(f"Modifying PrometheusRule resources for Kubernetes context: {context_name}")
        # A failure in one context is reported and the loop moves on to the
        # next context instead of aborting the whole run.
        try:
            # Point the default client configuration at this context.
            config.load_kube_config(context=context_name)

            # Collect every namespace name in the cluster.
            namespace_names = [
                item.metadata.name
                for item in client.CoreV1Api().list_namespace().items
            ]

            # Patch the PrometheusRule resources through the custom-objects API.
            modify_prometheus_rules(client.CustomObjectsApi(), namespace_names)

            print(f"PrometheusRule modifications completed for Kubernetes context: {context_name}")
            print("--------------------------------------------------------------")
        except Exception as err:
            print(f"Error occurred for Kubernetes context {context_name}: {err}")
|