INFRA-3988 | Ashvin | Add autoscaling to Flink (#1251)

* INFRA-3988 | Ashvin | Add autoscaling to Flink

Autoscaling will only be supported by Flink versions 1.18 and later.
The migration is done using the following command:
```shell
find . -name "manifest*flink*" | xargs -I{} sh -c "jq -r '.flink.flinkDeployment.flinkConfiguration += {\"autoscaler\": { \"enabled\": false }}' {} | sponge {}"
```
Adds validation in JSON schema
Updates kutegen with autoscaler changes

* INFRA-3988 | Ashvin | Remove promgateway configuration from Flink

This field is no longer required now that we use a ServiceMonitor to fetch
the metrics; previously, Pushgateway was used.

Command used to do the migration is
```shell
find . -name "kub*flink*" | xargs -I {} sh -c "sed -i 's/promgate//' {}"
```

* INFRA-3988 | Ashvin | Add max-parallelism in test fixtures

Command used
```shell
find . -name "kube*flink*json" | xargs -I{} sh -c "jq -r '. | .kubeObject.items |= map(if .kind == \"FlinkDeployment\" then .spec.flinkConfiguration += { \"pipeline.max-parallelism\": \"100\" } else . end)' {} | sponge {}"
```

* Revert "INFRA-3988 | Ashvin | Remove promgateway configuration from Flink"

This reverts commit 3ef63ad31a587814275d5bad6f6d467cf638a0b9.

* INFRA-3988 | Ashvin | Remove promgateway configuration from Flink

This field is no longer required now that we use a ServiceMonitor to fetch
the metrics; previously, Pushgateway was used.

Command used to do the migration is
```shell
find . -name "*flink*json" | xargs -I {} sh -c "sed -i '/promgate/d' {}"
```

* INFRA-3988 | Add Flink autoscaler field to all manifests

Only Flink manifests will be affected by this migration

* INFRA-3988 | Ashvin | Update kutegen
This commit is contained in:
Ashvin S
2024-11-05 18:57:37 +05:30
committed by GitHub
parent c6bf326819
commit 1009012a4d
11 changed files with 106 additions and 30 deletions

Submodule kutegen updated: 79b0319624...4d98606cf0

View File

@@ -0,0 +1,32 @@
import asyncio
from _migration_framework.abstract import AbstractMigration
class CustomMigration(AbstractMigration):
    """Migration that seeds ``flink.flinkDeployment.flinkConfiguration.autoscaler.enabled``.

    Manifests whose ``type`` is ``'deployment'`` are passed through untouched.
    Every other manifest gets the nested path created on demand, with
    ``enabled`` defaulting to ``False`` when it is not already present.
    """

    def migrate(self, manifest: dict) -> dict:
        """Return ``manifest`` after ensuring the autoscaler flag exists.

        The manifest is modified in place and the same object is returned.
        Existing values along the path (including an existing ``enabled``)
        are preserved because only ``setdefault`` is used.
        """
        # Plain deployments carry no Flink section; leave them as they are.
        if manifest.get('type') == 'deployment':
            return manifest

        # Walk down the nested path, creating empty dicts for missing levels.
        node = manifest
        for key in ('flink', 'flinkDeployment', 'flinkConfiguration', 'autoscaler'):
            node = node.setdefault(key, {})

        # Only fill in the flag when the manifest has not set it already.
        node.setdefault('enabled', False)
        return manifest
async def main():
    """Construct the migration against the hard-coded portal backend URL and run it."""
    portal_backend_url = "https://deployment-portal-backend.np.navi-tech.in"
    # NOTE(review): the second positional argument is intentionally blank in
    # the committed script; its meaning is defined by AbstractMigration — confirm
    # what must be supplied before running.
    migration = CustomMigration(
        portal_backend_url,
        "",
        # Dry run is enabled by default. Uncomment this line to disable dry run
        # dry_run=False
    )
    await migration.run()
if __name__ == "__main__":
    # Script entry point: create the main() coroutine and drive it to completion.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)

View File

@@ -12,7 +12,8 @@
"type": "object",
"required": [
"taskManagerSlots",
"flinkVersion"
"flinkVersion",
"autoscaler"
],
"properties": {
"taskManagerSlots": {
@@ -24,6 +25,43 @@
"v1_17",
"v1_19"
]
},
"autoscaler": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"stabilizationInterval": {
"type": "string",
"pattern": "^\\d+m$"
},
"metricsWindow": {
"type": "string",
"pattern": "^\\d+m$"
},
"targetUtilizationBoundary": {
"type": "string",
"pattern": "^(0(\\.\\d+)?|1(\\.0)?)$"
}
},
"required": [
"enabled"
],
"if": {
"properties": {
"enabled": {
"const": true
}
}
},
"then": {
"required": [
"stabilizationInterval",
"metricsWindow",
"targetUtilizationBoundary"
]
}
}
}
},

View File

@@ -20,8 +20,6 @@
"kubernetes.operator.pod-template.merge-arrays-by-name": "true",
"kubernetes.operator.savepoint.history.max.count": "24",
"kubernetes.taskmanager.cpu.limit-factor": "1.5",
"metrics.reporter.promgateway.groupingKey": "tag_team=Infra",
"metrics.reporter.promgateway.jobName": "test-job",
"restart-strategy.exponential-delay.backoff-multiplier": "2.0",
"restart-strategy.exponential-delay.initial-backoff": "10s",
"restart-strategy.exponential-delay.jitter-factor": "0.1",
@@ -31,7 +29,8 @@
"state.backend.type": "filesystem",
"state.checkpoints.dir": "s3://navi-flink-nonprod/jobs/dev/test-job/checkpoints",
"state.savepoints.dir": "s3://navi-flink-nonprod/jobs/dev/test-job/savepoints",
"taskmanager.numberOfTaskSlots": "2"
"taskmanager.numberOfTaskSlots": "2",
"pipeline.max-parallelism": "100"
},
"flinkVersion": "v1_17",
"image": "193044292705.dkr.ecr.ap-south-1.amazonaws.com/common/flink:1.17.2-s3-hadoop",
@@ -636,7 +635,7 @@
"apiVersion": "policy/v1",
"kind": "PodDisruptionBudget",
"metadata": {
"annotations": { },
"annotations": {},
"labels": {
"Environment": "dev",
"Name": "test-job",
@@ -668,7 +667,7 @@
"apiVersion": "policy/v1",
"kind": "PodDisruptionBudget",
"metadata": {
"annotations": { },
"annotations": {},
"labels": {
"Environment": "dev",
"Name": "test-job",

View File

@@ -183,8 +183,6 @@
"kubernetes.operator.pod-template.merge-arrays-by-name": "true",
"kubernetes.operator.savepoint.history.max.count": "24",
"kubernetes.taskmanager.cpu.limit-factor": "1.5",
"metrics.reporter.promgateway.groupingKey": "tag_team=Cloud-Platform",
"metrics.reporter.promgateway.jobName": "delete-flink-test",
"restart-strategy.exponential-delay.backoff-multiplier": "2.0",
"restart-strategy.exponential-delay.initial-backoff": "10s",
"restart-strategy.exponential-delay.jitter-factor": "0.1",
@@ -194,7 +192,8 @@
"state.backend.type": "filesystem",
"state.checkpoints.dir": "s3://navi-flink-nonprod/jobs/dev/delete-flink-test/checkpoints",
"state.savepoints.dir": "s3://navi-flink-nonprod/jobs/dev/delete-flink-test/savepoints",
"taskmanager.numberOfTaskSlots": "2"
"taskmanager.numberOfTaskSlots": "2",
"pipeline.max-parallelism": "100"
},
"flinkVersion": "v1_17",
"image": "193044292705.dkr.ecr.ap-south-1.amazonaws.com/common/flink:1.17.2-s3-hadoop",
@@ -652,7 +651,7 @@
"apiVersion": "policy/v1",
"kind": "PodDisruptionBudget",
"metadata": {
"annotations": { },
"annotations": {},
"labels": {
"Environment": "dev",
"Name": "delete-flink-test",
@@ -684,7 +683,7 @@
"apiVersion": "policy/v1",
"kind": "PodDisruptionBudget",
"metadata": {
"annotations": { },
"annotations": {},
"labels": {
"Environment": "dev",
"Name": "delete-flink-test",

View File

@@ -208,8 +208,6 @@
"kubernetes.operator.pod-template.merge-arrays-by-name": "true",
"kubernetes.operator.savepoint.history.max.count": "24",
"kubernetes.taskmanager.cpu.limit-factor": "1.5",
"metrics.reporter.promgateway.groupingKey": "tag_team=Infra",
"metrics.reporter.promgateway.jobName": "sample-flink-job-1",
"restart-strategy.exponential-delay.backoff-multiplier": "2.0",
"restart-strategy.exponential-delay.initial-backoff": "10s",
"restart-strategy.exponential-delay.jitter-factor": "0.1",
@@ -219,7 +217,8 @@
"state.backend.type": "filesystem",
"state.checkpoints.dir": "s3://navi-flink-navi-pay-nonprod/jobs/dev/sample-flink-job-1/checkpoints",
"state.savepoints.dir": "s3://navi-flink-navi-pay-nonprod/jobs/dev/sample-flink-job-1/savepoints",
"taskmanager.numberOfTaskSlots": "2"
"taskmanager.numberOfTaskSlots": "2",
"pipeline.max-parallelism": "100"
},
"flinkVersion": "v1_17",
"image": "193044292705.dkr.ecr.ap-south-1.amazonaws.com/common/flink:1.17.2-s3-hadoop",
@@ -266,11 +265,11 @@
"name": "metrics"
}
],
"volumeMounts": [ ]
"volumeMounts": []
}
],
"serviceAccountName": "sample-flink-job-1-navi-service-dev",
"volumes": [ ]
"volumes": []
}
},
"restartNonce": 0,
@@ -617,7 +616,7 @@
"apiVersion": "policy/v1",
"kind": "PodDisruptionBudget",
"metadata": {
"annotations": { },
"annotations": {},
"labels": {
"Environment": "dev",
"Name": "sample-flink-job-1",
@@ -649,7 +648,7 @@
"apiVersion": "policy/v1",
"kind": "PodDisruptionBudget",
"metadata": {
"annotations": { },
"annotations": {},
"labels": {
"Environment": "dev",
"Name": "sample-flink-job-1",

View File

@@ -207,8 +207,6 @@
"kubernetes.operator.pod-template.merge-arrays-by-name": "true",
"kubernetes.operator.savepoint.history.max.count": "24",
"kubernetes.taskmanager.cpu.limit-factor": "1.5",
"metrics.reporter.promgateway.groupingKey": "tag_team=Infra",
"metrics.reporter.promgateway.jobName": "test-job",
"restart-strategy.exponential-delay.backoff-multiplier": "2.0",
"restart-strategy.exponential-delay.initial-backoff": "10s",
"restart-strategy.exponential-delay.jitter-factor": "0.1",
@@ -219,7 +217,8 @@
"state.backend.type": "rocksdb",
"state.checkpoints.dir": "s3://navi-flink-nonprod/jobs/dev/test-job/checkpoints",
"state.savepoints.dir": "s3://navi-flink-nonprod/jobs/dev/test-job/savepoints",
"taskmanager.numberOfTaskSlots": "2"
"taskmanager.numberOfTaskSlots": "2",
"pipeline.max-parallelism": "100"
},
"flinkVersion": "v1_17",
"image": "193044292705.dkr.ecr.ap-south-1.amazonaws.com/common/flink:1.17.2-s3-hadoop",
@@ -690,7 +689,7 @@
"apiVersion": "policy/v1",
"kind": "PodDisruptionBudget",
"metadata": {
"annotations": { },
"annotations": {},
"labels": {
"Environment": "dev",
"Name": "test-job",
@@ -722,7 +721,7 @@
"apiVersion": "policy/v1",
"kind": "PodDisruptionBudget",
"metadata": {
"annotations": { },
"annotations": {},
"labels": {
"Environment": "dev",
"Name": "test-job",
@@ -750,8 +749,6 @@
"maxUnavailable": 0
}
}
],
"kind": "List",
"apiVersion": "v1"

View File

@@ -93,7 +93,10 @@
"flinkConfiguration": {
"flinkVersion": "v1_17",
"taskManagerSlots": 2,
"savepointFrequency": "1h"
"savepointFrequency": "1h",
"autoscaler": {
"enabled": false
}
}
},
"flinkJob": {

View File

@@ -85,7 +85,10 @@
"flinkConfiguration": {
"taskManagerSlots": 2,
"savepointFrequency": "1d",
"flinkVersion": "v1_17"
"flinkVersion": "v1_17",
"autoscaler": {
"enabled": false
}
}
},
"flinkJob": {

View File

@@ -71,7 +71,10 @@
"flinkConfiguration": {
"taskManagerSlots": 2,
"savepointFrequency": "1h",
"flinkVersion": "v1_17"
"flinkVersion": "v1_17",
"autoscaler": {
"enabled": false
}
}
},
"flinkJob": {

View File

@@ -59,7 +59,10 @@
"flinkConfiguration": {
"taskManagerSlots": 2,
"savepointFrequency": "1h",
"flinkVersion": "v1_17"
"flinkVersion": "v1_17",
"autoscaler": {
"enabled": false
}
}
},
"flinkJob": {