INFRA-3988 | Ashvin | Add autoscaling to Flink (#1251)
* INFRA-3988 | Ashvin | Add autoscaling to Flink
Autoscaling will only be supported by Flink versions 1.18 and later.
Migration is done using the following command
```shell
find . -name "manifest*flink*" | xargs -I{} sh -c "jq -r '.flink.flinkDeployment.flinkConfiguration += {\"autoscaler\": { \"enabled\": false }}' {} | sponge {}"
```
Adds validation in JSON schema
Updates kutegen with autoscaler changes
* INFRA-3988 | Ashvin | Remove promgateway configuration from Flink
This field is not required now that we use servicemonitor to fetch the
metrics. Earlier pushgateway was used.
Command used to do the migration is
```shell
find . -name "kub*flink*" | xargs -I {} sh -c "sed -i 's/promgate//' {}"
```
* INFRA-3988 | Ashvin | Add max-parallelism in test fixtures
Command used
```shell
find . -name "kube*flink*json" | xargs -I{} sh -c "jq -r '. | .kubeObject.items |= map(if .kind == \"FlinkDeployment\" then .spec.flinkConfiguration += { \"pipeline.max-parallelism\": \"100\" } else . end)' {} | sponge {}"
```
* Revert "INFRA-3988 | Ashvin | Remove promgateway configuration from Flink"
This reverts commit 3ef63ad31a587814275d5bad6f6d467cf638a0b9.
* INFRA-3988 | Ashvin | Remove promgateway configuration from Flink
This field is not required now that we use servicemonitor to fetch the
metrics. Earlier pushgateway was used.
Command used to do the migration is
```shell
find . -name "*flink*json" | xargs -I {} sh -c "sed -i '/promgate/d' {}"
```
* INFRA-3988 | Add Flink autoscaler field to all manifests
Only Flink manifests will be affected by this migration
* INFRA-3988 | Ashvin | Update kutegen
This commit is contained in:
2
kutegen
2
kutegen
Submodule kutegen updated: 79b0319624...4d98606cf0
32
scripts/add_flink_autoscaler_field.py
Normal file
32
scripts/add_flink_autoscaler_field.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import asyncio
|
||||
|
||||
from _migration_framework.abstract import AbstractMigration
|
||||
|
||||
|
||||
class CustomMigration(AbstractMigration):
    """Migration that seeds the Flink autoscaler flag on a manifest.

    Manifests whose ``type`` is ``'deployment'`` are handed back untouched.
    For every other manifest the nested path
    ``flink.flinkDeployment.flinkConfiguration.autoscaler.enabled`` is
    ensured to exist, defaulting ``enabled`` to ``False``.
    """

    def migrate(self, manifest: dict) -> dict:
        """Return ``manifest`` with the autoscaler flag defaulted in place.

        The manifest is mutated and the same object is returned. Missing
        intermediate objects along the path are created as empty dicts; an
        ``enabled`` value that is already present is left as-is.
        """
        # NOTE(review): only the 'deployment' type is skipped; a manifest with
        # any other type (or no 'type' key) receives the flink section --
        # confirm that matches the set of manifests fed to this migration.
        if manifest.get('type') == 'deployment':
            return manifest

        # Walk down the key path, creating each level only when it is absent.
        node = manifest
        for key in ('flink', 'flinkDeployment', 'flinkConfiguration', 'autoscaler'):
            node = node.setdefault(key, {})

        # setdefault keeps any value that was already configured.
        node.setdefault('enabled', False)
        return manifest
|
||||
|
||||
|
||||
async def main():
    """Build the autoscaler migration and await its ``run()`` coroutine."""
    portal_url = "https://deployment-portal-backend.np.navi-tech.in"
    runner = CustomMigration(
        portal_url,
        # Second positional argument is intentionally left blank in the
        # checked-in script. NOTE(review): presumably a credential to be
        # filled in locally before running -- confirm against
        # AbstractMigration.
        "",
        # Dry run is enabled by default. Uncomment this line to disable dry run
        # dry_run=False
    )
    await runner.run()


# Run the migration only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -12,7 +12,8 @@
|
||||
"type": "object",
|
||||
"required": [
|
||||
"taskManagerSlots",
|
||||
"flinkVersion"
|
||||
"flinkVersion",
|
||||
"autoscaler"
|
||||
],
|
||||
"properties": {
|
||||
"taskManagerSlots": {
|
||||
@@ -24,6 +25,43 @@
|
||||
"v1_17",
|
||||
"v1_19"
|
||||
]
|
||||
},
|
||||
"autoscaler": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"stabilizationInterval": {
|
||||
"type": "string",
|
||||
"pattern": "^\\d+m$"
|
||||
},
|
||||
"metricsWindow": {
|
||||
"type": "string",
|
||||
"pattern": "^\\d+m$"
|
||||
},
|
||||
"targetUtilizationBoundary": {
|
||||
"type": "string",
|
||||
"pattern": "^(0(\\.\\d+)?|1(\\.0)?)$"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"enabled"
|
||||
],
|
||||
"if": {
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"const": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"then": {
|
||||
"required": [
|
||||
"stabilizationInterval",
|
||||
"metricsWindow",
|
||||
"targetUtilizationBoundary"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -20,8 +20,6 @@
|
||||
"kubernetes.operator.pod-template.merge-arrays-by-name": "true",
|
||||
"kubernetes.operator.savepoint.history.max.count": "24",
|
||||
"kubernetes.taskmanager.cpu.limit-factor": "1.5",
|
||||
"metrics.reporter.promgateway.groupingKey": "tag_team=Infra",
|
||||
"metrics.reporter.promgateway.jobName": "test-job",
|
||||
"restart-strategy.exponential-delay.backoff-multiplier": "2.0",
|
||||
"restart-strategy.exponential-delay.initial-backoff": "10s",
|
||||
"restart-strategy.exponential-delay.jitter-factor": "0.1",
|
||||
@@ -31,7 +29,8 @@
|
||||
"state.backend.type": "filesystem",
|
||||
"state.checkpoints.dir": "s3://navi-flink-nonprod/jobs/dev/test-job/checkpoints",
|
||||
"state.savepoints.dir": "s3://navi-flink-nonprod/jobs/dev/test-job/savepoints",
|
||||
"taskmanager.numberOfTaskSlots": "2"
|
||||
"taskmanager.numberOfTaskSlots": "2",
|
||||
"pipeline.max-parallelism": "100"
|
||||
},
|
||||
"flinkVersion": "v1_17",
|
||||
"image": "193044292705.dkr.ecr.ap-south-1.amazonaws.com/common/flink:1.17.2-s3-hadoop",
|
||||
@@ -636,7 +635,7 @@
|
||||
"apiVersion": "policy/v1",
|
||||
"kind": "PodDisruptionBudget",
|
||||
"metadata": {
|
||||
"annotations": { },
|
||||
"annotations": {},
|
||||
"labels": {
|
||||
"Environment": "dev",
|
||||
"Name": "test-job",
|
||||
@@ -668,7 +667,7 @@
|
||||
"apiVersion": "policy/v1",
|
||||
"kind": "PodDisruptionBudget",
|
||||
"metadata": {
|
||||
"annotations": { },
|
||||
"annotations": {},
|
||||
"labels": {
|
||||
"Environment": "dev",
|
||||
"Name": "test-job",
|
||||
|
||||
@@ -183,8 +183,6 @@
|
||||
"kubernetes.operator.pod-template.merge-arrays-by-name": "true",
|
||||
"kubernetes.operator.savepoint.history.max.count": "24",
|
||||
"kubernetes.taskmanager.cpu.limit-factor": "1.5",
|
||||
"metrics.reporter.promgateway.groupingKey": "tag_team=Cloud-Platform",
|
||||
"metrics.reporter.promgateway.jobName": "delete-flink-test",
|
||||
"restart-strategy.exponential-delay.backoff-multiplier": "2.0",
|
||||
"restart-strategy.exponential-delay.initial-backoff": "10s",
|
||||
"restart-strategy.exponential-delay.jitter-factor": "0.1",
|
||||
@@ -194,7 +192,8 @@
|
||||
"state.backend.type": "filesystem",
|
||||
"state.checkpoints.dir": "s3://navi-flink-nonprod/jobs/dev/delete-flink-test/checkpoints",
|
||||
"state.savepoints.dir": "s3://navi-flink-nonprod/jobs/dev/delete-flink-test/savepoints",
|
||||
"taskmanager.numberOfTaskSlots": "2"
|
||||
"taskmanager.numberOfTaskSlots": "2",
|
||||
"pipeline.max-parallelism": "100"
|
||||
},
|
||||
"flinkVersion": "v1_17",
|
||||
"image": "193044292705.dkr.ecr.ap-south-1.amazonaws.com/common/flink:1.17.2-s3-hadoop",
|
||||
@@ -652,7 +651,7 @@
|
||||
"apiVersion": "policy/v1",
|
||||
"kind": "PodDisruptionBudget",
|
||||
"metadata": {
|
||||
"annotations": { },
|
||||
"annotations": {},
|
||||
"labels": {
|
||||
"Environment": "dev",
|
||||
"Name": "delete-flink-test",
|
||||
@@ -684,7 +683,7 @@
|
||||
"apiVersion": "policy/v1",
|
||||
"kind": "PodDisruptionBudget",
|
||||
"metadata": {
|
||||
"annotations": { },
|
||||
"annotations": {},
|
||||
"labels": {
|
||||
"Environment": "dev",
|
||||
"Name": "delete-flink-test",
|
||||
|
||||
@@ -208,8 +208,6 @@
|
||||
"kubernetes.operator.pod-template.merge-arrays-by-name": "true",
|
||||
"kubernetes.operator.savepoint.history.max.count": "24",
|
||||
"kubernetes.taskmanager.cpu.limit-factor": "1.5",
|
||||
"metrics.reporter.promgateway.groupingKey": "tag_team=Infra",
|
||||
"metrics.reporter.promgateway.jobName": "sample-flink-job-1",
|
||||
"restart-strategy.exponential-delay.backoff-multiplier": "2.0",
|
||||
"restart-strategy.exponential-delay.initial-backoff": "10s",
|
||||
"restart-strategy.exponential-delay.jitter-factor": "0.1",
|
||||
@@ -219,7 +217,8 @@
|
||||
"state.backend.type": "filesystem",
|
||||
"state.checkpoints.dir": "s3://navi-flink-navi-pay-nonprod/jobs/dev/sample-flink-job-1/checkpoints",
|
||||
"state.savepoints.dir": "s3://navi-flink-navi-pay-nonprod/jobs/dev/sample-flink-job-1/savepoints",
|
||||
"taskmanager.numberOfTaskSlots": "2"
|
||||
"taskmanager.numberOfTaskSlots": "2",
|
||||
"pipeline.max-parallelism": "100"
|
||||
},
|
||||
"flinkVersion": "v1_17",
|
||||
"image": "193044292705.dkr.ecr.ap-south-1.amazonaws.com/common/flink:1.17.2-s3-hadoop",
|
||||
@@ -266,11 +265,11 @@
|
||||
"name": "metrics"
|
||||
}
|
||||
],
|
||||
"volumeMounts": [ ]
|
||||
"volumeMounts": []
|
||||
}
|
||||
],
|
||||
"serviceAccountName": "sample-flink-job-1-navi-service-dev",
|
||||
"volumes": [ ]
|
||||
"volumes": []
|
||||
}
|
||||
},
|
||||
"restartNonce": 0,
|
||||
@@ -617,7 +616,7 @@
|
||||
"apiVersion": "policy/v1",
|
||||
"kind": "PodDisruptionBudget",
|
||||
"metadata": {
|
||||
"annotations": { },
|
||||
"annotations": {},
|
||||
"labels": {
|
||||
"Environment": "dev",
|
||||
"Name": "sample-flink-job-1",
|
||||
@@ -649,7 +648,7 @@
|
||||
"apiVersion": "policy/v1",
|
||||
"kind": "PodDisruptionBudget",
|
||||
"metadata": {
|
||||
"annotations": { },
|
||||
"annotations": {},
|
||||
"labels": {
|
||||
"Environment": "dev",
|
||||
"Name": "sample-flink-job-1",
|
||||
|
||||
@@ -207,8 +207,6 @@
|
||||
"kubernetes.operator.pod-template.merge-arrays-by-name": "true",
|
||||
"kubernetes.operator.savepoint.history.max.count": "24",
|
||||
"kubernetes.taskmanager.cpu.limit-factor": "1.5",
|
||||
"metrics.reporter.promgateway.groupingKey": "tag_team=Infra",
|
||||
"metrics.reporter.promgateway.jobName": "test-job",
|
||||
"restart-strategy.exponential-delay.backoff-multiplier": "2.0",
|
||||
"restart-strategy.exponential-delay.initial-backoff": "10s",
|
||||
"restart-strategy.exponential-delay.jitter-factor": "0.1",
|
||||
@@ -219,7 +217,8 @@
|
||||
"state.backend.type": "rocksdb",
|
||||
"state.checkpoints.dir": "s3://navi-flink-nonprod/jobs/dev/test-job/checkpoints",
|
||||
"state.savepoints.dir": "s3://navi-flink-nonprod/jobs/dev/test-job/savepoints",
|
||||
"taskmanager.numberOfTaskSlots": "2"
|
||||
"taskmanager.numberOfTaskSlots": "2",
|
||||
"pipeline.max-parallelism": "100"
|
||||
},
|
||||
"flinkVersion": "v1_17",
|
||||
"image": "193044292705.dkr.ecr.ap-south-1.amazonaws.com/common/flink:1.17.2-s3-hadoop",
|
||||
@@ -690,7 +689,7 @@
|
||||
"apiVersion": "policy/v1",
|
||||
"kind": "PodDisruptionBudget",
|
||||
"metadata": {
|
||||
"annotations": { },
|
||||
"annotations": {},
|
||||
"labels": {
|
||||
"Environment": "dev",
|
||||
"Name": "test-job",
|
||||
@@ -722,7 +721,7 @@
|
||||
"apiVersion": "policy/v1",
|
||||
"kind": "PodDisruptionBudget",
|
||||
"metadata": {
|
||||
"annotations": { },
|
||||
"annotations": {},
|
||||
"labels": {
|
||||
"Environment": "dev",
|
||||
"Name": "test-job",
|
||||
@@ -750,8 +749,6 @@
|
||||
"maxUnavailable": 0
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
],
|
||||
"kind": "List",
|
||||
"apiVersion": "v1"
|
||||
|
||||
@@ -93,7 +93,10 @@
|
||||
"flinkConfiguration": {
|
||||
"flinkVersion": "v1_17",
|
||||
"taskManagerSlots": 2,
|
||||
"savepointFrequency": "1h"
|
||||
"savepointFrequency": "1h",
|
||||
"autoscaler": {
|
||||
"enabled": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"flinkJob": {
|
||||
|
||||
@@ -85,7 +85,10 @@
|
||||
"flinkConfiguration": {
|
||||
"taskManagerSlots": 2,
|
||||
"savepointFrequency": "1d",
|
||||
"flinkVersion": "v1_17"
|
||||
"flinkVersion": "v1_17",
|
||||
"autoscaler": {
|
||||
"enabled": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"flinkJob": {
|
||||
|
||||
@@ -71,7 +71,10 @@
|
||||
"flinkConfiguration": {
|
||||
"taskManagerSlots": 2,
|
||||
"savepointFrequency": "1h",
|
||||
"flinkVersion": "v1_17"
|
||||
"flinkVersion": "v1_17",
|
||||
"autoscaler": {
|
||||
"enabled": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"flinkJob": {
|
||||
|
||||
@@ -59,7 +59,10 @@
|
||||
"flinkConfiguration": {
|
||||
"taskManagerSlots": 2,
|
||||
"savepointFrequency": "1h",
|
||||
"flinkVersion": "v1_17"
|
||||
"flinkVersion": "v1_17",
|
||||
"autoscaler": {
|
||||
"enabled": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"flinkJob": {
|
||||
|
||||
Reference in New Issue
Block a user