diff --git a/cmd/main.go b/cmd/main.go index 17d728c..6883d15 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -26,7 +26,7 @@ func main() { logger.InitLogger() appcontext.InitiateContext() appcontext.InitializeServices() - prometheus.MustRegister(metrics.SlackChannelCreationFailureCounter, metrics.RCAGenerationFailureCounter, metrics.ConferenceFailureCounter) + prometheus.MustRegister(metrics.SlackChannelCreationFailureCounter, metrics.RCAGenerationFailureCounter, metrics.KrakatoaWorkflowFailureCounter, metrics.ConferenceFailureCounter) command := &cobra.Command{ Use: "houston", diff --git a/internal/metrics/metric_publisher.go b/internal/metrics/metric_publisher.go index 8bd4783..c314a2b 100644 --- a/internal/metrics/metric_publisher.go +++ b/internal/metrics/metric_publisher.go @@ -42,6 +42,14 @@ func (amp *PublisherImpl) PublishMetrics(metricAttributes ingester.MetricAttribu return } + case ingester.KrakatoaWorkflowFailureMetrics: + { + if err := publishKrakatoaWorkflowFailureMetric(metricAttributes.KrakatoaWorkflowFailureMetric); err != nil { + logger.Error("error while publishing krakatoa workflow failure metrics", zap.Error(err)) + } + return + } + case ingester.ConferenceFailureMetrics: { if err := publishConferenceFailureMetric(metricAttributes.ConferenceFailureMetric); err != nil { @@ -97,6 +105,19 @@ func publishRCAGenerationFailureMetric(rcaGenerationFailureMetrics ingester.RCAG return } +func publishKrakatoaWorkflowFailureMetric(krakatoaWorkflowFailureMetrics ingester.KrakatoaWorkflowFailureMetric) (err error) { + defer func() { + if r := recover(); r != nil { + err = r.(error) + } + }() + KrakatoaWorkflowFailureCounter.WithLabelValues( + strconv.Itoa(int(krakatoaWorkflowFailureMetrics.IncidentId)), + krakatoaWorkflowFailureMetrics.KrakatoaWorkflowError, + ).Inc() + return +} + func publishConferenceFailureMetric(conferenceFailureMetrics ingester.ConferenceFailureMetric) (err error) { defer func() { if r := recover(); r != nil { diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 8c5abcc..d19d656 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -60,6 +60,13 @@ var ( []string{"incident_id", "rca_generation_error"}, ) + KrakatoaWorkflowFailureCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "krakatoa_workflow_failure", + Help: "Krakatoa workflow failure", + }, + []string{"incident_id", "krakatoa_workflow_error"}, + ) ConferenceFailureCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "conference_failure", diff --git a/model/ingester/performance_metrics.go b/model/ingester/performance_metrics.go index 521283b..3e41ad5 100644 --- a/model/ingester/performance_metrics.go +++ b/model/ingester/performance_metrics.go @@ -3,10 +3,11 @@ package ingester type MetricType string const ( - ApiMetrics MetricType = "API_METRICS" - SlackMetrics MetricType = "SLACK_METRICS" - RCAGenerationFailureMetrics MetricType = "RCA_GENERATION_FAILURE_METRICS" - ConferenceFailureMetrics MetricType = "CONFERENCE_FAILURE_METRICS" + ApiMetrics MetricType = "API_METRICS" + SlackMetrics MetricType = "SLACK_METRICS" + RCAGenerationFailureMetrics MetricType = "RCA_GENERATION_FAILURE_METRICS" + KrakatoaWorkflowFailureMetrics MetricType = "KRAKATOA_WORKFLOW_FAILURE_METRICS" + ConferenceFailureMetrics MetricType = "CONFERENCE_FAILURE_METRICS" ) type ApiMetric struct { @@ -32,6 +33,11 @@ type RCAGenerationFailureMetric struct { RCAGenerationError string `json:"rca_generation_error,omitempty"` } +type KrakatoaWorkflowFailureMetric struct { + IncidentId uint `json:"incident_id,omitempty"` + KrakatoaWorkflowError string `json:"krakatoa_workflow_error,omitempty"` +} + type ConferenceFailureMetric struct { Event string `json:"event,omitempty"` ConferenceError string `json:"conference_error,omitempty"` @@ -41,5 +47,6 @@ type MetricAttributes struct { ApiMetric SlackMetric RCAGenerationFailureMetric + KrakatoaWorkflowFailureMetric ConferenceFailureMetric } diff --git a/service/incident/incident_service_v2.go b/service/incident/incident_service_v2.go index 12947db..51eb7df 100644 --- a/service/incident/incident_service_v2.go +++ b/service/incident/incident_service_v2.go @@ -633,7 +633,7 @@ func (i *IncidentServiceV2) HandleKrakatoaWorkflow(incidentEntity *incident.Inci slackChannels := util.GetSlackChannelNamesFromIncidentEntities(krakatoaIncidents) - return i.krakatoaService.ExecuteKrakatoaWorkflow(incidentEntity.SlackChannel, slackChannels) + return i.krakatoaService.ExecuteKrakatoaWorkflow(incidentEntity.ID, incidentEntity.SlackChannel, slackChannels) } logger.Info("Incident is of severity 3, skipping krakatoa workflow") return nil diff --git a/service/krakatoa/krakatoa_service.go b/service/krakatoa/krakatoa_service.go index b5981e4..dd651dc 100644 --- a/service/krakatoa/krakatoa_service.go +++ b/service/krakatoa/krakatoa_service.go @@ -9,7 +9,9 @@ import ( "houston/common/util" "houston/common/util/channel" "houston/common/util/file" + "houston/internal/metrics" "houston/logger" + "houston/model/ingester" "houston/pkg/monitoringService" "houston/pkg/rest" monitoringServiceImpl "houston/service/monitoringService" @@ -34,23 +36,20 @@ func NewKrakatoaService() *KrakatoaService { const logTag = "[krakatoa]" -func (service *KrakatoaService) ExecuteKrakatoaWorkflow( - incidentSlackChannelId string, - krakatoaSlackChannelNames []string, -) []error { - directory, grafanaError, csvError := service.fetchKrakatoaDirectory(krakatoaSlackChannelNames) +func (service *KrakatoaService) ExecuteKrakatoaWorkflow(incidentId uint, incidentSlackChannelId string, krakatoaSlackChannelNames []string) []error { + directory, grafanaError, csvError := service.fetchKrakatoaDirectory(incidentId, krakatoaSlackChannelNames) - errors := service.handleAndPostKrakatoaFailuresToSlack(grafanaError, csvError, incidentSlackChannelId) + errs := service.handleAndPostKrakatoaFailuresToSlack(grafanaError, csvError, incidentSlackChannelId) if !util.IsBlank(directory) { if postErrors := service.FetchAndPostFilesToSlack( directory, incidentSlackChannelId, ); postErrors != nil && len(postErrors) > 0 { - errors = append(errors, postErrors...) + errs = append(errs, postErrors...) } } - return errors + return errs } func (service *KrakatoaService) GetGrafanaImages(requestId string, slackChannels []string) (path string, err error) { @@ -91,7 +90,7 @@ func (service *KrakatoaService) GetImpactedCustomers(requestId string, slackChan } } -func (service *KrakatoaService) fetchKrakatoaDirectory(slackChannels []string) (directory string, grafanaError error, csvError error) { +func (service *KrakatoaService) fetchKrakatoaDirectory(incidentId uint, slackChannels []string) (directory string, grafanaError error, csvError error) { requestId := uuid.New().String() var waitGroup sync.WaitGroup waitGroup.Add(util.KrakatoaProcessCount) @@ -102,6 +101,9 @@ func (service *KrakatoaService) fetchKrakatoaDirectory(slackChannels []string) ( directory = grafanaDirectory } grafanaError = err + if err != nil { + publishKrakatoaWorkflowFailureMetrics(incidentId, err) + } }) go util.ExecuteConcurrentAction(&waitGroup, func() { @@ -110,6 +112,9 @@ func (service *KrakatoaService) fetchKrakatoaDirectory(slackChannels []string) ( directory = csvDirectory } csvError = err + if err != nil { + publishKrakatoaWorkflowFailureMetrics(incidentId, err) + } }) waitGroup.Wait() @@ -118,36 +123,38 @@ func (service *KrakatoaService) fetchKrakatoaDirectory(slackChannels []string) ( } func (service *KrakatoaService) handleAndPostKrakatoaFailuresToSlack(grafanaError error, csvError error, channelId string) []error { - var errors []error + var errs []error if grafanaError != nil { + errs = append(errs, grafanaError) logger.Error("Error while fetching grafana images", zap.Error(grafanaError)) - service.slackService.PostMessageByChannelID("`Some issue occurred while getting Grafana images`", false, channelId) - errors = append(errors, grafanaError) + _, err := service.slackService.PostMessageByChannelID("`Some issue occurred while getting Grafana images`", false, channelId) + errs = append(errs, err) } if csvError != nil { + errs = append(errs, csvError) logger.Error("Error while fetching impacted customers", zap.Error(csvError)) - service.slackService.PostMessageByChannelID("`Some issue occurred while getting impacted customers`", false, channelId) - errors = append(errors, csvError) + _, err := service.slackService.PostMessageByChannelID("`Some issue occurred while getting impacted customers`", false, channelId) + errs = append(errs, err) } - return errors + return errs } func (service *KrakatoaService) FetchAndPostFilesToSlack(directory string, channelId string) []error { - var errors []error + var errs []error err := service.FetchAndPostFilesWithGivenExtension(directory, channelId, util.ExtensionPNG) if err != nil { logger.Error("Error while posting grafana images to incident slack channel", zap.Error(err)) - errors = append(errors, err) + errs = append(errs, err) } err = service.FetchAndPostFilesWithGivenExtension(directory, channelId, util.ExtensionCSV) if err != nil { logger.Error("Error while posting impacted customers to incident slack channel", zap.Error(err)) - errors = append(errors, err) + errs = append(errs, err) } err = file.RemoveDirectory(directory) @@ -156,10 +163,10 @@ func (service *KrakatoaService) FetchAndPostFilesToSlack(directory string, chann fmt.Sprintf("%s Error occurred while removing directory: %s", logTag, directory), zap.Error(err), ) - errors = append(errors, err) + errs = append(errs, err) } - return errors + return errs } func (service *KrakatoaService) FetchAndPostFilesWithGivenExtension( @@ -180,3 +187,13 @@ func (service *KrakatoaService) FetchAndPostFilesWithGivenExtension( return nil } + +func publishKrakatoaWorkflowFailureMetrics(incidentId uint, err error) { + krakatoaWorkflowFailureMetric := ingester.MetricAttributes{ + KrakatoaWorkflowFailureMetric: ingester.KrakatoaWorkflowFailureMetric{ + IncidentId: incidentId, + KrakatoaWorkflowError: err.Error(), + }, + } + metrics.NewMetricPublisher().PublishMetrics(krakatoaWorkflowFailureMetric, ingester.KrakatoaWorkflowFailureMetrics) +} diff --git a/service/krakatoa/krakatoa_service_interface.go b/service/krakatoa/krakatoa_service_interface.go index 9b9e7d3..7f8176e 100644 --- a/service/krakatoa/krakatoa_service_interface.go +++ b/service/krakatoa/krakatoa_service_interface.go @@ -3,7 +3,7 @@ package krakatoa type IKrakatoaService interface { GetGrafanaImages(requestId string, slackChannels []string) (path string, err error) GetImpactedCustomers(requestId string, slackChannels []string) (path string, err error) - ExecuteKrakatoaWorkflow(incidentSlackChannelId string, krakatoaSlackChannelNames []string) []error + ExecuteKrakatoaWorkflow(incidentId uint, incidentSlackChannelId string, krakatoaSlackChannelNames []string) []error FetchAndPostFilesToSlack(directory string, channelId string) []error FetchAndPostFilesWithGivenExtension(directory string, channelId string, extension string) error } diff --git a/service/krakatoa/krakatoa_service_test.go b/service/krakatoa/krakatoa_service_test.go index 6600f1e..fdd9cab 100644 --- a/service/krakatoa/krakatoa_service_test.go +++ b/service/krakatoa/krakatoa_service_test.go @@ -44,7 +44,7 @@ func (suite *KrakatoaServiceSuite) Test_ExecuteKrakatoaWorkflow_BothSuccessCase( suite.monitoringService.GetImpactedCustomersCSVMock.Return() suite.slackService.UploadFilesToChannelMock.Return() - errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow("", []string{"testChannel"}) + errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow(1, "", []string{"testChannel"}) suite.Equal(0, len(errors)) @@ -75,9 +75,9 @@ func (suite *KrakatoaServiceSuite) Test_ExecuteKrakatoaWorkflow_SingleFailureSin suite.slackService.PostMessageByChannelIDMock.Return("", nil) suite.slackService.UploadFilesToChannelMock.Return() - errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow("", []string{"testChannel"}) + errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow(1, "", []string{"testChannel"}) - suite.Equal(1, len(errors)) + suite.Equal(2, len(errors)) //assert that directory cleanup is done after posting files to slack channel _, err := os.ReadDir(suite.mockDirectory) @@ -105,9 +105,9 @@ func (suite *KrakatoaServiceSuite) Test_ExecuteKrakatoaWorkflow_BothFailureCase( suite.monitoringService.GetImpactedCustomersCSVMock.Return() suite.slackService.PostMessageByChannelIDMock.Return("", nil) - errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow("", []string{"testChannel"}) + errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow(1, "", []string{"testChannel"}) - suite.Equal(2, len(errors)) + suite.Equal(4, len(errors)) //cleanup mock directory after test file.RemoveDirectory(suite.mockDirectory) @@ -122,9 +122,9 @@ func (suite *KrakatoaServiceSuite) Test_ExecuteKrakatoaWorkflow_BothTimeoutCase( suite.monitoringService.GetImpactedCustomersCSVMock.Return() suite.slackService.PostMessageByChannelIDMock.Return("", nil) - errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow("", []string{"testChannel"}) + errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow(1, "", []string{"testChannel"}) - suite.Equal(2, len(errors)) + suite.Equal(4, len(errors)) //cleanup mock directory after test file.RemoveDirectory(suite.mockDirectory) @@ -146,9 +146,9 @@ func (suite *KrakatoaServiceSuite) Test_ExecuteKrakatoaWorkflow_SingleTimeoutSin suite.slackService.PostMessageByChannelIDMock.Return("", nil) suite.slackService.UploadFilesToChannelMock.Return() - errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow("", []string{"testChannel"}) + errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow(1, "", []string{"testChannel"}) - suite.Equal(1, len(errors)) + suite.Equal(2, len(errors)) //assert that directory cleanup is done after posting files to slack channel _, err := os.ReadDir(suite.mockDirectory) @@ -173,9 +173,9 @@ func (suite *KrakatoaServiceSuite) Test_ExecuteKrakatoaWorkflow_SingleTimeoutSin suite.monitoringService.GetImpactedCustomersCSVMock.Return() suite.slackService.PostMessageByChannelIDMock.Return("", nil) - errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow("", []string{"testChannel"}) + errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow(1, "", []string{"testChannel"}) - suite.Equal(2, len(errors)) + suite.Equal(4, len(errors)) //cleanup mock directory after test file.RemoveDirectory(suite.mockDirectory) @@ -199,7 +199,7 @@ func (suite *KrakatoaServiceSuite) Test_ExecuteKrakatoaWorkflow_InvalidDirectory suite.monitoringService.GetImpactedCustomersCSVMock.Return() suite.slackService.UploadFilesToChannelMock.Return() - errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow("", []string{"testChannel"}) + errors := suite.KrakatoaService.ExecuteKrakatoaWorkflow(1, "", []string{"testChannel"}) suite.Equal(2, len(errors))