INFRA-2856: Added alerts for incident creation, incident resolution, and Zenduty failures (#366)

This commit is contained in:
Vijay Joshi
2024-02-12 19:11:02 +05:30
committed by GitHub
parent e88123ede6
commit f8b286adb1
11 changed files with 129 additions and 6 deletions

View File

@@ -6,6 +6,7 @@ import (
"github.com/gin-gonic/gin"
"go.uber.org/zap"
"gorm.io/gorm"
"houston/common/metrics"
"houston/common/util"
"houston/logger"
"houston/service"
@@ -54,6 +55,7 @@ func (handler *IncidentHandler) HandleCreateIncident(c *gin.Context) {
incidentResponse, err := handler.service.CreateIncident(createIncidentRequest, "API", "")
if err != nil {
logger.Error(fmt.Sprintf("%s Failed to create incident", logTag), zap.Error(err))
metrics.PublishIncidentCreationFailureMetric()
c.JSON(http.StatusInternalServerError, common.ErrorResponse(err, http.StatusInternalServerError, nil))
return
}
@@ -117,6 +119,7 @@ func (handler *IncidentHandler) HandleResolveIncident(c *gin.Context) {
err = handler.service.ResolveIncident(resolveIncidentRequest, userEmail)
if err != nil {
metrics.PublishIncidentResolutionFailureMetric()
common.HandleErrorResponse(c, err)
return
}

View File

@@ -26,7 +26,15 @@ func main() {
logger.InitLogger()
appcontext.InitiateContext()
appcontext.InitializeServices()
prometheus.MustRegister(metrics.SlackChannelCreationFailureCounter, metrics.RCAGenerationFailureCounter, metrics.KrakatoaWorkflowFailureCounter, metrics.ConferenceFailureCounter)
prometheus.MustRegister(
metrics.SlackChannelCreationFailureCounter,
metrics.RCAGenerationFailureCounter,
metrics.KrakatoaWorkflowFailureCounter,
metrics.ConferenceFailureCounter,
metrics.ZendutyCreationFailureCounter,
metrics.IncidentCreationFailureCounter,
metrics.IncidentResolutionFailureCounter,
)
command := &cobra.Command{
Use: "houston",

17
common/metrics/metrics.go Normal file
View File

@@ -0,0 +1,17 @@
package metrics
import (
"houston/internal/metrics"
"houston/logger"
"houston/model/ingester"
)
// PublishIncidentCreationFailureMetric logs that an incident creation failure
// is being reported, then publishes an ingester.IncidentCreationFailureMetrics
// event with empty metric attributes through the publisher returned by
// metrics.NewMetricPublisher.
func PublishIncidentCreationFailureMetric() {
	logger.Info("Publishing incident creation failure metric")

	publisher := metrics.NewMetricPublisher()
	emptyAttributes := ingester.MetricAttributes{}
	publisher.PublishMetrics(emptyAttributes, ingester.IncidentCreationFailureMetrics)
}
// PublishIncidentResolutionFailureMetric logs that an incident resolution
// failure is being reported, then publishes an
// ingester.IncidentResolutionFailureMetrics event with empty metric attributes
// through the publisher returned by metrics.NewMetricPublisher.
func PublishIncidentResolutionFailureMetric() {
	logger.Info("Publishing incident resolution failure metric")

	publisher := metrics.NewMetricPublisher()
	emptyAttributes := ingester.MetricAttributes{}
	publisher.PublishMetrics(emptyAttributes, ingester.IncidentResolutionFailureMetrics)
}

View File

@@ -57,6 +57,28 @@ func (amp *PublisherImpl) PublishMetrics(metricAttributes ingester.MetricAttribu
}
return
}
case ingester.ZendutyCreationFailureMetrics:
{
if err := publishZendutyCreationFailureMetric(); err != nil {
logger.Error("error while publishing zenduty creation failure metrics", zap.Error(err))
}
return
}
case ingester.IncidentCreationFailureMetrics:
{
if err := publishIncidentCreationFailureMetric(); err != nil {
logger.Error("error while publishing incident creation failure metrics", zap.Error(err))
}
return
}
case ingester.IncidentResolutionFailureMetrics:
{
if err := publishIncidentResolutionFailureMetric(); err != nil {
logger.Error("error while publishing incident resolution failure metrics", zap.Error(err))
}
return
}
default:
{
return
@@ -130,3 +152,33 @@ func publishConferenceFailureMetric(conferenceFailureMetrics ingester.Conference
).Inc()
return
}
// publishZendutyCreationFailureMetric increments ZendutyCreationFailureCounter
// by one.
//
// A panic raised while incrementing is recovered and returned as err when the
// panic value is an error. Any other panic value is re-raised unchanged, so the
// original failure is not replaced by a type-assertion panic inside the
// deferred handler.
func publishZendutyCreationFailureMetric() (err error) {
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		asErr, ok := recovered.(error)
		if !ok {
			// Not an error value: propagate the original panic as-is rather
			// than triggering a second panic from an unchecked assertion.
			panic(recovered)
		}
		err = asErr
	}()
	ZendutyCreationFailureCounter.WithLabelValues().Inc()
	return err
}
// publishIncidentCreationFailureMetric increments
// IncidentCreationFailureCounter by one.
//
// A panic raised while incrementing is recovered and returned as err when the
// panic value is an error. Any other panic value is re-raised unchanged, so the
// original failure is not replaced by a type-assertion panic inside the
// deferred handler.
func publishIncidentCreationFailureMetric() (err error) {
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		asErr, ok := recovered.(error)
		if !ok {
			// Not an error value: propagate the original panic as-is rather
			// than triggering a second panic from an unchecked assertion.
			panic(recovered)
		}
		err = asErr
	}()
	IncidentCreationFailureCounter.WithLabelValues().Inc()
	return err
}
// publishIncidentResolutionFailureMetric increments
// IncidentResolutionFailureCounter by one.
//
// A panic raised while incrementing is recovered and returned as err when the
// panic value is an error. Any other panic value is re-raised unchanged, so the
// original failure is not replaced by a type-assertion panic inside the
// deferred handler.
func publishIncidentResolutionFailureMetric() (err error) {
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		asErr, ok := recovered.(error)
		if !ok {
			// Not an error value: propagate the original panic as-is rather
			// than triggering a second panic from an unchecked assertion.
			panic(recovered)
		}
		err = asErr
	}()
	IncidentResolutionFailureCounter.WithLabelValues().Inc()
	return err
}

View File

@@ -74,4 +74,28 @@ var (
},
[]string{"event", "conference_error"},
)
// ZendutyCreationFailureCounter is a counter vector with no labels, exported
// under the name "houston_zenduty_creation_failure". It is incremented by
// publishZendutyCreationFailureMetric.
ZendutyCreationFailureCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "houston_zenduty_creation_failure",
Help: "Houston zenduty creation failure",
},
// No label dimensions: callers use WithLabelValues() with no arguments.
[]string{},
)
// IncidentCreationFailureCounter is a counter vector with no labels, exported
// under the name "houston_incident_creation_failure". It is incremented by
// publishIncidentCreationFailureMetric.
IncidentCreationFailureCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "houston_incident_creation_failure",
Help: "Houston incident creation failure",
},
// No label dimensions: callers use WithLabelValues() with no arguments.
[]string{},
)
// IncidentResolutionFailureCounter is a counter vector with no labels,
// exported under the name "houston_incident_resolution_failure". It is
// incremented by publishIncidentResolutionFailureMetric.
IncidentResolutionFailureCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "houston_incident_resolution_failure",
Help: "Houston incident resolution failure",
},
// No label dimensions: callers use WithLabelValues() with no arguments.
[]string{},
)
)

View File

@@ -7,6 +7,7 @@ import (
"github.com/spf13/viper"
"go.uber.org/zap"
"houston/common/jira"
"houston/common/metrics"
"houston/common/util"
"houston/internal/processor/action/view"
"houston/logger"
@@ -120,6 +121,7 @@ func (action *IncidentRCASectionAction) PerformSetIncidentRCADetailsAction(
*request, incidentEntity, callback.User.ID, string(requesterType),
)
if err != nil {
metrics.PublishIncidentResolutionFailureMetric()
logger.Error(fmt.Sprintf("Error while resolving incident with id: %d", incidentEntity.ID), zap.Error(err))
}
return

View File

@@ -7,6 +7,7 @@ import (
"github.com/spf13/viper"
"go.uber.org/zap"
"houston/appcontext"
"houston/common/metrics"
"houston/common/util"
"houston/internal"
"houston/internal/processor/action/view"
@@ -91,6 +92,7 @@ func (action *StartIncidentCommandAction) startIncidentWithParams(
createIncidentResponse, err := appcontext.GetIncidentService().CreateIncident(*createIncidentRequest, "SLACK", cmd.ChannelID)
if err != nil {
logger.Error(fmt.Sprintf("%s failed to create incident. %+v", startIncidentActionLogTag, err))
metrics.PublishIncidentCreationFailureMetric()
return fmt.Errorf("failed to create incident")
}
logger.Info(fmt.Sprintf("%s incident created: %+v", startIncidentActionLogTag, createIncidentResponse))

View File

@@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"gorm.io/gorm"
"houston/common/metrics"
"houston/common/util"
"houston/internal/processor/action/view"
"houston/logger"
@@ -174,6 +175,7 @@ func (isp *CreateIncidentAction) CreateIncidentModalCommandProcessingV2(
_, err = service.CreateIncident(*createIncidentRequest, "SLACK", callback.View.PrivateMetadata)
if err != nil {
logger.Error("[CIP] Error while creating incident", zap.Error(err))
metrics.PublishIncidentCreationFailureMetric()
return
}

View File

@@ -3,11 +3,14 @@ package ingester
type MetricType string
const (
ApiMetrics MetricType = "API_METRICS"
SlackMetrics MetricType = "SLACK_METRICS"
RCAGenerationFailureMetrics MetricType = "RCA_GENERATION_FAILURE_METRICS"
KrakatoaWorkflowFailureMetrics MetricType = "KRAKATOA_WORKFLOW_FAILURE_METRICS"
ConferenceFailureMetrics MetricType = "CONFERENCE_FAILURE_METRICS"
ApiMetrics MetricType = "API_METRICS"
SlackMetrics MetricType = "SLACK_METRICS"
RCAGenerationFailureMetrics MetricType = "RCA_GENERATION_FAILURE_METRICS"
KrakatoaWorkflowFailureMetrics MetricType = "KRAKATOA_WORKFLOW_FAILURE_METRICS"
ConferenceFailureMetrics MetricType = "CONFERENCE_FAILURE_METRICS"
ZendutyCreationFailureMetrics MetricType = "HOUSTON_ZENDUTY_CREATION_FAILURE_METRICS"
IncidentCreationFailureMetrics MetricType = "HOUSTON_INCIDENT_CREATION_FAILURE_METRICS"
IncidentResolutionFailureMetrics MetricType = "HOUSTON_INCIDENT_RESOLUTION_FAILURE_METRICS"
)
type ApiMetric struct {

View File

@@ -7,8 +7,10 @@ import (
"github.com/spf13/viper"
"go.uber.org/zap"
"houston/common/util"
"houston/internal/metrics"
"houston/logger"
"houston/model/externalTeam"
"houston/model/ingester"
"houston/pkg/alertClient"
request "houston/service/request"
response "houston/service/response"
@@ -49,8 +51,14 @@ func (alertService *AlertService) CreateIncidentAlert(incidentDTO response.Incid
err = alertService.AlertClient.CreateIncident(alertRequest)
if err != nil {
logger.Error(fmt.Sprintf("Error while sending alert for teamId: %d", teamId))
publishZendutyCreationFailureMetrics()
return err
}
logger.Info(fmt.Sprintf("Succesfully sent alert for teamId: %d", teamId))
return nil
}
// publishZendutyCreationFailureMetrics logs that a Zenduty creation failure is
// being reported, then publishes an ingester.ZendutyCreationFailureMetrics
// event with empty metric attributes through the publisher returned by
// metrics.NewMetricPublisher.
func publishZendutyCreationFailureMetrics() {
	logger.Info("Publishing zenduty creation failure metrics")

	publisher := metrics.NewMetricPublisher()
	emptyAttributes := ingester.MetricAttributes{}
	publisher.PublishMetrics(emptyAttributes, ingester.ZendutyCreationFailureMetrics)
}

View File

@@ -9,6 +9,7 @@ import (
"github.com/spf13/viper"
"go.uber.org/zap"
"gorm.io/gorm"
"houston/common/metrics"
"houston/common/util"
houstonSlackUtil "houston/common/util/slack"
"houston/internal/processor/action/view"
@@ -200,6 +201,7 @@ func (i *IncidentServiceV2) CreateIncident(
go func() {
err := createIncidentWorkflow(i, channel, incidentEntity, teamEntity, severityEntity, incidentStatusEntity, blazeGroupChannelID)
if err != nil {
metrics.PublishIncidentCreationFailureMetric()
return
}
i.HandleKrakatoaWorkflow(incidentEntity)