INFRA-2856: Added alerts for incident creation, incident resolution, and Zenduty failures (#366)
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
||||
"github.com/gin-gonic/gin"
|
||||
"go.uber.org/zap"
|
||||
"gorm.io/gorm"
|
||||
"houston/common/metrics"
|
||||
"houston/common/util"
|
||||
"houston/logger"
|
||||
"houston/service"
|
||||
@@ -54,6 +55,7 @@ func (handler *IncidentHandler) HandleCreateIncident(c *gin.Context) {
|
||||
incidentResponse, err := handler.service.CreateIncident(createIncidentRequest, "API", "")
|
||||
if err != nil {
|
||||
logger.Error(fmt.Sprintf("%s Failed to create incident", logTag), zap.Error(err))
|
||||
metrics.PublishIncidentCreationFailureMetric()
|
||||
c.JSON(http.StatusInternalServerError, common.ErrorResponse(err, http.StatusInternalServerError, nil))
|
||||
return
|
||||
}
|
||||
@@ -117,6 +119,7 @@ func (handler *IncidentHandler) HandleResolveIncident(c *gin.Context) {
|
||||
|
||||
err = handler.service.ResolveIncident(resolveIncidentRequest, userEmail)
|
||||
if err != nil {
|
||||
metrics.PublishIncidentResolutionFailureMetric()
|
||||
common.HandleErrorResponse(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
10
cmd/main.go
10
cmd/main.go
@@ -26,7 +26,15 @@ func main() {
|
||||
logger.InitLogger()
|
||||
appcontext.InitiateContext()
|
||||
appcontext.InitializeServices()
|
||||
prometheus.MustRegister(metrics.SlackChannelCreationFailureCounter, metrics.RCAGenerationFailureCounter, metrics.KrakatoaWorkflowFailureCounter, metrics.ConferenceFailureCounter)
|
||||
prometheus.MustRegister(
|
||||
metrics.SlackChannelCreationFailureCounter,
|
||||
metrics.RCAGenerationFailureCounter,
|
||||
metrics.KrakatoaWorkflowFailureCounter,
|
||||
metrics.ConferenceFailureCounter,
|
||||
metrics.ZendutyCreationFailureCounter,
|
||||
metrics.IncidentCreationFailureCounter,
|
||||
metrics.IncidentResolutionFailureCounter,
|
||||
)
|
||||
|
||||
command := &cobra.Command{
|
||||
Use: "houston",
|
||||
|
||||
17
common/metrics/metrics.go
Normal file
17
common/metrics/metrics.go
Normal file
@@ -0,0 +1,17 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"houston/internal/metrics"
|
||||
"houston/logger"
|
||||
"houston/model/ingester"
|
||||
)
|
||||
|
||||
func PublishIncidentCreationFailureMetric() {
|
||||
logger.Info("Publishing incident creation failure metric")
|
||||
metrics.NewMetricPublisher().PublishMetrics(ingester.MetricAttributes{}, ingester.IncidentCreationFailureMetrics)
|
||||
}
|
||||
|
||||
func PublishIncidentResolutionFailureMetric() {
|
||||
logger.Info("Publishing incident resolution failure metric")
|
||||
metrics.NewMetricPublisher().PublishMetrics(ingester.MetricAttributes{}, ingester.IncidentResolutionFailureMetrics)
|
||||
}
|
||||
@@ -57,6 +57,28 @@ func (amp *PublisherImpl) PublishMetrics(metricAttributes ingester.MetricAttribu
|
||||
}
|
||||
return
|
||||
}
|
||||
case ingester.ZendutyCreationFailureMetrics:
|
||||
{
|
||||
if err := publishZendutyCreationFailureMetric(); err != nil {
|
||||
logger.Error("error while publishing zenduty creation failure metrics", zap.Error(err))
|
||||
}
|
||||
return
|
||||
}
|
||||
case ingester.IncidentCreationFailureMetrics:
|
||||
{
|
||||
if err := publishIncidentCreationFailureMetric(); err != nil {
|
||||
logger.Error("error while publishing incident creation failure metrics", zap.Error(err))
|
||||
}
|
||||
return
|
||||
}
|
||||
case ingester.IncidentResolutionFailureMetrics:
|
||||
{
|
||||
if err := publishIncidentResolutionFailureMetric(); err != nil {
|
||||
logger.Error("error while publishing incident resolution failure metrics", zap.Error(err))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
default:
|
||||
{
|
||||
return
|
||||
@@ -130,3 +152,33 @@ func publishConferenceFailureMetric(conferenceFailureMetrics ingester.Conference
|
||||
).Inc()
|
||||
return
|
||||
}
|
||||
|
||||
func publishZendutyCreationFailureMetric() (err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
ZendutyCreationFailureCounter.WithLabelValues().Inc()
|
||||
return
|
||||
}
|
||||
|
||||
func publishIncidentCreationFailureMetric() (err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
IncidentCreationFailureCounter.WithLabelValues().Inc()
|
||||
return
|
||||
}
|
||||
|
||||
func publishIncidentResolutionFailureMetric() (err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
IncidentResolutionFailureCounter.WithLabelValues().Inc()
|
||||
return
|
||||
}
|
||||
|
||||
@@ -74,4 +74,28 @@ var (
|
||||
},
|
||||
[]string{"event", "conference_error"},
|
||||
)
|
||||
|
||||
ZendutyCreationFailureCounter = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "houston_zenduty_creation_failure",
|
||||
Help: "Houston zenduty creation failure",
|
||||
},
|
||||
[]string{},
|
||||
)
|
||||
|
||||
IncidentCreationFailureCounter = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "houston_incident_creation_failure",
|
||||
Help: "Houston incident creation failure",
|
||||
},
|
||||
[]string{},
|
||||
)
|
||||
|
||||
IncidentResolutionFailureCounter = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "houston_incident_resolution_failure",
|
||||
Help: "Houston incident resolution failure",
|
||||
},
|
||||
[]string{},
|
||||
)
|
||||
)
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"github.com/spf13/viper"
|
||||
"go.uber.org/zap"
|
||||
"houston/common/jira"
|
||||
"houston/common/metrics"
|
||||
"houston/common/util"
|
||||
"houston/internal/processor/action/view"
|
||||
"houston/logger"
|
||||
@@ -120,6 +121,7 @@ func (action *IncidentRCASectionAction) PerformSetIncidentRCADetailsAction(
|
||||
*request, incidentEntity, callback.User.ID, string(requesterType),
|
||||
)
|
||||
if err != nil {
|
||||
metrics.PublishIncidentResolutionFailureMetric()
|
||||
logger.Error(fmt.Sprintf("Error while resolving incident with id: %d", incidentEntity.ID), zap.Error(err))
|
||||
}
|
||||
return
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"github.com/spf13/viper"
|
||||
"go.uber.org/zap"
|
||||
"houston/appcontext"
|
||||
"houston/common/metrics"
|
||||
"houston/common/util"
|
||||
"houston/internal"
|
||||
"houston/internal/processor/action/view"
|
||||
@@ -91,6 +92,7 @@ func (action *StartIncidentCommandAction) startIncidentWithParams(
|
||||
createIncidentResponse, err := appcontext.GetIncidentService().CreateIncident(*createIncidentRequest, "SLACK", cmd.ChannelID)
|
||||
if err != nil {
|
||||
logger.Error(fmt.Sprintf("%s failed to create incident. %+v", startIncidentActionLogTag, err))
|
||||
metrics.PublishIncidentCreationFailureMetric()
|
||||
return fmt.Errorf("failed to create incident")
|
||||
}
|
||||
logger.Info(fmt.Sprintf("%s incident created: %+v", startIncidentActionLogTag, createIncidentResponse))
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"gorm.io/gorm"
|
||||
"houston/common/metrics"
|
||||
"houston/common/util"
|
||||
"houston/internal/processor/action/view"
|
||||
"houston/logger"
|
||||
@@ -174,6 +175,7 @@ func (isp *CreateIncidentAction) CreateIncidentModalCommandProcessingV2(
|
||||
_, err = service.CreateIncident(*createIncidentRequest, "SLACK", callback.View.PrivateMetadata)
|
||||
if err != nil {
|
||||
logger.Error("[CIP] Error while creating incident", zap.Error(err))
|
||||
metrics.PublishIncidentCreationFailureMetric()
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -3,11 +3,14 @@ package ingester
|
||||
// MetricType names a category of metric. Metric publishers switch on this
// value to decide which metric to emit.
type MetricType string

// Known metric categories. Each identifier is declared exactly once; the
// string values are the identifiers used at runtime and must not change.
const (
	ApiMetrics                     MetricType = "API_METRICS"
	SlackMetrics                   MetricType = "SLACK_METRICS"
	RCAGenerationFailureMetrics    MetricType = "RCA_GENERATION_FAILURE_METRICS"
	KrakatoaWorkflowFailureMetrics MetricType = "KRAKATOA_WORKFLOW_FAILURE_METRICS"
	ConferenceFailureMetrics       MetricType = "CONFERENCE_FAILURE_METRICS"

	// Failure categories for Zenduty alert creation, incident creation and
	// incident resolution.
	ZendutyCreationFailureMetrics    MetricType = "HOUSTON_ZENDUTY_CREATION_FAILURE_METRICS"
	IncidentCreationFailureMetrics   MetricType = "HOUSTON_INCIDENT_CREATION_FAILURE_METRICS"
	IncidentResolutionFailureMetrics MetricType = "HOUSTON_INCIDENT_RESOLUTION_FAILURE_METRICS"
)
|
||||
|
||||
type ApiMetric struct {
|
||||
|
||||
@@ -7,8 +7,10 @@ import (
|
||||
"github.com/spf13/viper"
|
||||
"go.uber.org/zap"
|
||||
"houston/common/util"
|
||||
"houston/internal/metrics"
|
||||
"houston/logger"
|
||||
"houston/model/externalTeam"
|
||||
"houston/model/ingester"
|
||||
"houston/pkg/alertClient"
|
||||
request "houston/service/request"
|
||||
response "houston/service/response"
|
||||
@@ -49,8 +51,14 @@ func (alertService *AlertService) CreateIncidentAlert(incidentDTO response.Incid
|
||||
err = alertService.AlertClient.CreateIncident(alertRequest)
|
||||
if err != nil {
|
||||
logger.Error(fmt.Sprintf("Error while sending alert for teamId: %d", teamId))
|
||||
publishZendutyCreationFailureMetrics()
|
||||
return err
|
||||
}
|
||||
logger.Info(fmt.Sprintf("Succesfully sent alert for teamId: %d", teamId))
|
||||
return nil
|
||||
}
|
||||
|
||||
func publishZendutyCreationFailureMetrics() {
|
||||
logger.Info("Publishing zenduty creation failure metrics")
|
||||
metrics.NewMetricPublisher().PublishMetrics(ingester.MetricAttributes{}, ingester.ZendutyCreationFailureMetrics)
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"github.com/spf13/viper"
|
||||
"go.uber.org/zap"
|
||||
"gorm.io/gorm"
|
||||
"houston/common/metrics"
|
||||
"houston/common/util"
|
||||
houstonSlackUtil "houston/common/util/slack"
|
||||
"houston/internal/processor/action/view"
|
||||
@@ -200,6 +201,7 @@ func (i *IncidentServiceV2) CreateIncident(
|
||||
go func() {
|
||||
err := createIncidentWorkflow(i, channel, incidentEntity, teamEntity, severityEntity, incidentStatusEntity, blazeGroupChannelID)
|
||||
if err != nil {
|
||||
metrics.PublishIncidentCreationFailureMetric()
|
||||
return
|
||||
}
|
||||
i.HandleKrakatoaWorkflow(incidentEntity)
|
||||
|
||||
Reference in New Issue
Block a user