# Monitoring and Alerting

This guide covers monitoring, alerting, and observability strategies for CVSS Parser in production environments.

## Overview

Effective monitoring provides:

- System health and performance visibility
- Proactive issue detection
- Performance optimization insights
- Compliance and audit trails
- Incident response capabilities
## Metrics Collection

### Application Metrics

```go
type CVSSMetrics struct {
	// Processing metrics
	VectorsProcessed   prometheus.Counter
	ProcessingDuration prometheus.Histogram
	ProcessingErrors   *prometheus.CounterVec

	// Cache metrics
	CacheHits   prometheus.Counter
	CacheMisses prometheus.Counter
	CacheSize   prometheus.Gauge

	// Business metrics
	SeverityDistribution *prometheus.CounterVec
	VectorTypes          *prometheus.CounterVec

	// System metrics
	MemoryUsage    prometheus.Gauge
	GoroutineCount prometheus.Gauge
	GCDuration     prometheus.Histogram
}

func NewCVSSMetrics() *CVSSMetrics {
	metrics := &CVSSMetrics{
		VectorsProcessed: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "cvss_vectors_processed_total",
			Help: "Total number of CVSS vectors processed",
		}),
		ProcessingDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
			Name:    "cvss_processing_duration_seconds",
			Help:    "Time spent processing CVSS vectors",
			Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
		}),
		ProcessingErrors: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "cvss_processing_errors_total",
				Help: "Total number of processing errors by type",
			},
			[]string{"error_type"},
		),
		SeverityDistribution: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "cvss_severity_distribution_total",
				Help: "Distribution of CVSS severity levels",
			},
			[]string{"severity"},
		),
		// System gauges used by CollectSystemMetrics below;
		// cvss_memory_usage_bytes matches the memory alerting rule later in this guide.
		MemoryUsage: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "cvss_memory_usage_bytes",
			Help: "Current heap memory usage in bytes",
		}),
		GoroutineCount: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "cvss_goroutines",
			Help: "Current number of goroutines",
		}),
		// The remaining cache and GC collectors are initialized the same way.
	}

	// Register metrics
	prometheus.MustRegister(
		metrics.VectorsProcessed,
		metrics.ProcessingDuration,
		metrics.ProcessingErrors,
		metrics.SeverityDistribution,
		metrics.MemoryUsage,
		metrics.GoroutineCount,
	)

	return metrics
}

func (m *CVSSMetrics) RecordProcessing(duration time.Duration, severity string, err error) {
	m.VectorsProcessed.Inc()
	m.ProcessingDuration.Observe(duration.Seconds())
	m.SeverityDistribution.WithLabelValues(severity).Inc()

	if err != nil {
		errorType := categorizeError(err)
		m.ProcessingErrors.WithLabelValues(errorType).Inc()
	}
}
```
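To make these collectors scrapeable, the default registry can be exposed over HTTP with the `promhttp` handler from `client_golang`. Below is a minimal wiring sketch, assuming the metrics type above lives in the same package; the `:8080` port and `/metrics` path are conventional choices, not part of CVSS Parser:

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// NewCVSSMetrics registers its collectors with the default registry,
	// so promhttp.Handler() serves them without extra configuration.
	metrics := NewCVSSMetrics()
	_ = metrics // request handlers would call metrics.RecordProcessing(...)

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```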
### System Metrics Collection

```go
func (m *CVSSMetrics) CollectSystemMetrics() {
	go func() {
		ticker := time.NewTicker(30 * time.Second)
		defer ticker.Stop()

		for range ticker.C {
			var memStats runtime.MemStats
			runtime.ReadMemStats(&memStats)

			m.MemoryUsage.Set(float64(memStats.Alloc))
			m.GoroutineCount.Set(float64(runtime.NumGoroutine()))
		}
	}()
}
```
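The goroutine above runs for the life of the process. If the service needs a clean shutdown, a context-aware variant can be used instead; this is a sketch, not part of the type shown earlier:

```go
// CollectSystemMetricsWithContext is a variant that stops when the supplied
// context is cancelled, so the collector goroutine does not outlive the service.
func (m *CVSSMetrics) CollectSystemMetricsWithContext(ctx context.Context) {
	go func() {
		ticker := time.NewTicker(30 * time.Second)
		defer ticker.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				var memStats runtime.MemStats
				runtime.ReadMemStats(&memStats)
				m.MemoryUsage.Set(float64(memStats.Alloc))
				m.GoroutineCount.Set(float64(runtime.NumGoroutine()))
			}
		}
	}()
}
```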
## Logging

### Structured Logging

```go
type Logger struct {
	logger *logrus.Logger
}

func NewLogger(level string) *Logger {
	logger := logrus.New()
	logger.SetFormatter(&logrus.JSONFormatter{})

	logLevel, err := logrus.ParseLevel(level)
	if err != nil {
		logLevel = logrus.InfoLevel
	}
	logger.SetLevel(logLevel)

	return &Logger{logger: logger}
}

func (l *Logger) LogVectorProcessing(ctx context.Context, vector string, score float64, duration time.Duration) {
	l.logger.WithFields(logrus.Fields{
		"trace_id":  getTraceID(ctx),
		"vector":    vector,
		"score":     score,
		"duration":  duration.Milliseconds(),
		"severity":  getSeverityFromScore(score),
		"timestamp": time.Now().UTC(),
		"component": "cvss_processor",
	}).Info("Vector processed successfully")
}

func (l *Logger) LogError(ctx context.Context, err error, vector string) {
	l.logger.WithFields(logrus.Fields{
		"trace_id":   getTraceID(ctx),
		"error":      err.Error(),
		"vector":     vector,
		"error_type": categorizeError(err),
		"timestamp":  time.Now().UTC(),
		"component":  "cvss_processor",
	}).Error("Vector processing failed")
}
```
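The `getTraceID` and `getSeverityFromScore` helpers are referenced but not shown in this guide. As one possible implementation, `getTraceID` can sit on top of the OpenTelemetry integration described later and read the trace ID from the active span context:

```go
import (
	"context"

	"go.opentelemetry.io/otel/trace"
)

// getTraceID returns the current OpenTelemetry trace ID, or an empty string
// when the context carries no span (for example in tests).
func getTraceID(ctx context.Context) string {
	spanCtx := trace.SpanContextFromContext(ctx)
	if !spanCtx.HasTraceID() {
		return ""
	}
	return spanCtx.TraceID().String()
}
```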
### Log Aggregation

```
# Fluentd configuration
<source>
  @type tail
  path /var/log/cvss-service/*.log
  pos_file /var/log/fluentd/cvss-service.log.pos
  tag cvss.service
  format json
  time_key timestamp
  time_format %Y-%m-%dT%H:%M:%S.%LZ
</source>

<filter cvss.service>
  @type record_transformer
  <record>
    service cvss-parser
    environment ${ENV}
    version ${VERSION}
  </record>
</filter>

<match cvss.service>
  @type elasticsearch
  host elasticsearch.monitoring.svc.cluster.local
  port 9200
  index_name cvss-logs
  type_name _doc
</match>
```
## Alerting Rules

### Prometheus Alerting Rules

```yaml
groups:
  - name: cvss-service
    rules:
      - alert: CVSSHighErrorRate
        expr: rate(cvss_processing_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: cvss-parser
        annotations:
          summary: "High error rate in CVSS processing"
          description: "Error rate is {{ $value }} errors per second"

      - alert: CVSSHighLatency
        expr: histogram_quantile(0.95, rate(cvss_processing_duration_seconds_bucket[5m])) > 1.0
        for: 5m
        labels:
          severity: warning
          service: cvss-parser
        annotations:
          summary: "High latency in CVSS processing"
          description: "95th percentile latency is {{ $value }} seconds"

      - alert: CVSSServiceDown
        expr: up{job="cvss-service"} == 0
        for: 1m
        labels:
          severity: critical
          service: cvss-parser
        annotations:
          summary: "CVSS service is down"
          description: "CVSS service has been down for more than 1 minute"

      - alert: CVSSHighMemoryUsage
        expr: cvss_memory_usage_bytes > 1073741824  # 1 GiB
        for: 5m
        labels:
          severity: warning
          service: cvss-parser
        annotations:
          summary: "High memory usage in CVSS service"
          description: "Memory usage is {{ $value | humanize1024 }}B"

      - alert: CVSSCacheMissRate
        expr: rate(cvss_cache_misses_total[5m]) / (rate(cvss_cache_hits_total[5m]) + rate(cvss_cache_misses_total[5m])) > 0.8
        for: 10m
        labels:
          severity: warning
          service: cvss-parser
        annotations:
          summary: "High cache miss rate"
          description: "Cache miss rate is {{ $value | humanizePercentage }}"
```
### Alertmanager Configuration

```yaml
global:
  smtp_smarthost: 'smtp.company.com:587'
  smtp_from: 'alerts@company.com'

route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        service: cvss-parser
      receiver: 'cvss-team'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://slack-webhook/alerts'

  - name: 'critical-alerts'
    email_configs:
      - to: 'oncall@company.com'
        headers:
          Subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
    pagerduty_configs:
      - service_key: 'your-pagerduty-key'

  - name: 'cvss-team'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#cvss-alerts'
        title: 'CVSS Service Alert'
        text: '{{ .CommonAnnotations.summary }}'
```
## Dashboards

### Grafana Dashboard

```json
{
  "dashboard": {
    "title": "CVSS Parser Monitoring",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(cvss_vectors_processed_total[5m])",
            "legendFormat": "Requests/sec"
          }
        ]
      },
      {
        "title": "Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(cvss_processing_duration_seconds_bucket[5m]))",
            "legendFormat": "50th percentile"
          },
          {
            "expr": "histogram_quantile(0.95, rate(cvss_processing_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(cvss_processing_errors_total[5m])",
            "legendFormat": "Errors/sec"
          }
        ]
      },
      {
        "title": "Severity Distribution",
        "type": "pie",
        "targets": [
          {
            "expr": "cvss_severity_distribution_total",
            "legendFormat": "{{ severity }}"
          }
        ]
      }
    ]
  }
}
```
## Health Checks

### Comprehensive Health Monitoring

```go
type HealthMonitor struct {
	service   *CVSSService
	db        *sql.DB
	redis     *redis.Client
	lastCheck time.Time
	status    HealthStatus
	mutex     sync.RWMutex
}

type HealthStatus struct {
	Overall    string                     `json:"overall"`
	Components map[string]ComponentHealth `json:"components"`
	Timestamp  time.Time                  `json:"timestamp"`
	Uptime     time.Duration              `json:"uptime"`
}

type ComponentHealth struct {
	Status       string        `json:"status"`
	ResponseTime time.Duration `json:"response_time"`
	Error        string        `json:"error,omitempty"`
}

func (hm *HealthMonitor) CheckHealth() HealthStatus {
	hm.mutex.Lock()
	defer hm.mutex.Unlock()

	status := HealthStatus{
		Components: make(map[string]ComponentHealth),
		Timestamp:  time.Now(),
		Uptime:     time.Since(startTime),
	}

	// Check database
	start := time.Now()
	if err := hm.db.Ping(); err != nil {
		status.Components["database"] = ComponentHealth{
			Status:       "unhealthy",
			Error:        err.Error(),
			ResponseTime: time.Since(start),
		}
	} else {
		status.Components["database"] = ComponentHealth{
			Status:       "healthy",
			ResponseTime: time.Since(start),
		}
	}

	// Check Redis
	start = time.Now()
	if err := hm.redis.Ping().Err(); err != nil {
		status.Components["redis"] = ComponentHealth{
			Status:       "unhealthy",
			Error:        err.Error(),
			ResponseTime: time.Since(start),
		}
	} else {
		status.Components["redis"] = ComponentHealth{
			Status:       "healthy",
			ResponseTime: time.Since(start),
		}
	}

	// Check CVSS processing with a known-good vector
	start = time.Now()
	testVector := "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:L/I:L/A:L"
	if _, err := hm.service.ProcessVector(context.Background(), testVector); err != nil {
		status.Components["cvss_processing"] = ComponentHealth{
			Status:       "unhealthy",
			Error:        err.Error(),
			ResponseTime: time.Since(start),
		}
	} else {
		status.Components["cvss_processing"] = ComponentHealth{
			Status:       "healthy",
			ResponseTime: time.Since(start),
		}
	}

	// Determine overall status
	status.Overall = "healthy"
	for _, component := range status.Components {
		if component.Status != "healthy" {
			status.Overall = "unhealthy"
			break
		}
	}

	hm.status = status
	hm.lastCheck = time.Now()

	return status
}
```
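To expose this status to Kubernetes probes or load balancers, the check result can be served over HTTP, returning 503 when any component is unhealthy. A minimal sketch; the `/healthz` route in the usage comment is a common convention, not a documented CVSS Parser endpoint:

```go
import (
	"encoding/json"
	"net/http"
)

// healthHandler serves the latest check result as JSON and maps the overall
// status to an HTTP status code so probes and load balancers can act on it.
func healthHandler(hm *HealthMonitor) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		status := hm.CheckHealth()

		code := http.StatusOK
		if status.Overall != "healthy" {
			// 503 takes the instance out of rotation without killing it.
			code = http.StatusServiceUnavailable
		}

		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(code)
		_ = json.NewEncoder(w).Encode(status)
	}
}

// Registration, e.g. in main():
//   http.Handle("/healthz", healthHandler(monitor))
```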
## Distributed Tracing

### OpenTelemetry Integration

```go
func initTracing() {
	exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint("http://jaeger:14268/api/traces")))
	if err != nil {
		log.Fatal(err)
	}

	tp := trace.NewTracerProvider(
		trace.WithBatcher(exporter),
		trace.WithResource(resource.NewWithAttributes(
			semconv.SchemaURL,
			semconv.ServiceNameKey.String("cvss-parser"),
			semconv.ServiceVersionKey.String(version.Get()),
		)),
	)

	otel.SetTracerProvider(tp)
	otel.SetTextMapPropagator(propagation.TraceContext{})
}

func (s *CVSSService) ProcessVectorWithTracing(ctx context.Context, vectorStr string) (*VectorResult, error) {
	tracer := otel.Tracer("cvss-parser")

	ctx, span := tracer.Start(ctx, "process_vector")
	defer span.End()

	span.SetAttributes(
		attribute.String("cvss.vector", vectorStr),
		attribute.String("cvss.version", "3.1"),
	)

	// Parse vector
	ctx, parseSpan := tracer.Start(ctx, "parse_vector")
	vector, err := s.parser.Parse(vectorStr)
	parseSpan.End()
	if err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, "Failed to parse vector")
		return nil, err
	}

	// Calculate score
	ctx, calcSpan := tracer.Start(ctx, "calculate_score")
	score, err := s.calculator.Calculate(vector)
	calcSpan.End()
	if err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, "Failed to calculate score")
		return nil, err
	}

	span.SetAttributes(
		attribute.Float64("cvss.score", score),
		attribute.String("cvss.severity", s.calculator.GetSeverityRating(score)),
	)

	return &VectorResult{
		Vector:   vectorStr,
		Score:    score,
		Severity: s.calculator.GetSeverityRating(score),
	}, nil
}
```
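For incoming HTTP traffic, request spans can be created automatically so that `ProcessVectorWithTracing` attaches its child spans to them. One option is the `otelhttp` contrib instrumentation; the sketch below assumes it, and the route and handler names in the usage comment are illustrative:

```go
import (
	"net/http"

	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)

// wrapWithTracing wraps an HTTP handler so every request gets a server span;
// spans started from r.Context() inside the handler become its children.
func wrapWithTracing(mux http.Handler) http.Handler {
	return otelhttp.NewHandler(mux, "cvss-service")
}

// Usage sketch:
//   mux := http.NewServeMux()
//   mux.HandleFunc("/score", scoreHandler) // scoreHandler is illustrative
//   http.ListenAndServe(":8080", wrapWithTracing(mux))
```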
## Performance Monitoring

### SLA Monitoring

```go
type SLAMonitor struct {
	targets map[string]SLATarget
	metrics *CVSSMetrics
}

type SLATarget struct {
	Name        string
	Threshold   float64
	Window      time.Duration
	Description string
}

func NewSLAMonitor(metrics *CVSSMetrics) *SLAMonitor {
	return &SLAMonitor{
		metrics: metrics,
		targets: map[string]SLATarget{
			"availability": {
				Name:        "Service Availability",
				Threshold:   99.9,
				Window:      24 * time.Hour,
				Description: "Service should be available 99.9% of the time",
			},
			"latency_p95": {
				Name:        "95th Percentile Latency",
				Threshold:   500, // milliseconds
				Window:      5 * time.Minute,
				Description: "95% of requests should complete within 500ms",
			},
			"error_rate": {
				Name:        "Error Rate",
				Threshold:   1.0, // percentage
				Window:      5 * time.Minute,
				Description: "Error rate should be below 1%",
			},
		},
	}
}

func (sla *SLAMonitor) CheckSLA(target string) (bool, float64, error) {
	slaTarget, exists := sla.targets[target]
	if !exists {
		return false, 0, fmt.Errorf("unknown SLA target: %s", target)
	}

	switch target {
	case "availability":
		return sla.checkAvailability(slaTarget)
	case "latency_p95":
		return sla.checkLatency(slaTarget)
	case "error_rate":
		return sla.checkErrorRate(slaTarget)
	default:
		return false, 0, fmt.Errorf("unsupported SLA target: %s", target)
	}
}
```
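The `check*` methods are not shown above. One way to implement them is to query Prometheus itself through its HTTP API, reusing the metrics defined earlier. Below is a sketch of `checkErrorRate` under that assumption; the Prometheus address is illustrative:

```go
import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
)

// checkErrorRate evaluates the error-rate SLA by asking Prometheus for the
// percentage of failed vectors over the target window.
func (sla *SLAMonitor) checkErrorRate(target SLATarget) (bool, float64, error) {
	client, err := api.NewClient(api.Config{Address: "http://prometheus:9090"})
	if err != nil {
		return false, 0, err
	}

	window := fmt.Sprintf("%ds", int(target.Window.Seconds()))
	query := fmt.Sprintf(
		"100 * sum(rate(cvss_processing_errors_total[%s])) / sum(rate(cvss_vectors_processed_total[%s]))",
		window, window,
	)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	result, _, err := promv1.NewAPI(client).Query(ctx, query, time.Now())
	if err != nil {
		return false, 0, err
	}

	vec, ok := result.(model.Vector)
	if !ok || len(vec) == 0 {
		return false, 0, fmt.Errorf("no data returned for query %q", query)
	}

	errorRate := float64(vec[0].Value)
	return errorRate <= target.Threshold, errorRate, nil
}
```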
## Incident Response

### Automated Incident Detection

```go
type IncidentDetector struct {
	alertManager *AlertManager
	escalation   *EscalationPolicy
	logger       *Logger
}

func (id *IncidentDetector) HandleAlert(alert Alert) {
	incident := &Incident{
		ID:        generateIncidentID(),
		Alert:     alert,
		Severity:  alert.Severity,
		Status:    "open",
		CreatedAt: time.Now(),
		UpdatedAt: time.Now(),
	}

	// Log incident
	id.logger.LogIncident(incident)

	// Auto-remediation for known issues
	if remediation := id.getAutoRemediation(alert); remediation != nil {
		if err := remediation.Execute(); err != nil {
			id.logger.LogError(context.Background(), err, "Auto-remediation failed")
		} else {
			incident.Status = "auto-resolved"
			incident.Resolution = "Auto-remediated"
			incident.UpdatedAt = time.Now()
		}
	}

	// Escalate if needed
	if incident.Status == "open" {
		id.escalation.Escalate(incident)
	}
}
```
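The remediation type and `getAutoRemediation` are not defined in this guide. A minimal sketch of one way to model them follows; the `Remediation` interface, the `alert.Name` field, and `clearScoreCache` are hypothetical names used only for illustration:

```go
// Remediation is a hypothetical interface for automated fixes; Execute applies
// the fix and returns an error if it could not be completed.
type Remediation interface {
	Execute() error
}

// RemediationFunc adapts a plain function to the Remediation interface.
type RemediationFunc func() error

func (f RemediationFunc) Execute() error { return f() }

// getAutoRemediation maps known alerts to remediations; unknown alerts return
// nil so HandleAlert falls through to escalation.
func (id *IncidentDetector) getAutoRemediation(alert Alert) Remediation {
	switch alert.Name { // alert.Name is assumed to carry the Prometheus alert name
	case "CVSSHighMemoryUsage":
		// Hypothetical example: clear the score cache to release memory.
		return RemediationFunc(func() error {
			return id.clearScoreCache()
		})
	default:
		return nil
	}
}
```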
## Next Steps

After implementing monitoring, consider:

- Performance Optimization - Advanced optimization
- Security Monitoring - Security-focused monitoring
- Capacity Planning - Resource planning

## Related Documentation

- Metrics Reference - Complete metrics documentation
- Alerting Cookbook - Common alerting patterns
- Troubleshooting Guide - Common issues and solutions