
# Monitoring and Alerting

This guide covers monitoring, alerting, and observability strategies for running CVSS Parser in production.

## Overview

Effective monitoring provides:

- System health and performance visibility
- Proactive issue detection
- Performance optimization insights
- Compliance and audit trails
- Incident response capabilities

## Metrics Collection

### Application Metrics

```go
type CVSSMetrics struct {
    // Processing metrics
    VectorsProcessed    prometheus.Counter
    ProcessingDuration  prometheus.Histogram
    ProcessingErrors    *prometheus.CounterVec
    
    // Cache metrics
    CacheHits          prometheus.Counter
    CacheMisses        prometheus.Counter
    CacheSize          prometheus.Gauge
    
    // Business metrics
    SeverityDistribution *prometheus.CounterVec
    VectorTypes         *prometheus.CounterVec
    
    // System metrics
    MemoryUsage        prometheus.Gauge
    GoroutineCount     prometheus.Gauge
    GCDuration         prometheus.Histogram
}

func NewCVSSMetrics() *CVSSMetrics {
    metrics := &CVSSMetrics{
        VectorsProcessed: prometheus.NewCounter(prometheus.CounterOpts{
            Name: "cvss_vectors_processed_total",
            Help: "Total number of CVSS vectors processed",
        }),
        ProcessingDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
            Name: "cvss_processing_duration_seconds",
            Help: "Time spent processing CVSS vectors",
            Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
        }),
        ProcessingErrors: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "cvss_processing_errors_total",
                Help: "Total number of processing errors by type",
            },
            []string{"error_type"},
        ),
        SeverityDistribution: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "cvss_severity_distribution_total",
                Help: "Distribution of CVSS severity levels",
            },
            []string{"severity"},
        ),
        CacheHits: prometheus.NewCounter(prometheus.CounterOpts{
            Name: "cvss_cache_hits_total",
            Help: "Total number of cache hits",
        }),
        CacheMisses: prometheus.NewCounter(prometheus.CounterOpts{
            Name: "cvss_cache_misses_total",
            Help: "Total number of cache misses",
        }),
        MemoryUsage: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "cvss_memory_usage_bytes",
            Help: "Current heap memory usage in bytes",
        }),
        GoroutineCount: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "cvss_goroutine_count",
            Help: "Current number of goroutines",
        }),
        // CacheSize, VectorTypes, and GCDuration are constructed and
        // registered the same way; omitted here for brevity.
    }
    
    // Register every collector: unregistered metrics are never scraped,
    // and the gauges would be nil (and panic) in CollectSystemMetrics.
    prometheus.MustRegister(
        metrics.VectorsProcessed,
        metrics.ProcessingDuration,
        metrics.ProcessingErrors,
        metrics.SeverityDistribution,
        metrics.CacheHits,
        metrics.CacheMisses,
        metrics.MemoryUsage,
        metrics.GoroutineCount,
    )
    
    return metrics
}

func (m *CVSSMetrics) RecordProcessing(duration time.Duration, severity string, err error) {
    m.VectorsProcessed.Inc()
    m.ProcessingDuration.Observe(duration.Seconds())
    
    if err != nil {
        // Count the failure and skip the severity label, which would be empty.
        m.ProcessingErrors.WithLabelValues(categorizeError(err)).Inc()
        return
    }
    
    m.SeverityDistribution.WithLabelValues(severity).Inc()
}
```
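
A hedged example of calling `RecordProcessing` from a request path follows; `svc.ProcessVector` and `result.Severity` are assumptions about the surrounding service (consistent with the types used later in this guide), not part of the metrics code above.

```go
// handleVector times one unit of work and records the outcome.
func handleVector(ctx context.Context, svc *CVSSService, metrics *CVSSMetrics, vectorStr string) error {
    start := time.Now()

    result, err := svc.ProcessVector(ctx, vectorStr) // assumed service method
    severity := ""
    if err == nil {
        severity = result.Severity
    }

    metrics.RecordProcessing(time.Since(start), severity, err)
    return err
}
```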

### System Metrics Collection

```go
func (m *CVSSMetrics) CollectSystemMetrics() {
    go func() {
        ticker := time.NewTicker(30 * time.Second)
        defer ticker.Stop()
        
        for range ticker.C {
            var memStats runtime.MemStats
            runtime.ReadMemStats(&memStats)
            
            m.MemoryUsage.Set(float64(memStats.Alloc))
            m.GoroutineCount.Set(float64(runtime.NumGoroutine()))
        }
    }()
}
```
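
The collectors above still need to be exposed for Prometheus to scrape. Below is a minimal sketch using the standard `promhttp` handler; the listen address and the `/metrics` path are conventional choices, not project defaults.

```go
import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// StartMetricsServer exposes the default Prometheus registry on /metrics.
func StartMetricsServer(addr string) *http.Server {
    mux := http.NewServeMux()
    mux.Handle("/metrics", promhttp.Handler())

    srv := &http.Server{Addr: addr, Handler: mux}
    go func() {
        // ListenAndServe blocks, so run it in the background.
        if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
            log.Printf("metrics server stopped: %v", err)
        }
    }()
    return srv
}
```

Calling something like `StartMetricsServer(":2112")` at startup gives the alerting rules below data to evaluate.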

## Logging

### Structured Logging

```go
type Logger struct {
    logger *logrus.Logger
}

func NewLogger(level string) *Logger {
    logger := logrus.New()
    logger.SetFormatter(&logrus.JSONFormatter{})
    
    logLevel, err := logrus.ParseLevel(level)
    if err != nil {
        logLevel = logrus.InfoLevel
    }
    logger.SetLevel(logLevel)
    
    return &Logger{logger: logger}
}

func (l *Logger) LogVectorProcessing(ctx context.Context, vector string, score float64, duration time.Duration) {
    l.logger.WithFields(logrus.Fields{
        "trace_id":   getTraceID(ctx),
        "vector":     vector,
        "score":      score,
        "duration":   duration.Milliseconds(),
        "severity":   getSeverityFromScore(score),
        "timestamp":  time.Now().UTC(),
        "component":  "cvss_processor",
    }).Info("Vector processed successfully")
}

func (l *Logger) LogError(ctx context.Context, err error, vector string) {
    l.logger.WithFields(logrus.Fields{
        "trace_id":   getTraceID(ctx),
        "error":      err.Error(),
        "vector":     vector,
        "error_type": categorizeError(err),
        "timestamp":  time.Now().UTC(),
        "component":  "cvss_processor",
    }).Error("Vector processing failed")
}
```
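
The helpers referenced above (`getTraceID`, `getSeverityFromScore`, `categorizeError`) are not defined in this guide. The sketches below are plausible stand-ins: the severity bands follow the CVSS v3.x specification, while the trace lookup and the error categories are assumptions.

```go
// getTraceID extracts the active OpenTelemetry trace ID from the context, if any.
// Uses the API package go.opentelemetry.io/otel/trace.
func getTraceID(ctx context.Context) string {
    sc := trace.SpanContextFromContext(ctx)
    if !sc.IsValid() {
        return ""
    }
    return sc.TraceID().String()
}

// getSeverityFromScore maps a CVSS v3.x base score to its qualitative rating.
func getSeverityFromScore(score float64) string {
    switch {
    case score == 0.0:
        return "None"
    case score < 4.0:
        return "Low"
    case score < 7.0:
        return "Medium"
    case score < 9.0:
        return "High"
    default:
        return "Critical"
    }
}

// categorizeError buckets errors into coarse types for metrics and log fields.
// ErrInvalidVector is an assumed sentinel error from the parser.
func categorizeError(err error) string {
    switch {
    case errors.Is(err, ErrInvalidVector):
        return "invalid_vector"
    case errors.Is(err, context.DeadlineExceeded):
        return "timeout"
    default:
        return "internal"
    }
}
```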

### Log Aggregation

```yaml
# Fluentd configuration
<source>
  @type tail
  path /var/log/cvss-service/*.log
  pos_file /var/log/fluentd/cvss-service.log.pos
  tag cvss.service
  format json
  time_key timestamp
  time_format %Y-%m-%dT%H:%M:%S.%LZ
</source>

<filter cvss.service>
  @type record_transformer
  <record>
    service cvss-parser
    environment ${ENV}
    version ${VERSION}
  </record>
</filter>

<match cvss.service>
  @type elasticsearch
  host elasticsearch.monitoring.svc.cluster.local
  port 9200
  index_name cvss-logs
  type_name _doc
</match>
```

## Alerting Rules

### Prometheus Alerting Rules

```yaml
groups:
- name: cvss-service
  rules:
  - alert: CVSSHighErrorRate
    expr: rate(cvss_processing_errors_total[5m]) > 0.1
    for: 2m
    labels:
      severity: warning
      service: cvss-parser
    annotations:
      summary: "High error rate in CVSS processing"
      description: "Error rate is {{ $value }} errors per second"

  - alert: CVSSHighLatency
    expr: histogram_quantile(0.95, rate(cvss_processing_duration_seconds_bucket[5m])) > 1.0
    for: 5m
    labels:
      severity: warning
      service: cvss-parser
    annotations:
      summary: "High latency in CVSS processing"
      description: "95th percentile latency is {{ $value }} seconds"

  - alert: CVSSServiceDown
    expr: up{job="cvss-service"} == 0
    for: 1m
    labels:
      severity: critical
      service: cvss-parser
    annotations:
      summary: "CVSS service is down"
      description: "CVSS service has been down for more than 1 minute"

  - alert: CVSSHighMemoryUsage
    expr: cvss_memory_usage_bytes > 1073741824  # 1GB
    for: 5m
    labels:
      severity: warning
      service: cvss-parser
    annotations:
      summary: "High memory usage in CVSS service"
      description: "Memory usage is {{ $value | humanizeBytes }}"

  - alert: CVSSCacheMissRate
    expr: rate(cvss_cache_misses_total[5m]) / (rate(cvss_cache_hits_total[5m]) + rate(cvss_cache_misses_total[5m])) > 0.8
    for: 10m
    labels:
      severity: warning
      service: cvss-parser
    annotations:
      summary: "High cache miss rate"
      description: "Cache miss rate is {{ $value | humanizePercentage }}"

### Alertmanager Configuration

```yaml
global:
  smtp_smarthost: 'smtp.company.com:587'
  smtp_from: 'alerts@company.com'

route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
  - match:
      severity: critical
    receiver: 'critical-alerts'
  - match:
      service: cvss-parser
    receiver: 'cvss-team'

receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://slack-webhook/alerts'

- name: 'critical-alerts'
  email_configs:
  - to: 'oncall@company.com'
    subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
    body: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      {{ end }}
  pagerduty_configs:
  - service_key: 'your-pagerduty-key'

- name: 'cvss-team'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#cvss-alerts'
    title: 'CVSS Service Alert'
    text: '{{ .CommonAnnotations.summary }}'
```

## Dashboards

### Grafana Dashboard

```json
{
  "dashboard": {
    "title": "CVSS Parser Monitoring",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(cvss_vectors_processed_total[5m])",
            "legendFormat": "Requests/sec"
          }
        ]
      },
      {
        "title": "Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(cvss_processing_duration_seconds_bucket[5m]))",
            "legendFormat": "50th percentile"
          },
          {
            "expr": "histogram_quantile(0.95, rate(cvss_processing_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(cvss_processing_errors_total[5m])",
            "legendFormat": "Errors/sec"
          }
        ]
      },
      {
        "title": "Severity Distribution",
        "type": "pie",
        "targets": [
          {
            "expr": "cvss_severity_distribution_total",
            "legendFormat": "{{ severity }}"
          }
        ]
      }
    ]
  }
}
```

## Health Checks

### Comprehensive Health Monitoring

```go
type HealthMonitor struct {
    service    *CVSSService
    db         *sql.DB
    redis      *redis.Client
    lastCheck  time.Time
    status     HealthStatus
    mutex      sync.RWMutex
}

type HealthStatus struct {
    Overall    string                 `json:"overall"`
    Components map[string]ComponentHealth `json:"components"`
    Timestamp  time.Time              `json:"timestamp"`
    Uptime     time.Duration          `json:"uptime"`
}

type ComponentHealth struct {
    Status      string        `json:"status"`
    ResponseTime time.Duration `json:"response_time"`
    Error       string        `json:"error,omitempty"`
}

func (hm *HealthMonitor) CheckHealth() HealthStatus {
    hm.mutex.Lock()
    defer hm.mutex.Unlock()
    
    status := HealthStatus{
        Components: make(map[string]ComponentHealth),
        Timestamp:  time.Now(),
        Uptime:     time.Since(startTime), // startTime: package-level var set at process start
    }
    
    // Check database
    start := time.Now()
    if err := hm.db.Ping(); err != nil {
        status.Components["database"] = ComponentHealth{
            Status: "unhealthy",
            Error:  err.Error(),
            ResponseTime: time.Since(start),
        }
    } else {
        status.Components["database"] = ComponentHealth{
            Status: "healthy",
            ResponseTime: time.Since(start),
        }
    }
    
    // Check Redis
    start = time.Now()
    if err := hm.redis.Ping().Err(); err != nil {
        status.Components["redis"] = ComponentHealth{
            Status: "unhealthy",
            Error:  err.Error(),
            ResponseTime: time.Since(start),
        }
    } else {
        status.Components["redis"] = ComponentHealth{
            Status: "healthy",
            ResponseTime: time.Since(start),
        }
    }
    
    // Check CVSS processing
    start = time.Now()
    testVector := "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:L/I:L/A:L"
    if _, err := hm.service.ProcessVector(context.Background(), testVector); err != nil {
        status.Components["cvss_processing"] = ComponentHealth{
            Status: "unhealthy",
            Error:  err.Error(),
            ResponseTime: time.Since(start),
        }
    } else {
        status.Components["cvss_processing"] = ComponentHealth{
            Status: "healthy",
            ResponseTime: time.Since(start),
        }
    }
    
    // Determine overall status
    status.Overall = "healthy"
    for _, component := range status.Components {
        if component.Status != "healthy" {
            status.Overall = "unhealthy"
            break
        }
    }
    
    hm.status = status
    hm.lastCheck = time.Now()
    
    return status
}
```
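
To make the monitor usable by load balancers and Kubernetes-style probes, it can be exposed over HTTP. A minimal sketch, assuming the service already has a mux to mount it on:

```go
// HealthHandler serves the latest health check as JSON and returns
// 503 Service Unavailable when any component is unhealthy.
func (hm *HealthMonitor) HealthHandler(w http.ResponseWriter, r *http.Request) {
    status := hm.CheckHealth()

    w.Header().Set("Content-Type", "application/json")
    if status.Overall != "healthy" {
        w.WriteHeader(http.StatusServiceUnavailable)
    }
    if err := json.NewEncoder(w).Encode(status); err != nil {
        log.Printf("health handler: encode failed: %v", err)
    }
}

// Registration, e.g.: mux.HandleFunc("/healthz", hm.HealthHandler)
```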

## Distributed Tracing

### OpenTelemetry Integration

```go
func initTracing() {
    exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint("http://jaeger:14268/api/traces")))
    if err != nil {
        log.Fatal(err)
    }
    
    tp := trace.NewTracerProvider(
        trace.WithBatcher(exporter),
        trace.WithResource(resource.NewWithAttributes(
            semconv.SchemaURL,
            semconv.ServiceNameKey.String("cvss-parser"),
            semconv.ServiceVersionKey.String(version.Get()),
        )),
    )
    
    otel.SetTracerProvider(tp)
    otel.SetTextMapPropagator(propagation.TraceContext{})
}

func (s *CVSSService) ProcessVectorWithTracing(ctx context.Context, vectorStr string) (*VectorResult, error) {
    tracer := otel.Tracer("cvss-parser")
    ctx, span := tracer.Start(ctx, "process_vector")
    defer span.End()
    
    span.SetAttributes(
        attribute.String("cvss.vector", vectorStr),
        attribute.String("cvss.version", "3.1"),
    )
    
    // Parse vector
    ctx, parseSpan := tracer.Start(ctx, "parse_vector")
    vector, err := s.parser.Parse(vectorStr)
    parseSpan.End()
    
    if err != nil {
        span.RecordError(err)
        span.SetStatus(codes.Error, "Failed to parse vector")
        return nil, err
    }
    
    // Calculate score
    ctx, calcSpan := tracer.Start(ctx, "calculate_score")
    score, err := s.calculator.Calculate(vector)
    calcSpan.End()
    
    if err != nil {
        span.RecordError(err)
        span.SetStatus(codes.Error, "Failed to calculate score")
        return nil, err
    }
    
    span.SetAttributes(
        attribute.Float64("cvss.score", score),
        attribute.String("cvss.severity", s.calculator.GetSeverityRating(score)),
    )
    
    return &VectorResult{
        Vector:   vectorStr,
        Score:    score,
        Severity: s.calculator.GetSeverityRating(score),
    }, nil
}
```
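
`initTracing` above never flushes buffered spans on shutdown. A hedged variant that returns the provider so the caller can defer `Shutdown`, assuming the same Jaeger exporter and the SDK package imported as `sdktrace` (`go.opentelemetry.io/otel/sdk/trace`):

```go
func initTracingWithShutdown() (*sdktrace.TracerProvider, error) {
    exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(
        jaeger.WithEndpoint("http://jaeger:14268/api/traces")))
    if err != nil {
        return nil, err
    }

    tp := sdktrace.NewTracerProvider(
        sdktrace.WithBatcher(exporter),
        sdktrace.WithResource(resource.NewWithAttributes(
            semconv.SchemaURL,
            semconv.ServiceNameKey.String("cvss-parser"),
        )),
    )
    otel.SetTracerProvider(tp)
    otel.SetTextMapPropagator(propagation.TraceContext{})
    return tp, nil
}

// In main:
//     tp, err := initTracingWithShutdown()
//     if err != nil {
//         log.Fatal(err)
//     }
//     defer tp.Shutdown(context.Background()) // flush any remaining spans
```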

## Performance Monitoring

### SLA Monitoring

```go
type SLAMonitor struct {
    targets map[string]SLATarget
    metrics *CVSSMetrics
}

type SLATarget struct {
    Name        string
    Threshold   float64
    Window      time.Duration
    Description string
}

func NewSLAMonitor(metrics *CVSSMetrics) *SLAMonitor {
    return &SLAMonitor{
        metrics: metrics,
        targets: map[string]SLATarget{
            "availability": {
                Name:        "Service Availability",
                Threshold:   99.9,
                Window:      24 * time.Hour,
                Description: "Service should be available 99.9% of the time",
            },
            "latency_p95": {
                Name:        "95th Percentile Latency",
                Threshold:   500, // milliseconds
                Window:      5 * time.Minute,
                Description: "95% of requests should complete within 500ms",
            },
            "error_rate": {
                Name:        "Error Rate",
                Threshold:   1.0, // percentage
                Window:      5 * time.Minute,
                Description: "Error rate should be below 1%",
            },
        },
    }
}

func (sla *SLAMonitor) CheckSLA(target string) (bool, float64, error) {
    slaTarget, exists := sla.targets[target]
    if !exists {
        return false, 0, fmt.Errorf("unknown SLA target: %s", target)
    }
    
    switch target {
    case "availability":
        return sla.checkAvailability(slaTarget)
    case "latency_p95":
        return sla.checkLatency(slaTarget)
    case "error_rate":
        return sla.checkErrorRate(slaTarget)
    default:
        return false, 0, fmt.Errorf("unsupported SLA target: %s", target)
    }
}
```
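
The `checkAvailability`, `checkLatency`, and `checkErrorRate` helpers are referenced but not shown. One plausible implementation is to evaluate each SLA directly against Prometheus; the sketch below does this for the error-rate target using the official Go client, with the Prometheus address assumed and the metric names taken from the rules above.

```go
import (
    "context"
    "fmt"
    "time"

    "github.com/prometheus/client_golang/api"
    promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
    "github.com/prometheus/common/model"
)

// checkErrorRate compares the observed error percentage over the target window
// against the SLA threshold. Returns (met, observed value, error).
func (sla *SLAMonitor) checkErrorRate(target SLATarget) (bool, float64, error) {
    client, err := api.NewClient(api.Config{Address: "http://prometheus:9090"}) // assumed address
    if err != nil {
        return false, 0, err
    }
    promAPI := promv1.NewAPI(client)

    window := model.Duration(target.Window).String()
    query := fmt.Sprintf(
        "100 * sum(rate(cvss_processing_errors_total[%s])) / sum(rate(cvss_vectors_processed_total[%s]))",
        window, window,
    )

    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()

    result, _, err := promAPI.Query(ctx, query, time.Now())
    if err != nil {
        return false, 0, err
    }

    vec, ok := result.(model.Vector)
    if !ok || len(vec) == 0 {
        return false, 0, fmt.Errorf("no data for query %q", query)
    }

    observed := float64(vec[0].Value)
    return observed <= target.Threshold, observed, nil
}
```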

## Incident Response

### Automated Incident Detection

```go
type IncidentDetector struct {
    alertManager *AlertManager
    escalation   *EscalationPolicy
    logger       *Logger
}

func (id *IncidentDetector) HandleAlert(alert Alert) {
    incident := &Incident{
        ID:          generateIncidentID(),
        Alert:       alert,
        Severity:    alert.Severity,
        Status:      "open",
        CreatedAt:   time.Now(),
        UpdatedAt:   time.Now(),
    }
    
    // Log incident
    id.logger.LogIncident(incident)
    
    // Auto-remediation for known issues
    if remediation := id.getAutoRemediation(alert); remediation != nil {
        if err := remediation.Execute(); err != nil {
            id.logger.LogError(context.Background(), err, "Auto-remediation failed")
        } else {
            incident.Status = "auto-resolved"
            incident.Resolution = "Auto-remediated"
            incident.UpdatedAt = time.Now()
        }
    }
    
    // Escalate if needed
    if incident.Status == "open" {
        id.escalation.Escalate(incident)
    }
}
```
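
`getAutoRemediation` and the remediation objects are used above without definitions. A plausible shape is sketched below; the `Alert.Name` field and the cache-related action are assumptions for illustration.

```go
// Remediation is one automated fix that can be attempted for a known alert.
type Remediation interface {
    Execute() error
}

// RemediationFunc adapts a plain function to the Remediation interface.
type RemediationFunc func() error

func (f RemediationFunc) Execute() error { return f() }

// getAutoRemediation returns an automated fix for alerts the service knows how
// to handle, or nil to signal that the incident must be escalated to a human.
func (id *IncidentDetector) getAutoRemediation(alert Alert) Remediation {
    switch alert.Name { // Alert.Name is an assumed field
    case "CVSSCacheMissRate":
        return RemediationFunc(func() error {
            // e.g. clear and re-warm the vector cache; illustrative only
            return nil
        })
    default:
        return nil
    }
}
```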

## Next Steps

After implementing monitoring, validate the alert thresholds and SLA targets above against real production traffic and tune them as usage patterns emerge.
