Monitoring and Alerting

This guide covers comprehensive monitoring, alerting, and observability strategies for running the CVSS Parser in production.

Overview

Effective monitoring provides:

  • Visibility into system health and performance
  • Proactive issue detection
  • Insights for performance optimization
  • Compliance and audit trails
  • Incident response capabilities

Metrics Collection

Application Metrics

go
type CVSSMetrics struct {
    // Processing metrics
    VectorsProcessed    prometheus.Counter
    ProcessingDuration  prometheus.Histogram
    ProcessingErrors    *prometheus.CounterVec
    
    // Cache metrics
    CacheHits          prometheus.Counter
    CacheMisses        prometheus.Counter
    CacheSize          prometheus.Gauge
    
    // Business metrics
    SeverityDistribution *prometheus.CounterVec
    VectorTypes         *prometheus.CounterVec
    
    // System metrics
    MemoryUsage        prometheus.Gauge
    GoroutineCount     prometheus.Gauge
    GCDuration         prometheus.Histogram
}

func NewCVSSMetrics() *CVSSMetrics {
    metrics := &CVSSMetrics{
        VectorsProcessed: prometheus.NewCounter(prometheus.CounterOpts{
            Name: "cvss_vectors_processed_total",
            Help: "Total number of CVSS vectors processed",
        }),
        ProcessingDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
            Name: "cvss_processing_duration_seconds",
            Help: "Time spent processing CVSS vectors",
            Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
        }),
        ProcessingErrors: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "cvss_processing_errors_total",
                Help: "Total number of processing errors by type",
            },
            []string{"error_type"},
        ),
        SeverityDistribution: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "cvss_severity_distribution_total",
                Help: "Distribution of CVSS severity levels",
            },
            []string{"severity"},
        ),
        // Gauges consumed by CollectSystemMetrics below; the names match
        // the alerting rules later in this guide. The cache and GC metrics
        // are initialized the same way (omitted here for brevity).
        MemoryUsage: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "cvss_memory_usage_bytes",
            Help: "Current heap memory usage in bytes",
        }),
        GoroutineCount: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "cvss_goroutine_count",
            Help: "Current number of goroutines",
        }),
    }
    
    // Register the metrics with the default registry
    prometheus.MustRegister(
        metrics.VectorsProcessed,
        metrics.ProcessingDuration,
        metrics.ProcessingErrors,
        metrics.SeverityDistribution,
        metrics.MemoryUsage,
        metrics.GoroutineCount,
    )
    
    return metrics
}

func (m *CVSSMetrics) RecordProcessing(duration time.Duration, severity string, err error) {
    m.VectorsProcessed.Inc()
    m.ProcessingDuration.Observe(duration.Seconds())
    m.SeverityDistribution.WithLabelValues(severity).Inc()
    
    if err != nil {
        errorType := categorizeError(err)
        m.ProcessingErrors.WithLabelValues(errorType).Inc()
    }
}
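
To surface these metrics, the service needs to record each request and expose a scrape endpoint. A minimal wiring sketch follows; processVector is a hypothetical helper (not part of the library) that returns the computed severity rating:

go
package main

import (
    "log"
    "net/http"
    "time"

    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    metrics := NewCVSSMetrics()
    metrics.CollectSystemMetrics()

    http.HandleFunc("/score", func(w http.ResponseWriter, r *http.Request) {
        start := time.Now()
        // processVector is a hypothetical helper returning the severity rating.
        severity, err := processVector(r.URL.Query().Get("vector"))
        metrics.RecordProcessing(time.Since(start), severity, err)
        if err != nil {
            http.Error(w, err.Error(), http.StatusBadRequest)
            return
        }
        w.Write([]byte(severity))
    })

    // Expose every registered metric for Prometheus to scrape.
    http.Handle("/metrics", promhttp.Handler())
    log.Fatal(http.ListenAndServe(":8080", nil))
}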

System Metrics Collection

go
func (m *CVSSMetrics) CollectSystemMetrics() {
    go func() {
        ticker := time.NewTicker(30 * time.Second)
        defer ticker.Stop()
        
        for range ticker.C {
            var memStats runtime.MemStats
            runtime.ReadMemStats(&memStats)
            
            m.MemoryUsage.Set(float64(memStats.Alloc))
            m.GoroutineCount.Set(float64(runtime.NumGoroutine()))
        }
    }()
}
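
The GCDuration histogram declared earlier is not populated by the loop above. One way to feed it (a sketch, assuming GCDuration was initialized and registered like the other metrics) is to drain new entries from MemStats' circular pause buffer on each tick:

go
// lastNumGC remembers how many GC cycles had completed at the previous
// tick; kept package-level only for brevity in this sketch.
var lastNumGC uint32

// recordGCPauses observes every GC pause completed since the last call.
// runtime.MemStats.PauseNs is a circular buffer of the most recent 256
// pauses; the pause for GC number i (1-based) lives at PauseNs[(i+255)%256].
func (m *CVSSMetrics) recordGCPauses(ms *runtime.MemStats) {
    start := lastNumGC + 1
    if ms.NumGC > lastNumGC+256 {
        start = ms.NumGC - 255 // older pauses were already overwritten
    }
    for i := start; i <= ms.NumGC; i++ {
        m.GCDuration.Observe(float64(ms.PauseNs[(i+255)%256]) / 1e9) // seconds
    }
    lastNumGC = ms.NumGC
}

Calling m.recordGCPauses(&memStats) inside the ticker loop above keeps the histogram current.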

Logging

Structured Logging

go
type Logger struct {
    logger *logrus.Logger
}

func NewLogger(level string) *Logger {
    logger := logrus.New()
    logger.SetFormatter(&logrus.JSONFormatter{})
    
    logLevel, err := logrus.ParseLevel(level)
    if err != nil {
        logLevel = logrus.InfoLevel
    }
    logger.SetLevel(logLevel)
    
    return &Logger{logger: logger}
}

func (l *Logger) LogVectorProcessing(ctx context.Context, vector string, score float64, duration time.Duration) {
    l.logger.WithFields(logrus.Fields{
        "trace_id":   getTraceID(ctx),
        "vector":     vector,
        "score":      score,
        "duration":   duration.Milliseconds(),
        "severity":   getSeverityFromScore(score),
        "timestamp":  time.Now().UTC(),
        "component":  "cvss_processor",
    }).Info("vector processed successfully")
}

func (l *Logger) LogError(ctx context.Context, err error, vector string) {
    l.logger.WithFields(logrus.Fields{
        "trace_id":   getTraceID(ctx),
        "error":      err.Error(),
        "vector":     vector,
        "error_type": categorizeError(err),
        "timestamp":  time.Now().UTC(),
        "component":  "cvss_processor",
    }).Error("vector processing failed")
}
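
The getTraceID helper used above is not shown in this guide. A minimal sketch, assuming the OpenTelemetry integration described later in this document, reads the ID from the active span:

go
import (
    "context"

    "go.opentelemetry.io/otel/trace"
)

// getTraceID extracts the active OpenTelemetry trace ID from the context,
// or returns an empty string when no valid span is present.
func getTraceID(ctx context.Context) string {
    sc := trace.SpanFromContext(ctx).SpanContext()
    if !sc.IsValid() {
        return ""
    }
    return sc.TraceID().String()
}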

Log Aggregation

yaml
# Fluentd configuration
<source>
  @type tail
  path /var/log/cvss-service/*.log
  pos_file /var/log/fluentd/cvss-service.log.pos
  tag cvss.service
  format json
  time_key timestamp
  time_format %Y-%m-%dT%H:%M:%S.%LZ
</source>

<filter cvss.service>
  @type record_transformer
  <record>
    service cvss-parser
    environment ${ENV}
    version ${VERSION}
  </record>
</filter>

<match cvss.service>
  @type elasticsearch
  host elasticsearch.monitoring.svc.cluster.local
  port 9200
  index_name cvss-logs
  type_name _doc
</match>

Alerting Rules

Prometheus Alerting Rules

yaml
groups:
- name: cvss-service
  rules:
  - alert: CVSSHighErrorRate
    expr: rate(cvss_processing_errors_total[5m]) > 0.1
    for: 2m
    labels:
      severity: warning
      service: cvss-parser
    annotations:
      summary: "High error rate in CVSS processing"
      description: "Error rate is {{ $value }} errors per second"

  - alert: CVSSHighLatency
    expr: histogram_quantile(0.95, rate(cvss_processing_duration_seconds_bucket[5m])) > 1.0
    for: 5m
    labels:
      severity: warning
      service: cvss-parser
    annotations:
      summary: "High latency in CVSS processing"
      description: "95th percentile latency is {{ $value }} seconds"

  - alert: CVSSServiceDown
    expr: up{job="cvss-service"} == 0
    for: 1m
    labels:
      severity: critical
      service: cvss-parser
    annotations:
      summary: "CVSS service is down"
      description: "The CVSS service has been down for more than 1 minute"

  - alert: CVSSHighMemoryUsage
    expr: cvss_memory_usage_bytes > 1073741824  # 1GB
    for: 5m
    labels:
      severity: warning
      service: cvss-parser
    annotations:
      summary: "High memory usage in the CVSS service"
      description: "Memory usage is {{ $value | humanize1024 }}B"

  - alert: CVSSHighCacheMissRate
    expr: rate(cvss_cache_misses_total[5m]) / (rate(cvss_cache_hits_total[5m]) + rate(cvss_cache_misses_total[5m])) > 0.8
    for: 10m
    labels:
      severity: warning
      service: cvss-parser
    annotations:
      summary: "High cache miss rate"
      description: "Cache miss rate is {{ $value | humanizePercentage }}"

Alertmanager Configuration

yaml
global:
  smtp_smarthost: 'smtp.company.com:587'
  smtp_from: 'alerts@company.com'

route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
  - match:
      severity: critical
    receiver: 'critical-alerts'
  - match:
      service: cvss-parser
    receiver: 'cvss-team'

receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://slack-webhook/alerts'

- name: 'critical-alerts'
  email_configs:
  - to: 'oncall@company.com'
    subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
    body: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      {{ end }}
  pagerduty_configs:
  - service_key: 'your-pagerduty-key'

- name: 'cvss-team'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#cvss-alerts'
    title: 'CVSS Service Alert'
    text: '{{ .CommonAnnotations.summary }}'
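
The 'web.hook' receiver above posts Alertmanager's JSON payload to an HTTP endpoint. A minimal receiver sketch (forwarding to Slack is left out) could look like this:

go
import (
    "encoding/json"
    "log"
    "net/http"
)

// alertmanagerPayload models the subset of Alertmanager's webhook
// payload this sketch cares about.
type alertmanagerPayload struct {
    Status string `json:"status"` // "firing" or "resolved"
    Alerts []struct {
        Labels      map[string]string `json:"labels"`
        Annotations map[string]string `json:"annotations"`
    } `json:"alerts"`
}

// alertsHandler logs each incoming alert; mount it with
// http.HandleFunc("/alerts", alertsHandler).
func alertsHandler(w http.ResponseWriter, r *http.Request) {
    var payload alertmanagerPayload
    if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
        http.Error(w, "bad payload", http.StatusBadRequest)
        return
    }
    for _, a := range payload.Alerts {
        log.Printf("[%s] %s: %s", payload.Status,
            a.Labels["alertname"], a.Annotations["summary"])
    }
    w.WriteHeader(http.StatusOK)
}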

Dashboards

Grafana Dashboard

json
{
  "dashboard": {
    "title": "CVSS Parser 监控",
    "panels": [
      {
        "title": "请求速率",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(cvss_vectors_processed_total[5m])",
            "legendFormat": "请求/秒"
          }
        ]
      },
      {
        "title": "响应时间",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(cvss_processing_duration_seconds_bucket[5m]))",
            "legendFormat": "50% 分位数"
          },
          {
            "expr": "histogram_quantile(0.95, rate(cvss_processing_duration_seconds_bucket[5m]))",
            "legendFormat": "95% 分位数"
          }
        ]
      },
      {
        "title": "错误率",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(cvss_processing_errors_total[5m])",
            "legendFormat": "错误/秒"
          }
        ]
      },
      {
        "title": "严重性分布",
        "type": "pie",
        "targets": [
          {
            "expr": "cvss_severity_distribution_total",
            "legendFormat": "{{ severity }}"
          }
        ]
      }
    ]
  }
}

Health Checks

Comprehensive Health Monitoring

go
type HealthMonitor struct {
    service    *CVSSService
    db         *sql.DB
    redis      *redis.Client
    lastCheck  time.Time
    status     HealthStatus
    mutex      sync.RWMutex
}

type HealthStatus struct {
    Overall    string                 `json:"overall"`
    Components map[string]ComponentHealth `json:"components"`
    Timestamp  time.Time              `json:"timestamp"`
    Uptime     time.Duration          `json:"uptime"`
}

type ComponentHealth struct {
    Status      string        `json:"status"`
    ResponseTime time.Duration `json:"response_time"`
    Error       string        `json:"error,omitempty"`
}

func (hm *HealthMonitor) CheckHealth() HealthStatus {
    hm.mutex.Lock()
    defer hm.mutex.Unlock()
    
    status := HealthStatus{
        Components: make(map[string]ComponentHealth),
        Timestamp:  time.Now(),
        Uptime:     time.Since(startTime), // startTime is recorded at service startup
    }
    
    // Check the database
    start := time.Now()
    if err := hm.db.Ping(); err != nil {
        status.Components["database"] = ComponentHealth{
            Status:       "unhealthy",
            Error:        err.Error(),
            ResponseTime: time.Since(start),
        }
    } else {
        status.Components["database"] = ComponentHealth{
            Status:       "healthy",
            ResponseTime: time.Since(start),
        }
    }
    
    // Check Redis
    start = time.Now()
    if err := hm.redis.Ping().Err(); err != nil {
        status.Components["redis"] = ComponentHealth{
            Status:       "unhealthy",
            Error:        err.Error(),
            ResponseTime: time.Since(start),
        }
    } else {
        status.Components["redis"] = ComponentHealth{
            Status:       "healthy",
            ResponseTime: time.Since(start),
        }
    }
    
    // Check CVSS processing with a known-good vector
    start = time.Now()
    testVector := "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:L/I:L/A:L"
    if _, err := hm.service.ProcessVector(context.Background(), testVector); err != nil {
        status.Components["cvss_processing"] = ComponentHealth{
            Status:       "unhealthy",
            Error:        err.Error(),
            ResponseTime: time.Since(start),
        }
    } else {
        status.Components["cvss_processing"] = ComponentHealth{
            Status:       "healthy",
            ResponseTime: time.Since(start),
        }
    }
    
    // Determine the overall status
    status.Overall = "healthy"
    for _, component := range status.Components {
        if component.Status != "healthy" {
            status.Overall = "unhealthy"
            break
        }
    }
    
    hm.status = status
    hm.lastCheck = time.Now()
    
    return status
}
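
A sketch of serving this status over HTTP, so load balancers and Kubernetes probes can act on it (the route name is an assumption):

go
import (
    "encoding/json"
    "net/http"
)

// healthHandler exposes CheckHealth as a JSON endpoint; an unhealthy
// overall status maps to 503 so probes and load balancers can react.
func (hm *HealthMonitor) healthHandler(w http.ResponseWriter, r *http.Request) {
    status := hm.CheckHealth()
    
    w.Header().Set("Content-Type", "application/json")
    if status.Overall != "healthy" {
        w.WriteHeader(http.StatusServiceUnavailable)
    }
    json.NewEncoder(w).Encode(status)
}

// Registration, e.g. in main():
//   http.HandleFunc("/health", hm.healthHandler)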

Distributed Tracing

OpenTelemetry Integration

go
func initTracing() {
    exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint("http://jaeger:14268/api/traces")))
    if err != nil {
        log.Fatal(err)
    }
    
    tp := trace.NewTracerProvider(
        trace.WithBatcher(exporter),
        trace.WithResource(resource.NewWithAttributes(
            semconv.SchemaURL,
            semconv.ServiceNameKey.String("cvss-parser"),
            semconv.ServiceVersionKey.String(version.Get()),
        )),
    )
    
    otel.SetTracerProvider(tp)
    otel.SetTextMapPropagator(propagation.TraceContext{})
}

func (s *CVSSService) ProcessVectorWithTracing(ctx context.Context, vectorStr string) (*VectorResult, error) {
    tracer := otel.Tracer("cvss-parser")
    ctx, span := tracer.Start(ctx, "process_vector")
    defer span.End()
    
    span.SetAttributes(
        attribute.String("cvss.vector", vectorStr),
        attribute.String("cvss.version", "3.1"),
    )
    
    // Parse the vector. Discard the child context so the calculate_score
    // span below becomes a sibling of parse_vector, not its child.
    _, parseSpan := tracer.Start(ctx, "parse_vector")
    vector, err := s.parser.Parse(vectorStr)
    parseSpan.End()
    
    if err != nil {
        span.RecordError(err)
        span.SetStatus(codes.Error, "failed to parse vector")
        return nil, err
    }
    
    // Calculate the score
    _, calcSpan := tracer.Start(ctx, "calculate_score")
    score, err := s.calculator.Calculate(vector)
    calcSpan.End()
    
    if err != nil {
        span.RecordError(err)
        span.SetStatus(codes.Error, "failed to calculate score")
        return nil, err
    }
    
    span.SetAttributes(
        attribute.Float64("cvss.score", score),
        attribute.String("cvss.severity", s.calculator.GetSeverityRating(score)),
    )
    
    return &VectorResult{
        Vector:   vectorStr,
        Score:    score,
        Severity: s.calculator.GetSeverityRating(score),
    }, nil
}
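
To propagate trace context across HTTP boundaries, a common pattern is to wrap handlers with otelhttp from go.opentelemetry.io/contrib. A sketch (the endpoint names are assumptions):

go
import (
    "fmt"
    "net/http"

    "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)

func newHTTPServer(svc *CVSSService) *http.Server {
    mux := http.NewServeMux()
    mux.HandleFunc("/score", func(w http.ResponseWriter, r *http.Request) {
        // r.Context() carries the trace context extracted by otelhttp, so
        // spans started in ProcessVectorWithTracing join the caller's trace.
        result, err := svc.ProcessVectorWithTracing(r.Context(), r.URL.Query().Get("vector"))
        if err != nil {
            http.Error(w, err.Error(), http.StatusBadRequest)
            return
        }
        fmt.Fprintf(w, "%.1f (%s)", result.Score, result.Severity)
    })

    return &http.Server{
        Addr:    ":8080",
        Handler: otelhttp.NewHandler(mux, "cvss-http"),
    }
}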

Performance Monitoring

SLA Monitoring

go
type SLAMonitor struct {
    targets map[string]SLATarget
    metrics *CVSSMetrics
}

type SLATarget struct {
    Name        string
    Threshold   float64
    Window      time.Duration
    Description string
}

func NewSLAMonitor(metrics *CVSSMetrics) *SLAMonitor {
    return &SLAMonitor{
        metrics: metrics,
        targets: map[string]SLATarget{
            "availability": {
                Name:        "Service availability",
                Threshold:   99.9,
                Window:      24 * time.Hour,
                Description: "The service should be available 99.9% of the time",
            },
            "latency_p95": {
                Name:        "95th percentile latency",
                Threshold:   500, // milliseconds
                Window:      5 * time.Minute,
                Description: "95% of requests should complete within 500ms",
            },
            "error_rate": {
                Name:        "Error rate",
                Threshold:   1.0, // percent
                Window:      5 * time.Minute,
                Description: "The error rate should stay below 1%",
            },
        },
    }
}

func (sla *SLAMonitor) CheckSLA(target string) (bool, float64, error) {
    slaTarget, exists := sla.targets[target]
    if !exists {
        return false, 0, fmt.Errorf("unknown SLA target: %s", target)
    }
    
    switch target {
    case "availability":
        return sla.checkAvailability(slaTarget)
    case "latency_p95":
        return sla.checkLatency(slaTarget)
    case "error_rate":
        return sla.checkErrorRate(slaTarget)
    default:
        return false, 0, fmt.Errorf("unsupported SLA target: %s", target)
    }
}
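
The check* helpers are not shown here. One way to implement checkErrorRate is to query Prometheus with the client_golang API; a sketch, where the Prometheus address is an assumption:

go
import (
    "context"
    "fmt"
    "time"

    "github.com/prometheus/client_golang/api"
    promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
    "github.com/prometheus/common/model"
)

// checkErrorRate evaluates the error-rate SLA by asking Prometheus for
// the error percentage over the target window.
func (sla *SLAMonitor) checkErrorRate(target SLATarget) (bool, float64, error) {
    client, err := api.NewClient(api.Config{Address: "http://prometheus:9090"})
    if err != nil {
        return false, 0, err
    }
    
    query := fmt.Sprintf(
        "100 * rate(cvss_processing_errors_total[%s]) / rate(cvss_vectors_processed_total[%s])",
        target.Window, target.Window)
    
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()
    
    result, _, err := promv1.NewAPI(client).Query(ctx, query, time.Now())
    if err != nil {
        return false, 0, err
    }
    vec, ok := result.(model.Vector)
    if !ok || len(vec) == 0 {
        return false, 0, fmt.Errorf("no data for query %q", query)
    }
    rate := float64(vec[0].Value)
    return rate <= target.Threshold, rate, nil
}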

Incident Response

Automated Incident Detection

go
type IncidentDetector struct {
    alertManager *AlertManager
    escalation   *EscalationPolicy
    logger       *Logger
}

func (id *IncidentDetector) HandleAlert(alert Alert) {
    incident := &Incident{
        ID:        generateIncidentID(),
        Alert:     alert,
        Severity:  alert.Severity,
        Status:    "open",
        CreatedAt: time.Now(),
        UpdatedAt: time.Now(),
    }
    
    // Record the incident
    id.logger.LogIncident(incident)
    
    // Attempt automatic remediation for known issues
    if remediation := id.getAutoRemediation(alert); remediation != nil {
        if err := remediation.Execute(); err != nil {
            id.logger.LogError(context.Background(), err, "auto-remediation failed")
        } else {
            incident.Status = "auto-resolved"
            incident.Resolution = "automatic remediation"
            incident.UpdatedAt = time.Now()
        }
    }
    
    // Escalate if the incident is still open
    if incident.Status == "open" {
        id.escalation.Escalate(incident)
    }
}
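
getAutoRemediation maps alerts to remediation actions. The types below sketch what that contract could look like; all of the names (Remediation, VectorCache, globalCache, alert.Name) are assumptions for illustration, not part of the library:

go
// Remediation is a hypothetical contract for automatic fixes.
type Remediation interface {
    Execute() error
}

// flushCacheRemediation is an example action: clear the vector cache
// when the high-memory alert fires.
type flushCacheRemediation struct {
    cache *VectorCache // hypothetical cache type
}

func (r *flushCacheRemediation) Execute() error {
    r.cache.Flush()
    return nil
}

// getAutoRemediation returns an action for alerts we know how to handle
// automatically, or nil to fall through to escalation.
func (id *IncidentDetector) getAutoRemediation(alert Alert) Remediation {
    switch alert.Name { // alert.Name is an assumed field
    case "CVSSHighMemoryUsage":
        return &flushCacheRemediation{cache: globalCache} // hypothetical
    default:
        return nil
    }
}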

Next Steps

With monitoring in place, consider:

Related Documentation
