projectrules.ai

Monitoring Guidelines

Go, Logging, Metrics, Tracing, Observability

Description

Standards for logging, metrics, tracing, and overall system observability in Go

Globs

**/*.go
---
description: Standards for logging, metrics, tracing, and overall system observability in Go
globs: **/*.go
---

# Monitoring Guidelines

## Structured Logging

1. **Logger Configuration**
   ```go
   // logger/config.go
   
   // NewLogger builds a zap logger for the given environment name.
   //
   // When env is exactly "production" it starts from zap's production config,
   // renames the time key to "timestamp" and encodes times with
   // zapcore.ISO8601TimeEncoder. Any other value uses zap's development config
   // unchanged. The logger and error produced by Build are returned as-is.
   func NewLogger(env string) (*zap.Logger, error) {
       if env != "production" {
           return zap.NewDevelopmentConfig().Build()
       }

       cfg := zap.NewProductionConfig()
       cfg.EncoderConfig.TimeKey = "timestamp"
       cfg.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
       return cfg.Build()
   }
   ```

2. **Contextual Logging**
   ```go
   // handler/user.go
   
   // CreateUser handles a create-user HTTP request. It derives a request-scoped
   // logger that carries the handler name and the request ID read from the
   // request context, so every entry logged through it shares those fields.
   func (h *Handler) CreateUser(w http.ResponseWriter, r *http.Request) {
       ctx := r.Context()
       // With returns a child logger; h.logger itself is left unchanged.
       logger := h.logger.With(
           zap.String("handler", "CreateUser"),
           zap.String("request_id", middleware.GetRequestID(ctx)),
       )
       
       logger.Info("processing create user request")
       
       // ... handler implementation
       
       // NOTE(review): err and input are produced by the elided implementation
       // above; this snippet does not compile on its own.
       if err != nil {
           // NOTE(review): the e-mail address is logged verbatim; confirm this
           // is acceptable under the project's PII policy.
           logger.Error("failed to create user",
               zap.Error(err),
               zap.String("user_email", input.Email),
           )
           // ... error handling
       }
   }
   ```

## Metrics Collection

1. **Prometheus Metrics**
   ```go
   // metrics/metrics.go
   
   // Package-level Prometheus collectors created through promauto.
   var (
       // RequestDuration is the http_request_duration_seconds histogram,
       // labelled by handler, method and status. Its explicit buckets range
       // from 0.005 to 10 (seconds, per the metric name).
       RequestDuration = promauto.NewHistogramVec(
           prometheus.HistogramOpts{
               Name: "http_request_duration_seconds",
               Help: "Duration of HTTP requests",
               Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
           },
           []string{"handler", "method", "status"},
       )
       
       // ActiveRequests is the http_requests_active gauge, labelled by handler.
       ActiveRequests = promauto.NewGaugeVec(
           prometheus.GaugeOpts{
               Name: "http_requests_active",
               Help: "Number of active HTTP requests",
           },
           []string{"handler"},
       )
       
       // DatabaseErrors is the database_errors_total counter, labelled by
       // operation.
       DatabaseErrors = promauto.NewCounterVec(
           prometheus.CounterOpts{
               Name: "database_errors_total",
               Help: "Total number of database errors",
           },
           []string{"operation"},
       )
   )
   ```

2. **Metrics Middleware**
   ```go
   // middleware/metrics.go
   
   // MetricsMiddleware wraps next and records two metrics for every request:
   // the ActiveRequests gauge is incremented on entry and decremented when the
   // wrapped handler returns, and RequestDuration observes the elapsed time in
   // seconds, labelled with the URL path, HTTP method and response status code.
   //
   // NOTE(review): the label value is the raw r.URL.Path. If routes embed IDs
   // this produces one time series per distinct path; consider labelling with
   // the matched route pattern instead.
   func MetricsMiddleware(next http.Handler) http.Handler {
       instrumented := func(w http.ResponseWriter, r *http.Request) {
           begin := time.Now()
           route := r.URL.Path

           // Resolve the gauge child once and use it for both Inc and Dec.
           inFlight := ActiveRequests.WithLabelValues(route)
           inFlight.Inc()
           defer inFlight.Dec()

           // The wrapper exposes the status code written by next via Status.
           rec := middleware.NewWrapResponseWriter(w, r.ProtoMajor)
           next.ServeHTTP(rec, r)

           elapsed := time.Since(begin).Seconds()
           statusLabel := strconv.Itoa(rec.Status())
           RequestDuration.WithLabelValues(route, r.Method, statusLabel).Observe(elapsed)
       }

       return http.HandlerFunc(instrumented)
   }
   ```

## Distributed Tracing

1. **Trace Configuration**
   ```go
   // tracer/config.go
   
   // InitTracer creates a tracer provider that exports to Jaeger and installs
   // it as the global OpenTelemetry tracer provider.
   //
   // The exporter targets the collector endpoint
   // http://jaeger:14268/api/traces and is attached through a batcher. The
   // provider's resource carries serviceName as its service name attribute.
   // If the exporter cannot be created, a wrapped error and a nil provider are
   // returned and the global provider is not touched.
   func InitTracer(serviceName string) (*sdktrace.TracerProvider, error) {
       endpoint := jaeger.WithEndpoint("http://jaeger:14268/api/traces")
       exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(endpoint))
       if err != nil {
           return nil, fmt.Errorf("failed to create jaeger exporter: %w", err)
       }

       res := resource.NewWithAttributes(
           semconv.SchemaURL,
           semconv.ServiceNameKey.String(serviceName),
       )
       provider := sdktrace.NewTracerProvider(
           sdktrace.WithBatcher(exporter),
           sdktrace.WithResource(res),
       )

       otel.SetTracerProvider(provider)
       return provider, nil
   }
   ```

2. **Trace Implementation**
   ```go
   // handler/order.go
   
   // ProcessOrder handles an order request inside a span named "ProcessOrder"
   // started from otel.Tracer("order-service"). The span-carrying context is
   // passed on to the order service, and a processing failure is recorded on
   // the span and sets its status to codes.Error.
   func (h *Handler) ProcessOrder(w http.ResponseWriter, r *http.Request) {
       ctx := r.Context()
       
       tracer := otel.Tracer("order-service")
       // ctx is replaced by the context returned from Start so that calls made
       // with it below receive the span-carrying context.
       ctx, span := tracer.Start(ctx, "ProcessOrder")
       defer span.End()
       
       // Add relevant attributes
       // NOTE(review): orderID, amount and order are not defined in this
       // snippet; presumably they are parsed from the request — confirm.
       span.SetAttributes(
           attribute.String("order.id", orderID),
           attribute.Float64("order.amount", amount),
       )
       
       // Process order with context
       err := h.orderService.Process(ctx, order)
       if err != nil {
           // Attach the error to the span and mark the span as failed.
           span.RecordError(err)
           span.SetStatus(codes.Error, err.Error())
           // ... error handling
       }
   }
   ```

## Health Checks

1. **Health Check Handler**
   ```go
   // health/handler.go
   
   // HealthChecker aggregates named health checks and exposes them over HTTP.
   //
   // The zero value is ready to use. HealthChecker performs no locking, so all
   // checks should be registered before the handler starts serving requests.
   type HealthChecker struct {
       checks map[string]HealthCheck
   }

   // HealthCheck probes a single dependency. It returns nil when the dependency
   // is healthy and a non-nil error describing the problem otherwise.
   type HealthCheck func() error

   // AddCheck registers check under name, replacing any check previously
   // registered under the same name.
   func (h *HealthChecker) AddCheck(name string, check HealthCheck) {
       // Writing to a nil map panics; allocate on first use so a HealthChecker
       // declared as a plain struct value works without a constructor.
       if h.checks == nil {
           h.checks = make(map[string]HealthCheck)
       }
       h.checks[name] = check
   }

   // Handler returns an HTTP handler that runs every registered check on each
   // request and writes a JSON object mapping check names to "healthy" or
   // "unhealthy: <error>".
   //
   // The response status is 200 when every check passes (including when no
   // checks are registered) and 503 as soon as any check fails.
   func (h *HealthChecker) Handler() http.HandlerFunc {
       return func(w http.ResponseWriter, r *http.Request) {
           status := http.StatusOK
           result := make(map[string]string, len(h.checks))

           for name, check := range h.checks {
               if err := check(); err != nil {
                   status = http.StatusServiceUnavailable
                   result[name] = fmt.Sprintf("unhealthy: %v", err)
                   continue
               }
               result[name] = "healthy"
           }

           w.Header().Set("Content-Type", "application/json")
           w.WriteHeader(status)

           // The status line has already been sent, so an encode failure
           // (typically a disconnected client) cannot be reported back to the
           // caller; it is deliberately ignored.
           _ = json.NewEncoder(w).Encode(result)
       }
   }
   ```

## Resource Monitoring

1. **System Metrics**
   ```go
   // metrics/system.go
   
   // Process-level gauges created through promauto; CollectSystemMetrics
   // updates both of them.
   var (
       // MemoryUsage is the process_memory_bytes gauge.
       MemoryUsage = promauto.NewGauge(prometheus.GaugeOpts{
           Name: "process_memory_bytes",
           Help: "Process memory usage in bytes",
       })
       
       // GoroutineCount is the goroutines_total gauge.
       // NOTE(review): the _total suffix is conventionally reserved for
       // counters; consider a name without it for this gauge.
       GoroutineCount = promauto.NewGauge(prometheus.GaugeOpts{
           Name: "goroutines_total",
           Help: "Total number of goroutines",
       })
   )
   
   // CollectSystemMetrics samples runtime statistics every 15 seconds and
   // publishes them through the MemoryUsage and GoroutineCount gauges.
   //
   // The call blocks until ctx is cancelled, so it is normally started in its
   // own goroutine. MemoryUsage receives MemStats.Alloc and GoroutineCount
   // receives runtime.NumGoroutine.
   func CollectSystemMetrics(ctx context.Context) {
       const interval = 15 * time.Second

       tick := time.NewTicker(interval)
       defer tick.Stop()

       done := ctx.Done()
       for {
           select {
           case <-done:
               return
           case <-tick.C:
               var stats runtime.MemStats
               runtime.ReadMemStats(&stats)
               goroutines := runtime.NumGoroutine()

               MemoryUsage.Set(float64(stats.Alloc))
               GoroutineCount.Set(float64(goroutines))
           }
       }
   }
   ```

## Alert Configuration

1. **Prometheus Alert Rules**
   ```yaml
   # alerts/rules.yaml
   groups:
   - name: app
     rules:
     # Fires when a series serves 5xx responses at more than 0.1 requests per
     # second for five minutes. The rate is read from the _count series of the
     # http_request_duration_seconds histogram, which carries the status label;
     # the metrics in this document do not define an http_requests_total counter.
     - alert: HighErrorRate
       expr: rate(http_request_duration_seconds_count{status=~"5.."}[5m]) > 0.1
       for: 5m
       labels:
         severity: critical
       annotations:
         summary: High HTTP error rate
         description: Error rate is {{ $value }} per second
   
     # Warns when the 95th-percentile request duration, estimated from the
     # http_request_duration_seconds histogram buckets over a 5-minute window,
     # stays above 1 second for five minutes.
     # NOTE(review): without `sum by (le)` the quantile is evaluated separately
     # for each label combination — confirm that is intended.
     - alert: HighLatency
       expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
       for: 5m
       labels:
         severity: warning
       annotations:
         summary: High latency detected
         description: 95th percentile latency is {{ $value }} seconds
   ```

## Monitoring Setup

1. **Monitoring Stack**
   ```yaml
   # docker-compose.monitoring.yml
   # Local monitoring stack: Prometheus, Grafana and Jaeger.
   # NOTE(review): none of the images is pinned to a tag, so the default
   # (latest) tag is pulled; consider pinning versions for reproducibility.
   version: '3.8'
   
   services:
     prometheus:
       image: prom/prometheus
       volumes:
         # Mounts the local prometheus.yml as the container's configuration.
         - ./prometheus.yml:/etc/prometheus/prometheus.yml
       ports:
         - "9090:9090"
   
     grafana:
       image: grafana/grafana
       ports:
         - "3000:3000"
       # Started after the prometheus service.
       depends_on:
         - prometheus
   
     jaeger:
       image: jaegertracing/all-in-one
       ports:
         # presumably the Jaeger web UI — confirm
         - "16686:16686"
         # Collector port targeted by the tracer configuration
         # (http://jaeger:14268/api/traces).
         - "14268:14268"

## Best Practices

1. **Log Levels**
   - DEBUG: Detailed information for debugging
   - INFO: General operational events
   - WARN: Warning messages for potentially harmful situations
   - ERROR: Error events that might still allow the application to continue running
   - FATAL: Very severe error events that will lead the application to abort

2. **Metric Naming**
   - Use lowercase with underscores
   - Include units in the name (e.g., `_seconds`, `_bytes`)
   - Use prefixes for grouping (e.g., `http_`, `db_`)
   - Include relevant labels, but keep label cardinality low (avoid unbounded values such as user IDs or raw URL paths)
Monitoring Guidelines