System failures are inevitable. The question isn’t whether your system will fail, but how gracefully it handles those failures. Building resilient systems requires embracing failure as a first-class concern in your architecture.
The Reality of Distributed Systems
Modern applications are complex distributed systems with many potential failure points:
- Network partitions between services
- Hardware failures causing server downtime
- Software bugs leading to crashes
- Traffic spikes overwhelming system capacity
- Dependency failures cascading through the system
Each of these can bring down an entire application if not properly handled.
Circuit Breaker Pattern
The circuit breaker prevents cascading failures by failing fast when dependencies are unhealthy:
```go
type CircuitState int

const (
    CircuitClosed CircuitState = iota
    CircuitHalfOpen
    CircuitOpen
)

var ErrCircuitOpen = errors.New("circuit breaker is open")

type CircuitBreaker struct {
    name         string
    state        CircuitState
    failures     int64
    successes    int64
    lastFailTime time.Time
    settings     CircuitSettings
    mutex        sync.RWMutex
}

type CircuitSettings struct {
    MaxFailures      int64
    ResetTimeout     time.Duration
    SuccessThreshold int64
}

func (cb *CircuitBreaker) Execute(operation func() error) error {
    cb.mutex.RLock()
    state := cb.state
    lastFailTime := cb.lastFailTime
    cb.mutex.RUnlock()

    switch state {
    case CircuitClosed:
        return cb.executeInClosedState(operation)
    case CircuitOpen:
        // After the reset timeout, allow a trial request through (half-open).
        if time.Since(lastFailTime) > cb.settings.ResetTimeout {
            return cb.executeInHalfOpenState(operation)
        }
        return ErrCircuitOpen
    case CircuitHalfOpen:
        return cb.executeInHalfOpenState(operation)
    }

    return nil
}

func (cb *CircuitBreaker) executeInClosedState(operation func() error) error {
    err := operation()

    cb.mutex.Lock()
    defer cb.mutex.Unlock()

    if err != nil {
        cb.failures++
        cb.lastFailTime = time.Now()

        if cb.failures >= cb.settings.MaxFailures {
            cb.state = CircuitOpen
            log.Warn("Circuit breaker opened", "name", cb.name)
        }
        return err
    }

    // A success in the closed state resets the failure count.
    cb.failures = 0
    return nil
}
```
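The half-open path is referenced but not shown above. A minimal sketch, assuming the same fields and the same structured logger, where SuccessThreshold consecutive successes close the circuit and any failure re-opens it:

```go
// Sketch only: half-open state lets trial requests through and decides
// whether to close the circuit again or snap back to open.
func (cb *CircuitBreaker) executeInHalfOpenState(operation func() error) error {
    cb.mutex.Lock()
    cb.state = CircuitHalfOpen
    cb.mutex.Unlock()

    err := operation()

    cb.mutex.Lock()
    defer cb.mutex.Unlock()

    if err != nil {
        // A single failure in half-open sends the breaker straight back to open.
        cb.state = CircuitOpen
        cb.successes = 0
        cb.lastFailTime = time.Now()
        return err
    }

    cb.successes++
    if cb.successes >= cb.settings.SuccessThreshold {
        // Enough trial successes: close the circuit and reset the counters.
        cb.state = CircuitClosed
        cb.failures = 0
        cb.successes = 0
        log.Info("Circuit breaker closed", "name", cb.name)
    }
    return nil
}
```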
Retry Strategies
Implement intelligent retry mechanisms with exponential backoff:
```go
type RetryConfig struct {
    MaxRetries    int
    BaseDelay     time.Duration
    MaxDelay      time.Duration
    BackoffFactor float64
    Jitter        bool
}

func RetryWithBackoff(operation func() error, config RetryConfig) error {
    var lastErr error

    for attempt := 0; attempt <= config.MaxRetries; attempt++ {
        if attempt > 0 {
            delay := calculateDelay(attempt, config)
            if config.Jitter {
                delay = addJitter(delay)
            }
            time.Sleep(delay)
        }

        lastErr = operation()
        if lastErr == nil {
            return nil
        }

        // Don't retry for certain error types
        if !isRetryableError(lastErr) {
            return lastErr
        }

        log.Debug("Retrying operation",
            "attempt", attempt+1,
            "error", lastErr)
    }

    return fmt.Errorf("operation failed after %d attempts: %w",
        config.MaxRetries+1, lastErr)
}

func calculateDelay(attempt int, config RetryConfig) time.Duration {
    delay := float64(config.BaseDelay) *
        math.Pow(config.BackoffFactor, float64(attempt-1))

    if delay > float64(config.MaxDelay) {
        delay = float64(config.MaxDelay)
    }

    return time.Duration(delay)
}

func addJitter(delay time.Duration) time.Duration {
    jitter := time.Duration(rand.Float64() * float64(delay) * 0.1)
    return delay + jitter
}
```
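What counts as retryable is application-specific; the snippet above assumes an isRetryableError helper. A hedged sketch that treats timeouts as retryable, plus an illustrative caller (fetchWithRetry and its parameters are made up for the example):

```go
// Sketch: retry on network timeouts and context deadline errors, give up on
// everything else (validation failures, client errors, and so on).
func isRetryableError(err error) bool {
    var netErr net.Error
    if errors.As(err, &netErr) && netErr.Timeout() {
        return true
    }
    return errors.Is(err, context.DeadlineExceeded)
}

func fetchWithRetry(url string) error {
    cfg := RetryConfig{
        MaxRetries:    3,
        BaseDelay:     100 * time.Millisecond,
        MaxDelay:      2 * time.Second,
        BackoffFactor: 2.0,
        Jitter:        true,
    }
    return RetryWithBackoff(func() error {
        resp, err := http.Get(url)
        if err != nil {
            return err
        }
        defer resp.Body.Close()
        if resp.StatusCode >= 500 {
            return fmt.Errorf("server error: %s", resp.Status)
        }
        return nil
    }, cfg)
}
```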
Bulkhead Pattern
Isolate critical resources to prevent total system failure:
```go
// Worker pool isolation: each dependency gets its own bounded pool so one
// slow dependency cannot starve the others.
type BulkheadExecutor struct {
    pools map[string]*ThreadPool
    mutex sync.RWMutex
}

type ThreadPool struct {
    name    string
    workers chan struct{}
    queue   chan Task
    metrics *PoolMetrics
}

func NewBulkheadExecutor() *BulkheadExecutor {
    return &BulkheadExecutor{
        pools: make(map[string]*ThreadPool),
    }
}

func (be *BulkheadExecutor) CreatePool(name string, size int, queueSize int) {
    pool := &ThreadPool{
        name:    name,
        workers: make(chan struct{}, size),
        queue:   make(chan Task, queueSize),
        metrics: NewPoolMetrics(),
    }

    // Fill the worker semaphore; it bounds in-flight tasks to the pool size.
    for i := 0; i < size; i++ {
        pool.workers <- struct{}{}
    }

    // Start worker goroutines
    for i := 0; i < size; i++ {
        go pool.worker()
    }

    be.mutex.Lock()
    be.pools[name] = pool
    be.mutex.Unlock()
}

func (tp *ThreadPool) worker() {
    for task := range tp.queue {
        <-tp.workers // Acquire worker slot

        start := time.Now()
        err := task.Execute()
        duration := time.Since(start)

        tp.metrics.RecordExecution(duration, err)
        tp.workers <- struct{}{} // Release worker slot
    }
}

func (be *BulkheadExecutor) Submit(poolName string, task Task) error {
    be.mutex.RLock()
    pool, exists := be.pools[poolName]
    be.mutex.RUnlock()

    if !exists {
        return fmt.Errorf("pool %s does not exist", poolName)
    }

    // Non-blocking submit: if the queue is full, reject instead of backing up.
    select {
    case pool.queue <- task:
        return nil
    default:
        pool.metrics.RecordRejection()
        return ErrPoolFull
    }
}
```
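Task is left undefined above. A minimal sketch of the interface the worker loop expects, plus an illustrative setup that isolates database work from calls to an external payment provider (the pool names and queryTask type are made up for the example):

```go
// Hypothetical Task interface matching the Execute() call in the worker loop.
type Task interface {
    Execute() error
}

type queryTask struct {
    run func() error
}

func (t queryTask) Execute() error { return t.run() }

func setupBulkheads() *BulkheadExecutor {
    be := NewBulkheadExecutor()

    // Separate pools: a slow payment provider cannot exhaust the workers
    // that serve database queries.
    be.CreatePool("database", 10, 100)
    be.CreatePool("payments", 5, 20)

    if err := be.Submit("database", queryTask{run: func() error {
        // ... run the query ...
        return nil
    }}); err != nil {
        log.Warn("database pool rejected task", "error", err)
    }
    return be
}
```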
Graceful Degradation
Design systems that continue operating with reduced functionality:
```go
type FeatureFlag struct {
    name        string
    enabled     bool
    fallback    func() interface{}
    healthCheck func() bool
}

type FeatureManager struct {
    flags   map[string]*FeatureFlag
    mutex   sync.RWMutex
    monitor *HealthMonitor
}

func (fm *FeatureManager) Execute(flagName string, primary func() (interface{}, error)) (interface{}, error) {
    fm.mutex.RLock()
    flag, exists := fm.flags[flagName]
    fm.mutex.RUnlock()

    if !exists {
        return primary()
    }

    // Check if the feature is healthy
    if !flag.enabled || (flag.healthCheck != nil && !flag.healthCheck()) {
        log.Info("Feature degraded, using fallback", "feature", flagName)

        if flag.fallback != nil {
            return flag.fallback(), nil
        }

        return nil, ErrFeatureDegraded
    }

    return primary()
}

// Example usage
func (s *SearchService) SearchProducts(query string) (*SearchResults, error) {
    result, err := s.featureManager.Execute("advanced_search", func() (interface{}, error) {
        // Advanced search with ML ranking
        return s.advancedSearch(query)
    })
    if errors.Is(err, ErrFeatureDegraded) {
        // Simple text-based search fallback. It lives at the call site because
        // a fallback registered at setup time cannot see the per-request query.
        return s.simpleSearch(query)
    }
    if err != nil {
        return nil, err
    }
    return result.(*SearchResults), nil
}

func (s *SearchService) setupSearchDegradation() {
    s.featureManager.RegisterFlag("advanced_search", &FeatureFlag{
        name:    "advanced_search",
        enabled: true,
        healthCheck: func() bool {
            // Check if the ML service is responsive
            return s.mlService.IsHealthy()
        },
    })
}
```
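RegisterFlag is referenced but not shown. A minimal sketch, assuming it simply stores the flag under its name:

```go
// Sketch of the registration helper used above.
func (fm *FeatureManager) RegisterFlag(name string, flag *FeatureFlag) {
    fm.mutex.Lock()
    defer fm.mutex.Unlock()

    if fm.flags == nil {
        fm.flags = make(map[string]*FeatureFlag)
    }
    fm.flags[name] = flag
}
```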
Health Checks and Monitoring
Implement comprehensive health monitoring:
```go
type HealthChecker struct {
    checks  map[string]HealthCheck
    timeout time.Duration
    cache   *HealthCache
}

type HealthCheck interface {
    Name() string
    Check(ctx context.Context) HealthStatus
    IsCritical() bool
}

type DatabaseHealthCheck struct {
    db       *sql.DB
    critical bool
}

func (dhc *DatabaseHealthCheck) Name() string { return "database" }

func (dhc *DatabaseHealthCheck) IsCritical() bool { return dhc.critical }

func (dhc *DatabaseHealthCheck) Check(ctx context.Context) HealthStatus {
    ctx, cancel := context.WithTimeout(ctx, time.Second*5)
    defer cancel()

    err := dhc.db.PingContext(ctx)
    if err != nil {
        return HealthStatus{
            Status:  StatusUnhealthy,
            Message: fmt.Sprintf("Database ping failed: %v", err),
        }
    }

    // Check if we can perform a simple query
    var count int
    err = dhc.db.QueryRowContext(ctx, "SELECT 1").Scan(&count)
    if err != nil {
        return HealthStatus{
            Status:  StatusUnhealthy,
            Message: fmt.Sprintf("Database query failed: %v", err),
        }
    }

    return HealthStatus{
        Status:  StatusHealthy,
        Message: "Database is healthy",
    }
}

func (hc *HealthChecker) CheckAll(ctx context.Context) map[string]HealthStatus {
    results := make(map[string]HealthStatus)

    // Run checks concurrently
    var wg sync.WaitGroup
    var mutex sync.Mutex

    for name, check := range hc.checks {
        wg.Add(1)
        go func(name string, check HealthCheck) {
            defer wg.Done()

            // Check cache first
            if cached := hc.cache.Get(name); cached != nil {
                mutex.Lock()
                results[name] = *cached
                mutex.Unlock()
                return
            }

            status := check.Check(ctx)

            // Cache result
            hc.cache.Set(name, &status, time.Minute)

            mutex.Lock()
            results[name] = status
            mutex.Unlock()
        }(name, check)
    }

    wg.Wait()
    return results
}
```
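These results are typically exposed over HTTP for load balancers and orchestrators. A hedged sketch of such an endpoint, assuming the HealthStatus fields shown above and returning 503 when any critical check is unhealthy:

```go
// Illustrative handler: 200 when all checks pass, 503 when a critical check
// reports StatusUnhealthy. The JSON body carries the per-check results.
func (hc *HealthChecker) Handler() http.HandlerFunc {
    return func(w http.ResponseWriter, r *http.Request) {
        ctx, cancel := context.WithTimeout(r.Context(), hc.timeout)
        defer cancel()

        results := hc.CheckAll(ctx)

        code := http.StatusOK
        for name, status := range results {
            if status.Status == StatusUnhealthy && hc.checks[name].IsCritical() {
                code = http.StatusServiceUnavailable
                break
            }
        }

        w.Header().Set("Content-Type", "application/json")
        w.WriteHeader(code)
        json.NewEncoder(w).Encode(results)
    }
}
```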
Load Shedding
Protect system capacity during traffic spikes:
```go
type LoadShedder struct {
    maxConcurrency int64
    current        int64
    queue          chan Request
    metrics        *LoadMetrics
}

func NewLoadShedder(maxConcurrency int, queueSize int) *LoadShedder {
    return &LoadShedder{
        maxConcurrency: int64(maxConcurrency),
        queue:          make(chan Request, queueSize),
        metrics:        NewLoadMetrics(),
    }
}

func (ls *LoadShedder) Process(req Request, handler func(Request) error) error {
    current := atomic.LoadInt64(&ls.current)

    // Reject if over capacity
    if current >= ls.maxConcurrency {
        ls.metrics.RecordShed()
        return ErrOverCapacity
    }

    // Try to take a bounded queue slot; the slot is released when the handler
    // finishes, so the channel acts as a second admission limit.
    select {
    case ls.queue <- req:
        atomic.AddInt64(&ls.current, 1)
        defer func() {
            atomic.AddInt64(&ls.current, -1)
            <-ls.queue // release the queue slot
        }()

        return handler(req)
    default:
        ls.metrics.RecordShed()
        return ErrQueueFull
    }
}

// Priority-based load shedding
func (ls *LoadShedder) ProcessWithPriority(req PriorityRequest, handler func(Request) error) error {
    current := atomic.LoadInt64(&ls.current)

    if current >= ls.maxConcurrency {
        // Shed low-priority requests first
        if req.Priority < PriorityHigh {
            ls.metrics.RecordShed()
            return ErrOverCapacity
        }

        // For high-priority requests, try to preempt lower-priority ones
        if ls.preemptLowPriority() {
            atomic.AddInt64(&ls.current, 1)
            defer atomic.AddInt64(&ls.current, -1)
            return handler(req.Request)
        }

        return ErrOverCapacity
    }

    atomic.AddInt64(&ls.current, 1)
    defer atomic.AddInt64(&ls.current, -1)

    return handler(req.Request)
}
```
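A hedged usage sketch as HTTP middleware. The Request type is not defined in the snippets above; here it is assumed to be a thin wrapper around *http.Request with an HTTP field, which is illustrative only:

```go
// Illustrative middleware: shed excess requests with a 503 and a Retry-After
// hint so well-behaved clients back off instead of retrying immediately.
func ShedMiddleware(ls *LoadShedder, next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        err := ls.Process(Request{HTTP: r}, func(req Request) error {
            next.ServeHTTP(w, req.HTTP)
            return nil
        })
        if errors.Is(err, ErrOverCapacity) || errors.Is(err, ErrQueueFull) {
            w.Header().Set("Retry-After", "1")
            http.Error(w, "service overloaded", http.StatusServiceUnavailable)
        }
    })
}
```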
Chaos Engineering
Proactively test system resilience:
```go
type ChaosExperiment struct {
    name        string
    enabled     bool
    probability float64
    impact      ChaosImpact
    schedule    *ChaosSchedule
}

type ChaosImpact interface {
    Apply(ctx context.Context) error
    Rollback(ctx context.Context) error
}

type LatencyInjection struct {
    delay    time.Duration
    variance time.Duration
}

func (li *LatencyInjection) Apply(ctx context.Context) error {
    delay := li.delay
    if li.variance > 0 {
        variance := time.Duration(rand.Float64() * float64(li.variance))
        delay += variance
    }

    select {
    case <-time.After(delay):
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

func (li *LatencyInjection) Rollback(ctx context.Context) error {
    // Injected latency is transient; there is nothing to undo.
    return nil
}

type ChaosMonkey struct {
    experiments []*ChaosExperiment
    enabled     bool
    logger      logger.Logger
}

func (cm *ChaosMonkey) MaybeInjectChaos(ctx context.Context, operation string) error {
    if !cm.enabled {
        return nil
    }

    for _, exp := range cm.experiments {
        if !exp.enabled {
            continue
        }

        if rand.Float64() < exp.probability {
            cm.logger.Info("Injecting chaos",
                "experiment", exp.name,
                "operation", operation)

            return exp.impact.Apply(ctx)
        }
    }

    return nil
}

// Usage in service calls
func (s *PaymentService) ProcessPayment(ctx context.Context, req PaymentRequest) error {
    // Inject chaos for testing
    if err := s.chaosMonkey.MaybeInjectChaos(ctx, "process_payment"); err != nil {
        return err
    }

    // Normal payment processing
    return s.processPaymentInternal(ctx, req)
}
```
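Latency is only one fault type. A sketch of a second impact under the same interface that fails calls outright instead of delaying them (the ErrorInjection type is made up for the example):

```go
// Illustrative impact: return a pre-configured error to simulate a failing
// dependency. A one-shot injected error has nothing to roll back.
type ErrorInjection struct {
    err error
}

func (ei *ErrorInjection) Apply(ctx context.Context) error {
    return ei.err
}

func (ei *ErrorInjection) Rollback(ctx context.Context) error {
    return nil
}
```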
Metrics and Alerting
Monitor system resilience with key metrics:
```go
type ResilienceMetrics struct {
    CircuitBreakerState *prometheus.GaugeVec
    RetryAttempts       *prometheus.CounterVec
    LoadShedCount       *prometheus.CounterVec
    DegradationEvents   *prometheus.CounterVec
    RecoveryTime        *prometheus.HistogramVec
}

func NewResilienceMetrics() *ResilienceMetrics {
    return &ResilienceMetrics{
        CircuitBreakerState: prometheus.NewGaugeVec(
            prometheus.GaugeOpts{
                Name: "circuit_breaker_state",
                Help: "Circuit breaker state (0=closed, 1=half-open, 2=open)",
            },
            []string{"service", "operation"},
        ),

        RetryAttempts: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "retry_attempts_total",
                Help: "Total number of retry attempts",
            },
            []string{"service", "operation", "result"},
        ),

        LoadShedCount: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "load_shed_total",
                Help: "Total number of requests shed",
            },
            []string{"service", "reason"},
        ),
    }
}

// Alerting rules for system resilience
const alertingRules = `
groups:
  - name: resilience
    rules:
      - alert: CircuitBreakerOpen
        expr: circuit_breaker_state == 2
        for: 1m
        annotations:
          summary: "Circuit breaker is open for {{ $labels.service }}"

      - alert: HighRetryRate
        expr: rate(retry_attempts_total[5m]) > 10
        for: 2m
        annotations:
          summary: "High retry rate detected"

      - alert: LoadSheddingActive
        expr: rate(load_shed_total[5m]) > 1
        for: 30s
        annotations:
          summary: "Load shedding is active"
`
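Wiring the metrics into the circuit breaker is straightforward. A hedged sketch, assuming the collectors are registered with the default Prometheus registry and the state mapping in the Help text above:

```go
// Illustrative wiring: register the collectors once at startup, then update
// the gauge whenever the breaker changes state (0=closed, 1=half-open, 2=open).
func RegisterResilienceMetrics(m *ResilienceMetrics) {
    prometheus.MustRegister(
        m.CircuitBreakerState,
        m.RetryAttempts,
        m.LoadShedCount,
    )
}

func recordBreakerState(m *ResilienceMetrics, service, operation string, state CircuitState) {
    var value float64
    switch state {
    case CircuitClosed:
        value = 0
    case CircuitHalfOpen:
        value = 1
    case CircuitOpen:
        value = 2
    }
    m.CircuitBreakerState.WithLabelValues(service, operation).Set(value)
}
```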
Key Principles for Resilient Systems
- Fail fast: Don’t let failing operations consume resources
- Isolate failures: Use bulkheads to prevent cascade failures
- Degrade gracefully: Maintain core functionality when possible
- Monitor everything: Comprehensive observability is essential
- Test failures: Use chaos engineering to validate resilience
- Plan for recovery: Design systems that can recover automatically
Building resilient systems requires thinking about failure from day one. By implementing these patterns and practices, you can build systems that not only survive failures but recover gracefully and continue serving users even under adverse conditions.
Remember: resilience is not a destination but an ongoing practice. Continuously test, monitor, and improve your system’s ability to handle failure.