System failures are inevitable. The question isn’t whether your system will fail, but how gracefully it handles those failures. Building resilient systems requires embracing failure as a first-class concern in your architecture.
The Reality of Distributed Systems
Modern applications are complex distributed systems with many potential failure points:
- Network partitions between services
- Hardware failures causing server downtime
- Software bugs leading to crashes
- Traffic spikes overwhelming system capacity
- Dependency failures cascading through the system
Each of these can bring down an entire application if not properly handled.
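Before reaching for dedicated resilience patterns, the cheapest defense against most of these failure modes is a deadline on every outbound call, so a partitioned or overloaded dependency cannot hold a request (and its goroutine, connection, and memory) hostage. A minimal sketch using Go's context package; the service URL and the 500 ms budget are illustrative:

```go
import (
	"context"
	"fmt"
	"net/http"
	"time"
)

// fetchProfile gives the downstream call a hard deadline. If the profile
// service hangs, this request fails quickly instead of piling up.
func fetchProfile(ctx context.Context, userID string) (*http.Response, error) {
	ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond)
	defer cancel()

	url := fmt.Sprintf("https://profile-service.internal/users/%s", userID) // illustrative endpoint
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	return http.DefaultClient.Do(req)
}
```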
Circuit Breaker Pattern
The circuit breaker prevents cascading failures by failing fast when dependencies are unhealthy:
```go
// Circuit state machine: Closed (normal operation), Open (failing fast),
// Half-Open (probing whether the dependency has recovered).
type CircuitState int

const (
	CircuitClosed CircuitState = iota
	CircuitHalfOpen
	CircuitOpen
)

var ErrCircuitOpen = errors.New("circuit breaker is open")

type CircuitBreaker struct {
	name         string
	state        CircuitState
	failures     int64
	successes    int64
	lastFailTime time.Time
	settings     CircuitSettings
	mutex        sync.RWMutex
}

type CircuitSettings struct {
	MaxFailures      int64
	ResetTimeout     time.Duration
	SuccessThreshold int64
}

func (cb *CircuitBreaker) Execute(operation func() error) error {
	cb.mutex.RLock()
	state := cb.state
	lastFailTime := cb.lastFailTime
	cb.mutex.RUnlock()

	switch state {
	case CircuitClosed:
		return cb.executeInClosedState(operation)
	case CircuitOpen:
		// After the reset timeout, let a trial request through.
		if time.Since(lastFailTime) > cb.settings.ResetTimeout {
			return cb.executeInHalfOpenState(operation)
		}
		return ErrCircuitOpen
	case CircuitHalfOpen:
		return cb.executeInHalfOpenState(operation)
	}
	return nil
}

func (cb *CircuitBreaker) executeInClosedState(operation func() error) error {
	err := operation()

	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	if err != nil {
		cb.failures++
		cb.lastFailTime = time.Now()
		if cb.failures >= cb.settings.MaxFailures {
			cb.state = CircuitOpen
			slog.Warn("Circuit breaker opened", "name", cb.name)
		}
		return err
	}

	cb.failures = 0
	return nil
}

func (cb *CircuitBreaker) executeInHalfOpenState(operation func() error) error {
	err := operation()

	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	if err != nil {
		// A failed probe sends the circuit straight back to open.
		cb.state = CircuitOpen
		cb.failures++
		cb.lastFailTime = time.Now()
		return err
	}

	// Enough successful probes close the circuit again.
	cb.successes++
	if cb.successes >= cb.settings.SuccessThreshold {
		cb.state = CircuitClosed
		cb.failures = 0
		cb.successes = 0
		slog.Info("Circuit breaker closed", "name", cb.name)
	}
	return nil
}
```
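The excerpt above leaves out construction and call-site wiring. A usage sketch, assuming the types above; the breaker name, settings, and the chargeCard callback are illustrative:

```go
import (
	"errors"
	"fmt"
	"time"
)

// newPaymentBreaker builds a breaker for one specific dependency; each
// downstream service should get its own instance and thresholds.
func newPaymentBreaker() *CircuitBreaker {
	return &CircuitBreaker{
		name: "payment-gateway",
		settings: CircuitSettings{
			MaxFailures:      5,                // open after 5 consecutive failures
			ResetTimeout:     30 * time.Second, // probe again after 30s
			SuccessThreshold: 2,                // close after 2 successful probes
		},
	}
}

// chargePayment wraps the outbound call; when the circuit is open it fails
// fast instead of queueing work behind a dependency we already know is down.
func chargePayment(cb *CircuitBreaker, chargeCard func() error) error {
	err := cb.Execute(chargeCard)
	if errors.Is(err, ErrCircuitOpen) {
		return fmt.Errorf("payment gateway unavailable: %w", err)
	}
	return err
}
```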
Retry Strategies
Implement intelligent retry mechanisms with exponential backoff:
```go
type RetryConfig struct {
	MaxRetries    int
	BaseDelay     time.Duration
	MaxDelay      time.Duration
	BackoffFactor float64
	Jitter        bool
}

func RetryWithBackoff(operation func() error, config RetryConfig) error {
	var lastErr error

	for attempt := 0; attempt <= config.MaxRetries; attempt++ {
		if attempt > 0 {
			delay := calculateDelay(attempt, config)
			if config.Jitter {
				delay = addJitter(delay)
			}
			time.Sleep(delay)
		}

		lastErr = operation()
		if lastErr == nil {
			return nil
		}

		// Don't retry for certain error types
		if !isRetryableError(lastErr) {
			return lastErr
		}

		slog.Debug("Retrying operation",
			"attempt", attempt+1,
			"error", lastErr)
	}

	return fmt.Errorf("operation failed after %d attempts: %w",
		config.MaxRetries+1, lastErr)
}

func calculateDelay(attempt int, config RetryConfig) time.Duration {
	// Exponential backoff: BaseDelay * BackoffFactor^(attempt-1), capped at MaxDelay.
	delay := float64(config.BaseDelay) *
		math.Pow(config.BackoffFactor, float64(attempt-1))
	if delay > float64(config.MaxDelay) {
		delay = float64(config.MaxDelay)
	}
	return time.Duration(delay)
}

func addJitter(delay time.Duration) time.Duration {
	// Up to 10% random jitter so synchronized clients don't retry in lockstep.
	jitter := time.Duration(rand.Float64() * float64(delay) * 0.1)
	return delay + jitter
}
```
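The loop above defers to isRetryableError, which the excerpt does not define. One plausible sketch: retry deadline and transient network errors plus an illustrative rate-limit sentinel, and fail immediately on everything else (retries only make sense for idempotent operations):

```go
import (
	"context"
	"errors"
	"net"
)

// errRateLimited is an illustrative sentinel a client might return for HTTP 429/503 responses.
var errRateLimited = errors.New("rate limited by upstream")

// isRetryableError treats timeouts and rate limiting as transient; validation
// errors, auth failures, and other permanent errors are returned immediately.
func isRetryableError(err error) bool {
	if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, errRateLimited) {
		return true
	}
	var netErr net.Error
	return errors.As(err, &netErr) && netErr.Timeout()
}
```

A typical configuration might be MaxRetries: 3, BaseDelay: 100 ms, BackoffFactor: 2.0 and Jitter: true, which yields delays of roughly 100 ms, 200 ms, and 400 ms (plus jitter) before giving up.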
Bulkhead Pattern
Isolate critical resources to prevent total system failure:
```go
// Thread pool isolation
type Task interface {
	Execute() error
}

type BulkheadExecutor struct {
	pools map[string]*ThreadPool
	mutex sync.RWMutex
}

type ThreadPool struct {
	name    string
	workers chan struct{}
	queue   chan Task
	metrics *PoolMetrics
}

func NewBulkheadExecutor() *BulkheadExecutor {
	return &BulkheadExecutor{
		pools: make(map[string]*ThreadPool),
	}
}

func (be *BulkheadExecutor) CreatePool(name string, size int, queueSize int) {
	pool := &ThreadPool{
		name:    name,
		workers: make(chan struct{}, size),
		queue:   make(chan Task, queueSize),
		metrics: NewPoolMetrics(),
	}

	// Fill worker semaphore
	for i := 0; i < size; i++ {
		pool.workers <- struct{}{}
	}

	// Start worker goroutines
	for i := 0; i < size; i++ {
		go pool.worker()
	}

	be.mutex.Lock()
	be.pools[name] = pool
	be.mutex.Unlock()
}

func (tp *ThreadPool) worker() {
	for task := range tp.queue {
		<-tp.workers // Acquire worker slot

		start := time.Now()
		err := task.Execute()
		duration := time.Since(start)

		tp.metrics.RecordExecution(duration, err)
		tp.workers <- struct{}{} // Release worker slot
	}
}

func (be *BulkheadExecutor) Submit(poolName string, task Task) error {
	be.mutex.RLock()
	pool, exists := be.pools[poolName]
	be.mutex.RUnlock()

	if !exists {
		return fmt.Errorf("pool %s does not exist", poolName)
	}

	select {
	case pool.queue <- task:
		return nil
	default:
		pool.metrics.RecordRejection()
		return ErrPoolFull
	}
}
```
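A wiring sketch, assuming the types above; the pool names and sizes are illustrative. Keeping the database pool separate from the external-API pool means a slow third party exhausts only its own workers:

```go
// queryTask is a minimal illustrative Task implementation.
type queryTask struct {
	run func() error
}

func (t queryTask) Execute() error { return t.run() }

func setupBulkheads() *BulkheadExecutor {
	be := NewBulkheadExecutor()
	be.CreatePool("database", 20, 100)   // generous pool for the primary datastore
	be.CreatePool("external_api", 5, 50) // small, isolated pool for the flaky third party
	return be
}

// Submit only enqueues the task; it runs later on one of the pool's workers,
// and returns ErrPoolFull rather than blocking when the bulkhead is saturated.
func saveOrderAsync(be *BulkheadExecutor, persist func() error) error {
	return be.Submit("database", queryTask{run: persist})
}
```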
Graceful Degradation
Design systems that continue operating with reduced functionality:
```go
type FeatureFlag struct {
	name        string
	enabled     bool
	fallback    func() interface{}
	healthCheck func() bool
}

type FeatureManager struct {
	flags   map[string]*FeatureFlag
	mutex   sync.RWMutex
	monitor *HealthMonitor
}

func (fm *FeatureManager) RegisterFlag(name string, flag *FeatureFlag) {
	fm.mutex.Lock()
	defer fm.mutex.Unlock()
	if fm.flags == nil {
		fm.flags = make(map[string]*FeatureFlag)
	}
	fm.flags[name] = flag
}

func (fm *FeatureManager) Execute(flagName string, primary func() (interface{}, error)) (interface{}, error) {
	fm.mutex.RLock()
	flag, exists := fm.flags[flagName]
	fm.mutex.RUnlock()

	if !exists {
		return primary()
	}

	// Check if feature is healthy
	if !flag.enabled || (flag.healthCheck != nil && !flag.healthCheck()) {
		slog.Info("Feature degraded, using fallback", "feature", flagName)
		if flag.fallback != nil {
			return flag.fallback(), nil
		}
		return nil, ErrFeatureDegraded
	}

	return primary()
}

// Example usage
func (s *SearchService) SearchProducts(query string) (*SearchResults, error) {
	result, err := s.featureManager.Execute("advanced_search", func() (interface{}, error) {
		// Advanced search with ML ranking
		return s.advancedSearch(query)
	})
	if errors.Is(err, ErrFeatureDegraded) {
		// Simple text-based search fallback; it lives here rather than on the
		// flag because it needs the caller's query.
		return s.simpleSearch(query)
	}
	if err != nil {
		return nil, err
	}
	return result.(*SearchResults), nil
}

func (s *SearchService) setupSearchDegradation() {
	s.featureManager.RegisterFlag("advanced_search", &FeatureFlag{
		name:    "advanced_search",
		enabled: true,
		healthCheck: func() bool {
			// Check if ML service is responsive
			return s.mlService.IsHealthy()
		},
	})
}
```
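The search example keeps its fallback at the call site because the fallback needs the query. Where a fallback is request-independent, registering it on the flag works well; an illustrative second flag (the RecommendationService, popularProductsCache, and recEngine names are assumptions):

```go
// If the recommendation engine is unhealthy, serve a cached list of popular
// products instead of personalized results: degraded, but not broken.
func (s *RecommendationService) setupRecommendationDegradation() {
	s.featureManager.RegisterFlag("personalized_recs", &FeatureFlag{
		name:    "personalized_recs",
		enabled: true,
		fallback: func() interface{} {
			return s.popularProductsCache.Snapshot() // needs no per-request data
		},
		healthCheck: func() bool {
			return s.recEngine.IsHealthy()
		},
	})
}
```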
Health Checks and Monitoring
Implement comprehensive health monitoring:
```go
type HealthChecker struct {
	checks  map[string]HealthCheck
	timeout time.Duration
	cache   *HealthCache
}

type HealthCheck interface {
	Name() string
	Check(ctx context.Context) HealthStatus
	IsCritical() bool
}

type DatabaseHealthCheck struct {
	db       *sql.DB
	critical bool
}

func (dhc *DatabaseHealthCheck) Name() string     { return "database" }
func (dhc *DatabaseHealthCheck) IsCritical() bool { return dhc.critical }

func (dhc *DatabaseHealthCheck) Check(ctx context.Context) HealthStatus {
	ctx, cancel := context.WithTimeout(ctx, time.Second*5)
	defer cancel()

	err := dhc.db.PingContext(ctx)
	if err != nil {
		return HealthStatus{
			Status:  StatusUnhealthy,
			Message: fmt.Sprintf("Database ping failed: %v", err),
		}
	}

	// Check if we can perform a simple query
	var count int
	err = dhc.db.QueryRowContext(ctx, "SELECT 1").Scan(&count)
	if err != nil {
		return HealthStatus{
			Status:  StatusUnhealthy,
			Message: fmt.Sprintf("Database query failed: %v", err),
		}
	}

	return HealthStatus{
		Status:  StatusHealthy,
		Message: "Database is healthy",
	}
}

func (hc *HealthChecker) CheckAll(ctx context.Context) map[string]HealthStatus {
	results := make(map[string]HealthStatus)

	// Run checks concurrently
	var wg sync.WaitGroup
	var mutex sync.Mutex

	for name, check := range hc.checks {
		wg.Add(1)
		go func(name string, check HealthCheck) {
			defer wg.Done()

			// Check cache first
			if cached := hc.cache.Get(name); cached != nil {
				mutex.Lock()
				results[name] = *cached
				mutex.Unlock()
				return
			}

			status := check.Check(ctx)

			// Cache result
			hc.cache.Set(name, &status, time.Minute)

			mutex.Lock()
			results[name] = status
			mutex.Unlock()
		}(name, check)
	}

	wg.Wait()
	return results
}
```
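How these results reach a load balancer or orchestrator is outside the excerpt. A sketch of a /healthz handler, assuming the HealthChecker, HealthStatus, and StatusHealthy values above; only a failing critical check flips the status code:

```go
import (
	"encoding/json"
	"net/http"
)

// ServeHTTP exposes the aggregated checks. Non-critical failures are reported
// in the body but do not change the status code, so a flaky optional
// dependency doesn't get the instance pulled from rotation.
func (hc *HealthChecker) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	results := hc.CheckAll(r.Context())

	code := http.StatusOK
	for name, status := range results {
		if status.Status != StatusHealthy && hc.checks[name].IsCritical() {
			code = http.StatusServiceUnavailable
			break
		}
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	_ = json.NewEncoder(w).Encode(results)
}
```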
Load Shedding
Protect system capacity during traffic spikes:
```go
type LoadShedder struct {
	maxConcurrency int64
	current        int64
	queue          chan Request
	metrics        *LoadMetrics
}

func NewLoadShedder(maxConcurrency int, queueSize int) *LoadShedder {
	return &LoadShedder{
		maxConcurrency: int64(maxConcurrency),
		queue:          make(chan Request, queueSize),
		metrics:        NewLoadMetrics(),
	}
}

func (ls *LoadShedder) Process(req Request, handler func(Request) error) error {
	// Reserve a slot first, then check the limit; checking before incrementing
	// would race with concurrent callers.
	if atomic.AddInt64(&ls.current, 1) > ls.maxConcurrency {
		atomic.AddInt64(&ls.current, -1)
		ls.metrics.RecordShed()
		return ErrOverCapacity
	}
	defer atomic.AddInt64(&ls.current, -1)

	// Use the buffered channel as a bounded admission queue; the slot is
	// released once the handler finishes.
	select {
	case ls.queue <- req:
		defer func() { <-ls.queue }()
		return handler(req)
	default:
		ls.metrics.RecordShed()
		return ErrQueueFull
	}
}

// Priority-based load shedding
func (ls *LoadShedder) ProcessWithPriority(req PriorityRequest, handler func(Request) error) error {
	if atomic.AddInt64(&ls.current, 1) > ls.maxConcurrency {
		defer atomic.AddInt64(&ls.current, -1)

		// Shed low priority requests first
		if req.Priority < PriorityHigh {
			ls.metrics.RecordShed()
			return ErrOverCapacity
		}

		// For high priority requests, try to preempt lower priority ones
		if ls.preemptLowPriority() {
			return handler(req.Request)
		}
		return ErrOverCapacity
	}

	defer atomic.AddInt64(&ls.current, -1)
	return handler(req.Request)
}
```
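At the edge this usually sits in HTTP middleware. A sketch assuming the LoadShedder above and that Request is simply an alias for *http.Request (an illustrative simplification); shed requests become 503s with a Retry-After hint so well-behaved clients back off:

```go
import (
	"errors"
	"net/http"
)

func shedMiddleware(ls *LoadShedder, next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		err := ls.Process(r, func(req Request) error {
			next.ServeHTTP(w, r)
			return nil
		})
		if errors.Is(err, ErrOverCapacity) || errors.Is(err, ErrQueueFull) {
			w.Header().Set("Retry-After", "1")
			http.Error(w, "server overloaded, please retry", http.StatusServiceUnavailable)
		}
	})
}
```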
Chaos Engineering
Proactively test system resilience:
```go
type ChaosExperiment struct {
	name        string
	enabled     bool
	probability float64
	impact      ChaosImpact
	schedule    *ChaosSchedule
}

type ChaosImpact interface {
	Apply(ctx context.Context) error
	Rollback(ctx context.Context) error
}

type LatencyInjection struct {
	delay    time.Duration
	variance time.Duration
}

func (li *LatencyInjection) Apply(ctx context.Context) error {
	delay := li.delay
	if li.variance > 0 {
		variance := time.Duration(rand.Float64() * float64(li.variance))
		delay += variance
	}

	select {
	case <-time.After(delay):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// Injected latency needs no cleanup, so Rollback is a no-op.
func (li *LatencyInjection) Rollback(ctx context.Context) error {
	return nil
}

type ChaosMonkey struct {
	experiments []*ChaosExperiment
	enabled     bool
	logger      *slog.Logger
}

func (cm *ChaosMonkey) MaybeInjectChaos(ctx context.Context, operation string) error {
	if !cm.enabled {
		return nil
	}

	for _, exp := range cm.experiments {
		if !exp.enabled {
			continue
		}

		if rand.Float64() < exp.probability {
			cm.logger.Info("Injecting chaos",
				"experiment", exp.name,
				"operation", operation)
			return exp.impact.Apply(ctx)
		}
	}

	return nil
}

// Usage in service calls
func (s *PaymentService) ProcessPayment(ctx context.Context, req PaymentRequest) error {
	// Inject chaos for testing
	if err := s.chaosMonkey.MaybeInjectChaos(ctx, "process_payment"); err != nil {
		return err
	}

	// Normal payment processing
	return s.processPaymentInternal(ctx, req)
}
```
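Latency is only one failure mode worth rehearsing. A second illustrative ChaosImpact that fails calls outright, plus a constructor that only arms the monkey when explicitly opted in (the CHAOS_ENABLED variable name is an assumption):

```go
import (
	"context"
	"log/slog"
	"os"
)

// ErrorInjection simulates a crashed or misbehaving dependency rather than a slow one.
type ErrorInjection struct {
	err error
}

func (ei *ErrorInjection) Apply(ctx context.Context) error    { return ei.err }
func (ei *ErrorInjection) Rollback(ctx context.Context) error { return nil }

// newChaosMonkeyFromEnv keeps chaos out of production unless someone
// deliberately turns it on for the environment.
func newChaosMonkeyFromEnv(logger *slog.Logger, experiments []*ChaosExperiment) *ChaosMonkey {
	return &ChaosMonkey{
		experiments: experiments,
		enabled:     os.Getenv("CHAOS_ENABLED") == "true",
		logger:      logger,
	}
}
```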
Metrics and Alerting
Monitor system resilience with key metrics:
```go
type ResilienceMetrics struct {
	CircuitBreakerState *prometheus.GaugeVec
	RetryAttempts       *prometheus.CounterVec
	LoadShedCount       *prometheus.CounterVec
	DegradationEvents   *prometheus.CounterVec
	RecoveryTime        *prometheus.HistogramVec
}

func NewResilienceMetrics() *ResilienceMetrics {
	return &ResilienceMetrics{
		CircuitBreakerState: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Name: "circuit_breaker_state",
				Help: "Circuit breaker state (0=closed, 1=half-open, 2=open)",
			},
			[]string{"service", "operation"},
		),
		RetryAttempts: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "retry_attempts_total",
				Help: "Total number of retry attempts",
			},
			[]string{"service", "operation", "result"},
		),
		LoadShedCount: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "load_shed_total",
				Help: "Total number of requests shed",
			},
			[]string{"service", "reason"},
		),
		// DegradationEvents and RecoveryTime are constructed the same way.
	}
}

// Alerting rules for system resilience
const alertingRules = `
groups:
  - name: resilience
    rules:
      - alert: CircuitBreakerOpen
        expr: circuit_breaker_state == 2
        for: 1m
        annotations:
          summary: "Circuit breaker is open for {{ $labels.service }}"

      - alert: HighRetryRate
        expr: rate(retry_attempts_total[5m]) > 10
        for: 2m
        annotations:
          summary: "High retry rate detected"

      - alert: LoadSheddingActive
        expr: rate(load_shed_total[5m]) > 1
        for: 30s
        annotations:
          summary: "Load shedding is active"
`
```
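The collectors only matter once they are registered and fed by the resilience primitives. A sketch, assuming the CircuitState constants from the circuit breaker section; the helper names are illustrative:

```go
import "github.com/prometheus/client_golang/prometheus"

// registerResilienceMetrics exposes the collectors via the default registry;
// call once at start-up before serving /metrics.
func registerResilienceMetrics(m *ResilienceMetrics) {
	prometheus.MustRegister(m.CircuitBreakerState, m.RetryAttempts, m.LoadShedCount)
}

// reportBreakerState maps CircuitState onto the gauge values the alerting
// rules above expect (0=closed, 1=half-open, 2=open).
func reportBreakerState(m *ResilienceMetrics, service, operation string, state CircuitState) {
	m.CircuitBreakerState.WithLabelValues(service, operation).Set(float64(state))
}
```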
Key Principles for Resilient Systems
- Fail fast: Don’t let failing operations consume resources
- Isolate failures: Use bulkheads to prevent cascade failures
- Degrade gracefully: Maintain core functionality when possible
- Monitor everything: Comprehensive observability is essential
- Test failures: Use chaos engineering to validate resilience
- Plan for recovery: Design systems that can recover automatically
Building resilient systems requires thinking about failure from day one. By implementing these patterns and practices, you can build systems that not only survive failures but recover gracefully and continue serving users even under adverse conditions.
Remember: resilience is not a destination but an ongoing practice. Continuously test, monitor, and improve your system’s ability to handle failure.