System Design Fundamentals: Building Scalable Applications

February 1, 2024

System design is the art of building software systems that can handle millions of users while remaining maintainable and cost-effective. Let’s explore the fundamental concepts that form the backbone of scalable applications.

Scalability Principles

Horizontal vs Vertical Scaling

Vertical Scaling (Scale Up)

Add more power to existing machines
Limited by hardware constraints
Single point of failure

Horizontal Scaling (Scale Out)

Add more machines to the pool
Theoretically unlimited scaling
Better fault tolerance

# Example: Horizontal scaling with load balancer
class LoadBalancer:
    """Round-robin load balancer over a pool of server addresses.

    Each call to :meth:`get_server` returns the next server in the pool and
    wraps around to the first one after the last.

    Attributes:
        servers: List of server addresses being rotated through.
        current: Index of the server the next call will return.
    """

    # Pool used when the caller does not supply one.
    DEFAULT_SERVERS = (
        "server1.example.com",
        "server2.example.com",
        "server3.example.com",
    )

    def __init__(self, servers=None):
        """Create a balancer.

        Args:
            servers: Optional iterable of server addresses. When omitted, the
                three example hosts in ``DEFAULT_SERVERS`` are used.

        Raises:
            ValueError: If the resulting pool is empty.
        """
        # Copy so later changes to the caller's sequence do not alter the pool.
        self.servers = list(
            self.DEFAULT_SERVERS if servers is None else servers
        )
        if not self.servers:
            # Fail here rather than with ZeroDivisionError in get_server().
            raise ValueError("LoadBalancer requires at least one server")
        self.current = 0

    def get_server(self):
        """Return the next server in round-robin order and advance the cursor."""
        pool_size = len(self.servers)
        # Re-normalise the cursor in case `servers` shrank after construction.
        index = self.current % pool_size
        self.current = (index + 1) % pool_size
        return self.servers[index]

Database Design Patterns

Master-Slave Replication

-- Master handles writes
-- (statements that change data, such as this INSERT, are sent to the master)
INSERT INTO users (name, email) VALUES ('John', 'john@example.com');

-- Slaves handle reads (with eventual consistency)
-- (a read issued right after the write above may not see the new row yet)
SELECT * FROM users WHERE email = 'john@example.com';

Database Sharding

class UserSharding:
    """Route users to one of ``shard_count`` database shards by user ID.

    Attributes:
        shard_count: Number of shards users are spread across.
        shards: Mapping of shard index (``0 .. shard_count - 1``) to the
            shard's host name.
    """

    def __init__(self, shard_count=4):
        """Build the shard map.

        Args:
            shard_count: Number of shards; must be at least 1.

        Raises:
            ValueError: If ``shard_count`` is smaller than 1.
        """
        if shard_count < 1:
            raise ValueError("shard_count must be at least 1")
        self.shard_count = shard_count
        # Derive the map from shard_count so that every index get_shard() can
        # produce has an entry (a fixed four-entry map fails for counts > 4).
        self.shards = {
            index: f"shard{index}.db.example.com"
            for index in range(shard_count)
        }

    def get_shard(self, user_id):
        """Return the host name of the shard responsible for ``user_id``.

        The mapping is stable across processes for ``int``, ``str`` and
        ``bytes`` IDs. Integer IDs map exactly as ``hash(user_id) %
        shard_count`` did before.
        """
        # Local import keeps the class self-contained within this file.
        import zlib

        # Built-in hash() of str/bytes is salted per interpreter process, so
        # it would send the same user to different shards in different
        # processes. CRC-32 gives the same value everywhere.
        if isinstance(user_id, str):
            digest = zlib.crc32(user_id.encode("utf-8"))
        elif isinstance(user_id, bytes):
            digest = zlib.crc32(user_id)
        else:
            # NOTE(review): other hashable types still go through hash();
            # composite keys that contain strings remain process-dependent.
            digest = hash(user_id)
        return self.shards[digest % self.shard_count]

    def get_user(self, user_id):
        """Fetch ``user_id`` from the shard that owns it."""
        shard_url = self.get_shard(user_id)
        # Connect to specific shard and fetch user
        return fetch_from_shard(shard_url, user_id)

Caching Strategies

Cache-Aside Pattern

import redis
import json

class CacheAside:
    """Cache-aside reader: consult the cache, fall back to the database.

    Attributes:
        redis: Client object providing ``get`` and ``setex``.
        ttl: Expiry, passed to ``setex``, for entries this class writes.
    """

    def __init__(self, redis_client, ttl=3600):
        self.redis, self.ttl = redis_client, ttl

    def get_user(self, user_id):
        """Return the user for ``user_id``, populating the cache on a miss.

        A truthy cached value is JSON-decoded and returned directly.
        Otherwise the user is loaded with ``fetch_user_from_db``; a truthy
        result is written back under ``user:<user_id>`` before being
        returned, while a falsy result is returned without being cached.
        """
        cache_key = f"user:{user_id}"

        hit = self.redis.get(cache_key)
        if hit:
            return json.loads(hit)

        # Nothing usable in the cache: go to the database.
        record = fetch_user_from_db(user_id)
        if not record:
            return record

        self.redis.setex(cache_key, self.ttl, json.dumps(record))
        return record

Write-Through Cache

class WriteThroughCache:
    """Write-through cache: every update goes to the database, then the cache.

    Attributes:
        redis: Client object providing ``setex``.
        ttl: Expiry, passed to ``setex``, for entries this class writes.
    """

    def __init__(self, redis_client=None, ttl=3600):
        """Store the cache client and entry lifetime.

        Without this constructor ``self.redis`` and ``self.ttl`` were never
        set, so ``update_user`` could not run. The signature mirrors
        ``CacheAside``; ``redis_client`` defaults to ``None`` only so that
        existing zero-argument construction keeps working.

        Args:
            redis_client: Client object providing ``setex``.
            ttl: Expiry passed to ``setex`` for cached entries.
        """
        self.redis = redis_client
        self.ttl = ttl

    def update_user(self, user_id, user_data):
        """Persist ``user_data`` for ``user_id`` and refresh its cache entry."""
        # Write to database first
        update_user_in_db(user_id, user_data)

        # Then update cache
        self.redis.setex(
            f"user:{user_id}",
            self.ttl,
            json.dumps(user_data)
        )

Message Queues and Event Streaming

Task Queue with Celery

from celery import Celery

# Celery application named 'tasks'; `broker` points it at Redis on localhost:6379.
app = Celery('tasks', broker='redis://localhost:6379')

@app.task
def send_email(user_id, email_template):
    """Render ``email_template`` for a user and send it to their address.

    Looks the user up with ``get_user``, renders the template with that user
    available as ``user``, and hands the result to ``send_email_via_smtp``
    addressed to ``user.email``.
    """
    recipient = get_user(user_id)
    body = render_template(email_template, user=recipient)
    send_email_via_smtp(recipient.email, body)

# Usage: go through the task's .delay() rather than calling send_email() directly.
# NOTE(review): per Celery's docs .delay() queues the call for a worker — confirm one is running.
send_email.delay(user_id=123, email_template='welcome')

Event Streaming with Kafka

from kafka import KafkaProducer, KafkaConsumer
import json

# Producer
# Connects to the broker at localhost:9092. value_serializer turns each
# message value into JSON text and then UTF-8 bytes before it is sent.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda x: json.dumps(x).encode('utf-8')
)

def publish_user_event(event_type, user_data):
    """Send a user event to the 'user-events' topic via ``producer``.

    Args:
        event_type: Value stored under the ``event_type`` key.
        user_data: Payload stored under the ``data`` key.

    The message also carries a ``timestamp``: the current UTC time as an
    ISO-8601 string without a UTC offset.
    """
    # Imported locally so the function does not depend on a module-level
    # `datetime` import (none is visible in this file).
    from datetime import datetime, timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop tzinfo so the string keeps its previous offset-less form.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    producer.send('user-events', {
        'event_type': event_type,
        'data': user_data,
        'timestamp': timestamp
    })

# Consumer
# Reads 'user-events' from the broker at localhost:9092. value_deserializer
# reverses the producer's encoding: UTF-8 bytes -> JSON text -> Python object.
consumer = KafkaConsumer(
    'user-events',
    bootstrap_servers=['localhost:9092'],
    value_deserializer=lambda m: json.loads(m.decode('utf-8'))
)

# Dispatch every received message to handle_user_event.
# NOTE(review): no exit condition is visible — presumably this loop blocks
# waiting for new messages; confirm before running it outside a worker.
for message in consumer:
    event = message.value
    handle_user_event(event['event_type'], event['data'])

Monitoring and Observability

Health Check Endpoints

@app.route('/health')
def health_check():
    """Report the status of the service's dependencies as JSON.

    Runs the database, Redis and external-API checks. Responds with HTTP 200
    when every check result is truthy and HTTP 503 otherwise.

    Returns:
        A ``(response, status_code)`` tuple. The JSON body holds the overall
        ``status``, the per-dependency ``checks`` and a UTC ``timestamp``
        (ISO-8601, without a UTC offset).
    """
    # Imported locally so the function does not depend on a module-level
    # `datetime` import (none is visible in this file).
    from datetime import datetime, timezone

    checks = {
        'database': check_database_connection(),
        'redis': check_redis_connection(),
        'external_api': check_external_api()
    }

    all_healthy = all(checks.values())
    status_code = 200 if all_healthy else 503

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop tzinfo so the string keeps its previous offset-less form.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return jsonify({
        'status': 'healthy' if all_healthy else 'unhealthy',
        'checks': checks,
        'timestamp': timestamp
    }), status_code

Metrics Collection

from prometheus_client import Counter, Histogram, generate_latest

# Counter 'requests_total', labelled by HTTP method and endpoint.
REQUEST_COUNT = Counter('requests_total', 'Total requests', ['method', 'endpoint'])
# Histogram 'request_duration_seconds'; declared without labels.
REQUEST_DURATION = Histogram('request_duration_seconds', 'Request duration')

@app.before_request
def before_request():
    """Stamp the current request with its start time (``time.time()``).

    The value is stored on ``request.start_time``.
    """
    # Imported locally so the hook does not depend on a module-level `time`
    # import (none is visible in this file).
    import time

    request.start_time = time.time()

@app.after_request
def after_request(response):
    """Record request metrics and pass the response through unchanged.

    Increments ``REQUEST_COUNT`` for the request's method and endpoint and,
    when a start time was recorded on the request, observes the elapsed time
    in ``REQUEST_DURATION``.

    Args:
        response: The response about to be sent.

    Returns:
        The same ``response`` object.
    """
    # Imported locally so the hook does not depend on a module-level `time`
    # import (none is visible in this file).
    import time

    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=request.endpoint
    ).inc()

    # `start_time` is only present if something set it earlier in the
    # request. Skip the observation instead of raising AttributeError, which
    # would turn a served request into an error.
    started_at = getattr(request, 'start_time', None)
    if started_at is not None:
        REQUEST_DURATION.observe(time.time() - started_at)
    return response

Security Considerations

Rate Limiting

from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

# `app` and `key_func` are passed by keyword because the positional order of
# Limiter's arguments differs between Flask-Limiter major versions (3.x takes
# key_func first). Keywords work on both.
limiter = Limiter(
    key_func=get_remote_address,
    app=app,
    default_limits=["1000 per hour"]
)

@app.route('/api/users')
@limiter.limit("100 per minute")
def get_users():
    """Serve the result of ``fetch_users()`` as JSON.

    This route carries its own limit of "100 per minute".
    """
    users = fetch_users()
    return jsonify(users)

System design is about making informed trade-offs. Understanding these fundamentals helps you build systems that not only work today but can evolve and scale as your requirements grow.