System Design Fundamentals: Building Scalable Applications

System design is the art of building software systems that can handle millions of users while remaining maintainable and cost-effective. Let’s explore the fundamental concepts that form the backbone of scalable applications.

Scalability Principles

Horizontal vs Vertical Scaling

Vertical Scaling (Scale Up)

  • Add more power to existing machines
  • Limited by hardware constraints
  • Single point of failure

Horizontal Scaling (Scale Out)

  • Add more machines to the pool
  • Theoretically unlimited scaling
  • Better fault tolerance
# Example: Horizontal scaling with a simple round-robin load balancer
class LoadBalancer:
    """Round-robin load balancer over a pool of server hostnames.

    Each call to ``get_server`` returns the next server in the pool and
    wraps back to the first server after the last one.
    """

    # Pool used when the caller does not supply one.
    DEFAULT_SERVERS = (
        "server1.example.com",
        "server2.example.com",
        "server3.example.com",
    )

    def __init__(self, servers=None):
        """Create a balancer.

        Args:
            servers: Optional iterable of server hostnames. When omitted,
                the three example servers are used.

        Raises:
            ValueError: If ``servers`` is provided but contains no entries.
        """
        pool = list(self.DEFAULT_SERVERS if servers is None else servers)
        if not pool:
            # An empty pool would make get_server() fail on every call.
            raise ValueError("LoadBalancer requires at least one server")
        self.servers = pool
        self.current = 0

    def get_server(self):
        """Return the next server hostname in round-robin order."""
        server = self.servers[self.current]
        # Advance the cursor, wrapping to 0 after the last server.
        self.current = (self.current + 1) % len(self.servers)
        return server

Database Design Patterns

Master-Slave Replication

-- Master handles writes: data-modifying statements such as this INSERT
-- are sent to the master node.
INSERT INTO users (name, email) VALUES ('John', 'john@example.com');

-- Slaves handle reads (with eventual consistency): a replica may briefly
-- lag behind the master, so a lookup issued right after the INSERT above
-- may not see the new row yet.
SELECT * FROM users WHERE email = 'john@example.com';

Database Sharding

class UserSharding:
    """Route users to one of ``shard_count`` database shards by user id."""

    def __init__(self, shard_count=4):
        """Build the shard map.

        Args:
            shard_count: Number of shards to distribute users across.

        Raises:
            ValueError: If ``shard_count`` is smaller than 1.
        """
        if shard_count < 1:
            raise ValueError("shard_count must be at least 1")
        self.shard_count = shard_count
        # Derive the map from shard_count so that every key get_shard() can
        # compute has an entry (a fixed 4-entry map breaks for shard_count > 4).
        self.shards = {
            index: f"shard{index}.db.example.com"
            for index in range(shard_count)
        }

    def get_shard(self, user_id):
        """Return the shard hostname responsible for ``user_id``.

        String and bytes ids are hashed with CRC-32 so the mapping is the
        same in every process; the built-in ``hash()`` is salted per process
        for ``str``/``bytes`` and would route the same id to different
        shards. Other id types (e.g. ``int``) keep using ``hash()``.
        """
        import zlib

        if isinstance(user_id, str):
            digest = zlib.crc32(user_id.encode("utf-8"))
        elif isinstance(user_id, bytes):
            digest = zlib.crc32(user_id)
        else:
            digest = hash(user_id)
        return self.shards[digest % self.shard_count]

    def get_user(self, user_id):
        """Fetch ``user_id`` from the shard that owns it."""
        shard_url = self.get_shard(user_id)
        # Connect to specific shard and fetch user
        return fetch_from_shard(shard_url, user_id)

Caching Strategies

Cache-Aside Pattern

import redis
import json

class CacheAside:
    """Cache-aside reader for user records.

    Reads consult the cache first; on a miss the record is loaded from the
    database and written to the cache with a time-to-live.
    """

    def __init__(self, redis_client, ttl=3600):
        """Remember the cache client and the TTL applied to stored entries."""
        self.ttl = ttl
        self.redis = redis_client

    def get_user(self, user_id):
        """Return the user for ``user_id``, populating the cache on a miss.

        A falsy database result is returned as-is and is not cached.
        """
        cache_key = f"user:{user_id}"

        # Cache hit: decode the stored JSON and return it directly.
        hit = self.redis.get(cache_key)
        if hit:
            return json.loads(hit)

        # Cache miss: fall back to the database.
        record = fetch_user_from_db(user_id)
        if not record:
            return record

        # Keep the record in the cache for subsequent reads.
        self.redis.setex(cache_key, self.ttl, json.dumps(record))
        return record

Write-Through Cache

class WriteThroughCache:
    """Write-through cache for user records.

    Every update is written to the database first and then to the cache.
    """

    def __init__(self, redis_client=None, ttl=3600):
        """Store the cache client and entry TTL used by ``update_user``.

        Args:
            redis_client: Cache client; ``update_user`` calls its ``setex``
                method. Optional only so that no-argument construction
                keeps working; it must be set before ``update_user`` is
                called.
            ttl: Expiry passed to ``setex`` for cached entries.
        """
        self.redis = redis_client
        self.ttl = ttl

    def update_user(self, user_id, user_data):
        """Persist ``user_data`` for ``user_id`` and refresh its cache entry."""
        # Write to database first
        update_user_in_db(user_id, user_data)

        # Then update cache
        self.redis.setex(
            f"user:{user_id}",
            self.ttl,
            json.dumps(user_data)
        )

Message Queues and Event Streaming

Task Queue with Celery

from celery import Celery

# Celery application named 'tasks'; its message broker is the Redis
# instance at localhost:6379.
app = Celery('tasks', broker='redis://localhost:6379')

@app.task
def send_email(user_id, email_template):
    """Celery task: render ``email_template`` for a user and send it.

    Looks the user up by ``user_id``, renders the template with that user
    as context, and sends the result to the user's email address via SMTP.
    """
    recipient = get_user(user_id)
    body = render_template(email_template, user=recipient)
    send_email_via_smtp(recipient.email, body)

# Usage: .delay() hands the task to the broker so that a Celery worker runs
# it, instead of executing send_email inline in the caller.
send_email.delay(user_id=123, email_template='welcome')

Event Streaming with Kafka

from kafka import KafkaProducer, KafkaConsumer
import json

# Producer
# Configured with the broker at localhost:9092. value_serializer converts
# each value passed to send() to JSON text and then to UTF-8 bytes.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda x: json.dumps(x).encode('utf-8')
)

def publish_user_event(event_type, user_data):
    """Publish a user event to the ``user-events`` Kafka topic.

    The message is a dict with the keys ``event_type``, ``data`` and
    ``timestamp``; ``timestamp`` is the current UTC time in ISO 8601 form
    with an explicit ``+00:00`` offset.

    Args:
        event_type: Identifier of the kind of event being published.
        user_data: Event payload; it must be serialisable by the
            module-level producer's value serializer.
    """
    # Imported locally because the module has no datetime import.
    from datetime import datetime, timezone

    producer.send('user-events', {
        'event_type': event_type,
        'data': user_data,
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        'timestamp': datetime.now(timezone.utc).isoformat()
    })

# Consumer
# Reads the 'user-events' topic from the broker at localhost:9092.
# value_deserializer decodes each raw payload from UTF-8 and parses it as
# JSON, mirroring the producer's serializer.
consumer = KafkaConsumer(
    'user-events',
    bootstrap_servers=['localhost:9092'],
    value_deserializer=lambda m: json.loads(m.decode('utf-8'))
)

# Dispatch every received event to the handler.
# NOTE(review): iterating a KafkaConsumer presumably blocks while waiting
# for new messages, so this loop is not expected to finish — confirm.
for message in consumer:
    # message.value is the payload after value_deserializer has run.
    event = message.value
    handle_user_event(event['event_type'], event['data'])

Monitoring and Observability

Health Check Endpoints

@app.route('/health')
def health_check():
    """Report the health of the service's dependencies.

    Runs the database, Redis and external-API checks and returns a JSON
    body with the overall status, the individual check results and a UTC
    ISO 8601 timestamp.

    Returns:
        A ``(response, status_code)`` pair: 200 when every check result is
        truthy, 503 otherwise.
    """
    # Imported locally because the module has no datetime import.
    from datetime import datetime, timezone

    checks = {
        'database': check_database_connection(),
        'redis': check_redis_connection(),
        'external_api': check_external_api()
    }

    all_healthy = all(checks.values())
    # 503 tells load balancers and orchestrators to stop routing traffic here.
    status_code = 200 if all_healthy else 503

    return jsonify({
        'status': 'healthy' if all_healthy else 'unhealthy',
        'checks': checks,
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        'timestamp': datetime.now(timezone.utc).isoformat()
    }), status_code

Metrics Collection

from prometheus_client import Counter, Histogram, generate_latest

# Counter 'requests_total', labelled by HTTP method and endpoint.
REQUEST_COUNT = Counter('requests_total', 'Total requests', ['method', 'endpoint'])
# Histogram 'request_duration_seconds' of request durations (no labels).
REQUEST_DURATION = Histogram('request_duration_seconds', 'Request duration')

@app.before_request
def before_request():
    """Record the moment handling of the current request started."""
    # Imported locally because the module has no time import.
    import time

    # perf_counter() is monotonic, so the measured duration is not distorted
    # by system clock adjustments the way time.time() differences can be.
    request.start_time = time.perf_counter()

@app.after_request
def after_request(response):
    """Update the request metrics and return ``response`` unchanged."""
    import time

    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=request.endpoint
    ).inc()

    # start_time is missing if before_request() never ran for this request;
    # skip the duration sample rather than fail the response.
    started = getattr(request, 'start_time', None)
    if started is not None:
        REQUEST_DURATION.observe(time.perf_counter() - started)
    return response

Security Considerations

Rate Limiting

from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

# Rate limiter keyed by the client's remote address, with a default limit
# of 1000 requests per hour on every route.
# All arguments are passed by keyword: Flask-Limiter 3.x takes key_func as
# its first positional parameter and app as keyword-only, so the older
# positional form Limiter(app, key_func=...) raises a TypeError there.
limiter = Limiter(
    key_func=get_remote_address,
    app=app,
    default_limits=["1000 per hour"]
)

@app.route('/api/users')
@limiter.limit("100 per minute")
def get_users():
    """Return the fetched users as a JSON response.

    This route is limited to 100 requests per minute.
    """
    users = fetch_users()
    return jsonify(users)

System design is about making informed trade-offs. Understanding these fundamentals helps you build systems that not only work today but can evolve and scale as your requirements grow.