System Design Fundamentals: Building Scalable Applications
System design is the art of building software systems that can handle millions of users while remaining maintainable and cost-effective. Let’s explore the fundamental concepts that form the backbone of scalable applications.
Scalability Principles
Horizontal vs Vertical Scaling
Vertical Scaling (Scale Up)
- Add more power to existing machines
- Limited by hardware constraints
- Single point of failure
Horizontal Scaling (Scale Out)
- Add more machines to the pool
- Theoretically unlimited scaling
- Better fault tolerance
# Example: Horizontal scaling with load balancer
class LoadBalancer:
    """Round-robin load balancer over a pool of server hostnames.

    Each call to ``get_server`` hands out the next server in the pool and
    wraps back to the first one after the last.
    """

    # Pool used when the caller does not supply one; keeps ``LoadBalancer()``
    # behaving exactly as before.
    DEFAULT_SERVERS = (
        "server1.example.com",
        "server2.example.com",
        "server3.example.com",
    )

    def __init__(self, servers=None):
        """Create a balancer.

        Args:
            servers: Optional iterable of server hostnames. When omitted,
                the three example hosts in ``DEFAULT_SERVERS`` are used.

        Raises:
            ValueError: If ``servers`` is supplied but contains no entries.
        """
        # Copy into a fresh list so the pool is independent of the caller's
        # iterable and of the class-level default tuple.
        self.servers = list(
            self.DEFAULT_SERVERS if servers is None else servers
        )
        if not self.servers:
            raise ValueError("LoadBalancer requires at least one server")
        # Index of the server that the next get_server() call will return.
        self.current = 0

    def get_server(self):
        """Return the next server in round-robin order."""
        server = self.servers[self.current]
        # Advance and wrap so the pool is cycled indefinitely.
        self.current = (self.current + 1) % len(self.servers)
        return server
Database Design Patterns
Master-Slave Replication
-- Master handles writes
-- (this INSERT is the example of a statement routed to the master)
INSERT INTO users (name, email) VALUES ('John', 'john@example.com');
-- Slaves handle reads (with eventual consistency)
-- NOTE: because reads are only eventually consistent, a SELECT issued right
-- after the INSERT above may not return the new row yet.
SELECT * FROM users WHERE email = 'john@example.com';
Database Sharding
class UserSharding:
    """Route user IDs to one of ``shard_count`` database shards."""

    def __init__(self, shard_count=4):
        """Build the shard lookup table.

        Args:
            shard_count: Number of shards to distribute users over.

        Raises:
            ValueError: If ``shard_count`` is less than 1.
        """
        if shard_count < 1:
            raise ValueError("shard_count must be at least 1")
        self.shard_count = shard_count
        # Derive the table from shard_count so every key get_shard() can
        # compute has an entry; a fixed four-entry table raised KeyError as
        # soon as shard_count exceeded 4.
        self.shards = {
            index: f"shard{index}.db.example.com"
            for index in range(shard_count)
        }

    def get_shard(self, user_id):
        """Return the shard hostname responsible for ``user_id``.

        Integer IDs are mapped with a plain modulo. Any other ID is
        converted with ``str()`` and hashed with CRC-32, which gives the
        same result in every process. The built-in ``hash()`` is salted per
        process for strings, so it would send the same user to different
        shards on different workers.
        """
        # Local import: this snippet has no import section of its own.
        import zlib

        if isinstance(user_id, int):
            shard_key = user_id % self.shard_count
        else:
            digest = zlib.crc32(str(user_id).encode("utf-8"))
            shard_key = digest % self.shard_count
        return self.shards[shard_key]

    def get_user(self, user_id):
        """Fetch ``user_id`` from the shard selected by ``get_shard``."""
        shard_url = self.get_shard(user_id)
        # Connect to the specific shard and fetch the user.
        return fetch_from_shard(shard_url, user_id)
Caching Strategies
Cache-Aside Pattern
import redis
import json
class CacheAside:
    """Cache-aside reader: consult the cache first, fall back to the database.

    Values are stored in the cache as JSON under the key ``user:<user_id>``.
    """

    def __init__(self, redis_client, ttl=3600):
        """Remember the cache client and the TTL passed to ``setex``."""
        self.redis = redis_client
        self.ttl = ttl

    def get_user(self, user_id):
        """Return the user for ``user_id``, populating the cache on a miss.

        A truthy cached value is decoded from JSON and returned directly.
        Otherwise the user is loaded via ``fetch_user_from_db``; a truthy
        result is written to the cache before being returned, while a falsy
        result is returned without being cached.
        """
        cache_key = f"user:{user_id}"

        # Fast path: cache hit.
        hit = self.redis.get(cache_key)
        if hit:
            return json.loads(hit)

        # Cache miss: go to the database.
        record = fetch_user_from_db(user_id)
        if not record:
            return record

        # Store in the cache for subsequent reads.
        self.redis.setex(cache_key, self.ttl, json.dumps(record))
        return record
Write-Through Cache
class WriteThroughCache:
    """Write-through cache: every update goes to the database, then the cache.

    Values are stored in the cache as JSON under the key ``user:<user_id>``.
    """

    def __init__(self, redis_client=None, ttl=3600):
        """Store the cache client and TTL used by ``update_user``.

        Without this constructor ``self.redis`` and ``self.ttl`` were never
        assigned, so ``update_user`` failed with ``AttributeError``.

        Args:
            redis_client: Cache client exposing ``setex(key, ttl, value)``.
                Defaults to ``None`` only so that existing zero-argument
                construction keeps working; a client must be set before
                ``update_user`` is called.
            ttl: Expiry passed to ``setex`` for each cached entry.
        """
        self.redis = redis_client
        self.ttl = ttl

    def update_user(self, user_id, user_data):
        """Persist ``user_data`` for ``user_id`` and refresh its cache entry."""
        # Write to the database first so the cache never holds data the
        # database has not accepted.
        update_user_in_db(user_id, user_data)
        # Then update the cache.
        self.redis.setex(
            f"user:{user_id}",
            self.ttl,
            json.dumps(user_data),
        )
Message Queues and Event Streaming
Task Queue with Celery
from celery import Celery
# Celery application named 'tasks', using the Redis instance at
# localhost:6379 as its message broker.
app = Celery('tasks', broker='redis://localhost:6379')
@app.task
def send_email(user_id, email_template):
    """Celery task: render ``email_template`` for a user and send it by SMTP.

    Looks the user up with ``get_user``, renders the template with that user
    as context, and delivers the result to the user's ``email`` address.
    """
    recipient = get_user(user_id)
    body = render_template(email_template, user=recipient)
    send_email_via_smtp(recipient.email, body)
# Usage: call the task through .delay() so it is handed to the task queue
# rather than invoked as a plain function call.
send_email.delay(user_id=123, email_template='welcome')
Event Streaming with Kafka
from kafka import KafkaProducer, KafkaConsumer
import json
# Producer: connects to the broker at localhost:9092 and serializes every
# message value as UTF-8 encoded JSON.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda payload: json.dumps(payload).encode('utf-8'),
)
def publish_user_event(event_type, user_data):
    """Publish a user event to the ``user-events`` topic.

    The message is a dict with the keys ``event_type``, ``data`` and
    ``timestamp``. The timestamp is the current UTC time in ISO 8601 form
    and carries an explicit ``+00:00`` offset.

    Args:
        event_type: Identifier of the event being published.
        user_data: Payload for the event; it must be serializable by the
            producer's value serializer.
    """
    # Imported here because nothing in this file imports datetime, so the
    # previous bare ``datetime.utcnow()`` raised NameError. ``utcnow()`` is
    # also deprecated and returns a naive value; use an aware UTC time.
    from datetime import datetime, timezone

    producer.send('user-events', {
        'event_type': event_type,
        'data': user_data,
        'timestamp': datetime.now(timezone.utc).isoformat(),
    })
# Consumer: subscribes to 'user-events' on localhost:9092 and decodes every
# message value from UTF-8 JSON.
consumer = KafkaConsumer(
    'user-events',
    bootstrap_servers=['localhost:9092'],
    value_deserializer=lambda raw: json.loads(raw.decode('utf-8')),
)

# Dispatch each received event to the handler, passing its type and payload.
for message in consumer:
    event = message.value
    handle_user_event(
        event['event_type'],
        event['data'],
    )
Monitoring and Observability
Health Check Endpoints
@app.route('/health')
def health_check():
    """Report the health of the service's dependencies as JSON.

    Runs the database, Redis and external-API checks and returns a
    ``(response, status_code)`` pair: HTTP 200 with status ``healthy`` when
    every check is truthy, otherwise HTTP 503 with status ``unhealthy``.
    The body also carries the individual check results and a UTC timestamp
    in ISO 8601 form with an explicit ``+00:00`` offset.
    """
    # Imported here because nothing in this file imports datetime, so the
    # previous bare ``datetime.utcnow()`` raised NameError. ``utcnow()`` is
    # also deprecated and returns a naive value; use an aware UTC time.
    from datetime import datetime, timezone

    checks = {
        'database': check_database_connection(),
        'redis': check_redis_connection(),
        'external_api': check_external_api(),
    }
    all_healthy = all(checks.values())
    # 503 signals to callers that the service should not receive traffic.
    status_code = 200 if all_healthy else 503
    return jsonify({
        'status': 'healthy' if all_healthy else 'unhealthy',
        'checks': checks,
        'timestamp': datetime.now(timezone.utc).isoformat(),
    }), status_code
Metrics Collection
from prometheus_client import Counter, Histogram, generate_latest
# Counter of handled requests, labelled by HTTP method and endpoint.
REQUEST_COUNT = Counter('requests_total', 'Total requests', ['method', 'endpoint'])
# Histogram of per-request handling time (unlabelled).
REQUEST_DURATION = Histogram('request_duration_seconds', 'Request duration')
@app.before_request
def before_request():
    """Record the moment handling of the current request started."""
    # Local import: nothing in this file imports ``time``.
    import time

    # perf_counter() is monotonic, so the measured duration cannot go
    # negative or jump when the system clock is adjusted. The stored value
    # is only meaningful as a difference, not as a wall-clock timestamp.
    request.start_time = time.perf_counter()


@app.after_request
def after_request(response):
    """Update request metrics and pass ``response`` through unchanged.

    Increments ``REQUEST_COUNT`` for the request's method and endpoint, and
    records the elapsed handling time in ``REQUEST_DURATION`` when a start
    time was captured for this request.
    """
    # Local import: nothing in this file imports ``time``.
    import time

    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=request.endpoint,
    ).inc()

    # The start time may be missing if the before-request hook did not run
    # for this request (for example, an earlier hook already produced the
    # response); skip the observation instead of raising AttributeError.
    started = getattr(request, 'start_time', None)
    if started is not None:
        REQUEST_DURATION.observe(time.perf_counter() - started)
    return response
Security Considerations
Rate Limiting
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
# Rate limiter keyed on the client's remote address, with a default limit of
# 1000 requests per hour for every route.
# All arguments are passed by keyword: the position of ``app`` and
# ``key_func`` differs between Flask-Limiter major versions, so passing
# ``app`` positionally fails on releases whose first parameter is ``key_func``.
limiter = Limiter(
    key_func=get_remote_address,
    app=app,
    default_limits=["1000 per hour"],
)
@app.route('/api/users')
@limiter.limit("100 per minute")
def get_users():
    """Return the result of ``fetch_users`` as a JSON response.

    The route carries its own limit of 100 requests per minute.
    """
    users = fetch_users()
    return jsonify(users)
System design is about making informed trade-offs. Understanding these fundamentals helps you build systems that not only work today but can evolve and scale as your requirements grow.