Chapter 10 Production Environment Deployment and Operations
Learning Objectives
- Master production environment containerized deployment strategies
- Learn to implement high availability and load balancing
- Understand container orchestration and cluster management
- Master production environment monitoring and troubleshooting
- Learn containerized application performance optimization
Knowledge Point Details
10.1 Production Environment Architecture Design
10.1.1 Architecture Pattern Selection
Single Server Deployment Mode:
# docker-compose.prod.yml
version: '3.8'
services:
web:
image: myapp:latest
    # no published ports: Nginx proxies to web:3000 over the compose network
environment:
- NODE_ENV=production
deploy:
resources:
limits:
memory: 512M
cpus: '1.0'
restart: unless-stopped
nginx:
image: nginx:alpine
ports:
- "443:443"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
- ./ssl:/etc/ssl:ro
depends_on:
- web
restart: unless-stopped
Cluster Deployment Mode:
# docker-stack.yml
version: '3.8'
services:
web:
image: myapp:latest
networks:
- web-network
deploy:
replicas: 3
placement:
constraints:
- node.role == worker
resources:
limits:
memory: 256M
cpus: '0.5'
reservations:
memory: 128M
cpus: '0.25'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
update_config:
parallelism: 1
delay: 10s
failure_action: rollback
monitor: 60s
order: start-first
lb:
image: nginx:alpine
ports:
- "80:80"
- "443:443"
networks:
- web-network
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
deploy:
replicas: 2
placement:
constraints:
- node.role == manager
networks:
web-network:
driver: overlay
attachable: true
10.1.2 Resource Planning
Server Resource Configuration:
#!/bin/bash
# resource-planning.sh
# Check system resources
echo "=== System Resource Check ==="
echo "CPU cores: $(nproc)"
echo "Memory: $(free -h | grep '^Mem:' | awk '{print $2}')"
echo "Disk space: $(df -h / | tail -1 | awk '{print $4}')"
echo "Docker version: $(docker --version)"
# Calculate resource allocation
TOTAL_MEMORY=$(free -m | grep '^Mem:' | awk '{print $2}')
TOTAL_CPUS=$(nproc)
# Reserve resources for the system: 20% of memory and one CPU core
SYSTEM_MEMORY=$((TOTAL_MEMORY * 20 / 100))
# Available resources
AVAILABLE_MEMORY=$((TOTAL_MEMORY - SYSTEM_MEMORY))
AVAILABLE_CPUS=$((TOTAL_CPUS - 1))
echo "Available for containers:"
echo "Memory: ${AVAILABLE_MEMORY}MB"
echo "CPUs: ${AVAILABLE_CPUS}"
# Generate resource limit configuration
cat > resource-limits.yml << EOF
version: '3.8'
services:
web:
deploy:
resources:
limits:
memory: ${AVAILABLE_MEMORY}m
cpus: '${AVAILABLE_CPUS}'
reservations:
memory: $((AVAILABLE_MEMORY / 2))m
cpus: '0.25'
EOF
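Before the generated override is used, it can be validated by merging it with the main compose file; a minimal sketch, assuming Docker Compose v2 is installed:
# Merge and validate the generated override against the main compose file
docker compose -f docker-compose.prod.yml -f resource-limits.yml config --quiet \
  && echo "resource-limits.yml is valid" \
  || echo "resource-limits.yml failed validation"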
10.2 High Availability Design
10.2.1 Multi-Replica Deployment
Docker Swarm High Availability Configuration:
# Initialize Swarm cluster
docker swarm init --advertise-addr 192.168.1.100
# Print the join command (with token) for adding a manager node
docker swarm join-token manager
# Print the join command (with token) for adding a worker node
docker swarm join-token worker
# Check cluster status
docker node ls
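Day-to-day cluster maintenance builds on the same commands; a minimal sketch (node names are illustrative):
# Drain a worker before maintenance: its tasks are rescheduled elsewhere
docker node update --availability drain worker-node-1
# ...perform maintenance, then hand the node back to the scheduler
docker node update --availability active worker-node-1
# Promote a worker so the manager count stays odd (Raft quorum)
docker node promote worker-node-2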
Highly Available Service Deployment:
#!/bin/bash
# ha-deploy.sh
# Create overlay network
docker network create -d overlay --attachable ha-network
# Deploy Web service (3 replicas)
docker service create \
--name web-service \
--replicas 3 \
--network ha-network \
--constraint 'node.role==worker' \
--update-parallelism 1 \
--update-delay 10s \
--rollback-parallelism 1 \
--rollback-delay 10s \
--restart-condition on-failure \
--restart-max-attempts 3 \
--limit-memory 256m \
--limit-cpu 0.5 \
myapp:latest
# Deploy load balancer
docker service create \
--name lb-service \
--replicas 2 \
--network ha-network \
--publish 80:80 \
--publish 443:443 \
--constraint 'node.role==manager' \
--mount type=bind,source=$(pwd)/nginx.conf,target=/etc/nginx/nginx.conf,readonly \
nginx:alpine
# Deploy database (primary/replica replication)
# Note: the MYSQL_REPLICATION_* variables below are implemented by the
# Bitnami MySQL image, not by the official mysql image
docker service create \
  --name db-master \
  --replicas 1 \
  --network ha-network \
  --constraint 'node.labels.db-role==master' \
  --mount type=volume,source=db-master-data,target=/bitnami/mysql/data \
  --env MYSQL_ROOT_PASSWORD=rootpassword \
  --env MYSQL_REPLICATION_MODE=master \
  --env MYSQL_REPLICATION_USER=replicator \
  --env MYSQL_REPLICATION_PASSWORD=replicatorpassword \
  bitnami/mysql:8.0
docker service create \
  --name db-slave \
  --replicas 2 \
  --network ha-network \
  --constraint 'node.labels.db-role==slave' \
  --env MYSQL_ROOT_PASSWORD=rootpassword \
  --env MYSQL_REPLICATION_MODE=slave \
  --env MYSQL_REPLICATION_USER=replicator \
  --env MYSQL_REPLICATION_PASSWORD=replicatorpassword \
  --env MYSQL_MASTER_HOST=db-master \
  --env MYSQL_MASTER_ROOT_PASSWORD=rootpassword \
  bitnami/mysql:8.0
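Once the script has run, the state of the services is worth verifying; for example:
# List all services with their replica counts
docker service ls
# Show where the web tasks landed and their current state
docker service ps web-service
# Tail the logs of one service
docker service logs --tail 50 web-service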
10.2.2 Load Balancing Configuration
Nginx Load Balancing:
# nginx.conf
# In Swarm, a service name resolves to a single VIP that already balances
# across all replicas, so each upstream needs only one entry; resolve
# tasks.<service> instead if you want one entry per task.
upstream web_backend {
    server web-service:3000 max_fails=3 fail_timeout=30s;
}
upstream api_backend {
    server api-service:8080 max_fails=3 fail_timeout=30s;
}
server {
listen 80;
server_name example.com;
# Redirect to HTTPS
return 301 https://$server_name$request_uri;
}
server {
listen 443 ssl http2;
server_name example.com;
ssl_certificate /etc/ssl/cert.pem;
ssl_certificate_key /etc/ssl/private.key;
# SSL configuration
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE+AESGCM:ECDHE+CHACHA20:DHE+AESGCM:DHE+CHACHA20:!aNULL:!MD5:!DSS;
ssl_prefer_server_ciphers off;
# Health check
location /health {
access_log off;
proxy_pass http://web_backend/health;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Web application
location / {
proxy_pass http://web_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# Connection timeout settings
proxy_connect_timeout 30s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
# Buffer settings
proxy_buffering on;
proxy_buffer_size 4k;
proxy_buffers 8 4k;
}
# API routing
location /api/ {
proxy_pass http://api_backend/;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Static files
location /static/ {
expires 1y;
add_header Cache-Control "public, immutable";
try_files $uri @fallback;
}
location @fallback {
proxy_pass http://web_backend;
}
}
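Before this file reaches the load balancer, it can be syntax-checked in a throwaway container; a minimal sketch, assuming the certificate files exist under ./ssl:
# Test the configuration without touching the running instance
docker run --rm \
  -v $(pwd)/nginx.conf:/etc/nginx/nginx.conf:ro \
  -v $(pwd)/ssl:/etc/ssl:ro \
  nginx:alpine nginx -t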
HAProxy Configuration:
# haproxy.cfg
global
daemon
maxconn 4096
log stdout local0 info
defaults
mode http
timeout connect 5000ms
timeout client 50000ms
timeout server 50000ms
option httplog
option dontlognull
option redispatch
retries 3
frontend web_frontend
bind *:80
bind *:443 ssl crt /etc/ssl/certs/example.com.pem
redirect scheme https if !{ ssl_fc }
# ACL definitions
acl is_api path_beg /api/
acl is_admin path_beg /admin/
# Routing rules
use_backend api_backend if is_api
use_backend admin_backend if is_admin
default_backend web_backend
backend web_backend
    balance roundrobin
    option httpchk GET /health
    # web-service resolves to the Swarm VIP, which already spreads load
    # across replicas, so a single server entry is sufficient
    server web1 web-service:3000 check inter 3000 fall 3 rise 2
backend api_backend
    balance leastconn
    option httpchk GET /api/health
    server api1 api-service:8080 check inter 3000 fall 3 rise 2
backend admin_backend
    balance source
    option httpchk GET /admin/health
    server admin1 admin-service:9000 check inter 3000 fall 3 rise 2
listen stats
bind *:8404
stats enable
stats uri /stats
stats refresh 30s
stats admin if TRUE
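HAProxy supports the same kind of offline validation via its check mode; a minimal sketch, assuming the certificate bundle exists under ./ssl:
# Parse and validate haproxy.cfg (-c) without starting the proxy
docker run --rm \
  -v $(pwd)/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro \
  -v $(pwd)/ssl/example.com.pem:/etc/ssl/certs/example.com.pem:ro \
  haproxy:2.6-alpine haproxy -c -f /usr/local/etc/haproxy/haproxy.cfg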
10.3 Data Persistence and Backup
10.3.1 Data Volume Management
Production Environment Data Volume Configuration:
# docker-compose.prod.yml
version: '3.8'
services:
database:
image: postgres:14
volumes:
- db_data:/var/lib/postgresql/data
- db_backup:/backup
- ./init.sql:/docker-entrypoint-initdb.d/init.sql:ro
environment:
POSTGRES_DB: myapp
POSTGRES_USER: myuser
POSTGRES_PASSWORD_FILE: /run/secrets/db_password
secrets:
- db_password
networks:
- db_network
deploy:
placement:
constraints:
- node.labels.storage == ssd
resources:
limits:
memory: 1G
cpus: '1.0'
reservations:
memory: 512M
cpus: '0.5'
redis:
image: redis:7-alpine
volumes:
- redis_data:/data
- ./redis.conf:/etc/redis/redis.conf:ro
command: redis-server /etc/redis/redis.conf
networks:
- db_network
deploy:
resources:
limits:
memory: 256M
cpus: '0.5'
volumes:
db_data:
driver: local
driver_opts:
type: none
o: bind
device: /opt/docker/data/postgres
redis_data:
driver: local
driver_opts:
type: none
o: bind
device: /opt/docker/data/redis
db_backup:
driver: local
driver_opts:
type: none
o: bind
device: /opt/docker/backup
secrets:
db_password:
external: true
networks:
db_network:
driver: overlay
internal: true
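This stack expects the bind-mount directories and the external secret to exist before deployment; a minimal preparation sketch (the password is a placeholder):
# Create the host directories that back the named volumes
sudo mkdir -p /opt/docker/data/{postgres,redis} /opt/docker/backup
# Create the external secret referenced by the stack (printf avoids a trailing newline)
printf '%s' 'change-me' | docker secret create db_password -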
10.3.2 Automated Backup Strategy
Database Backup Script:
#!/bin/bash
# backup-database.sh
DB_CONTAINER="myapp_database_1"
DB_USER="myuser"
DB_NAME="myapp"
BACKUP_DIR="/opt/docker/backup"
DATE=$(date +"%Y%m%d_%H%M%S")
BACKUP_FILE="backup_${DB_NAME}_${DATE}.sql"
# Create backup directory
mkdir -p $BACKUP_DIR
# Database backup (the original $? check ran after find, so it never
# reflected the dump's status; test the dump directly instead)
echo "Starting database backup..."
if docker exec $DB_CONTAINER pg_dump -U $DB_USER $DB_NAME > $BACKUP_DIR/$BACKUP_FILE; then
    # Compress backup file
    gzip $BACKUP_DIR/$BACKUP_FILE
    # Upload to remote storage
    echo "Uploading backup to cloud storage..."
    aws s3 cp $BACKUP_DIR/${BACKUP_FILE}.gz s3://myapp-backups/database/
    # Clean up old local backups (keep 7 days)
    find $BACKUP_DIR -name "backup_${DB_NAME}_*.sql.gz" -mtime +7 -delete
    # Log backup status
    echo "$(date): Backup completed successfully - ${BACKUP_FILE}.gz" >> /var/log/backup.log
else
    echo "$(date): Backup failed" >> /var/log/backup.log
    # Send alert email
    echo "Database backup failed on $(hostname)" | mail -s "Backup Alert" admin@example.com
fi
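A backup strategy is only complete with a tested restore path; a minimal restore sketch for the dump produced above (the file name is illustrative):
#!/bin/bash
# restore-database.sh
BACKUP_FILE="backup_myapp_20240101_020000.sql.gz"
# Stream the decompressed dump into psql inside the database container
gunzip -c /opt/docker/backup/$BACKUP_FILE | \
  docker exec -i myapp_database_1 psql -U myuser -d myapp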
Complete Backup Solution:
#!/bin/bash
# full-backup.sh
BACKUP_BASE="/opt/docker/backup"
DATE=$(date +"%Y%m%d_%H%M%S")
RETENTION_DAYS=30
# Create backup directory
mkdir -p $BACKUP_BASE/{database,volumes,configs}
# 1. Database backup
echo "Backing up databases..."
docker exec postgres_container pg_dumpall -U postgres | gzip > $BACKUP_BASE/database/full_dump_$DATE.sql.gz
# 2. Data volume backup
echo "Backing up volumes..."
docker run --rm \
-v myapp_db_data:/source:ro \
-v $BACKUP_BASE/volumes:/backup \
alpine tar czf /backup/db_data_$DATE.tar.gz -C /source .
docker run --rm \
-v myapp_redis_data:/source:ro \
-v $BACKUP_BASE/volumes:/backup \
alpine tar czf /backup/redis_data_$DATE.tar.gz -C /source .
# 3. Configuration file backup
echo "Backing up configurations..."
tar czf $BACKUP_BASE/configs/configs_$DATE.tar.gz \
/opt/docker/compose \
/opt/docker/nginx \
/opt/docker/ssl
# 4. Upload to cloud storage
echo "Uploading to cloud storage..."
aws s3 sync $BACKUP_BASE s3://myapp-backups/$(date +"%Y/%m/%d")/
# 5. Clean up old backups
echo "Cleaning up old backups..."
find $BACKUP_BASE -type f -mtime +$RETENTION_DAYS -delete
# 6. Verify backup integrity
echo "Verifying backup integrity..."
BACKUP_SIZE=$(du -s $BACKUP_BASE | cut -f1)  # size in 1 KB blocks
if [ "$BACKUP_SIZE" -lt 1000 ]; then  # warn if the tree is smaller than ~1 MB
echo "Warning: Backup size seems too small" | mail -s "Backup Warning" admin@example.com
fi
echo "Backup completed at $(date)"
Crontab Scheduled Backup:
# Edit crontab
crontab -e
# Add scheduled tasks
# Run full backup daily at 2 AM
0 2 * * * /opt/scripts/full-backup.sh >> /var/log/backup.log 2>&1
# Run incremental backup every 6 hours
0 */6 * * * /opt/scripts/incremental-backup.sh >> /var/log/backup.log 2>&1
# Run data integrity check every Sunday
0 3 * * 0 /opt/scripts/backup-verify.sh >> /var/log/backup-verify.log 2>&1
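The crontab references an incremental backup script that is not shown above; one possible minimal sketch uses rsync hard-link snapshots (paths are assumptions):
#!/bin/bash
# incremental-backup.sh - hard-link snapshot of the Docker data directory
SRC="/opt/docker/data"
DEST_BASE="/opt/docker/backup/incremental"
DATE=$(date +"%Y%m%d_%H%M%S")
mkdir -p "$DEST_BASE"
# Most recent previous snapshot, if any
LATEST=$(ls -1d "$DEST_BASE"/*/ 2>/dev/null | tail -1)
# --link-dest hard-links files unchanged since the last snapshot
rsync -a ${LATEST:+--link-dest="$LATEST"} "$SRC/" "$DEST_BASE/$DATE/"
echo "$(date): incremental snapshot $DATE completed"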
10.4 Monitoring and Alerting
10.4.1 Comprehensive Monitoring Solution
Prometheus + Grafana Monitoring Stack:
# monitoring-stack.yml
version: '3.8'
services:
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./rules:/etc/prometheus/rules:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
networks:
- monitoring
deploy:
resources:
limits:
memory: 512M
cpus: '1.0'
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning:ro
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin123  # example only; prefer Docker secrets in production
- GF_USERS_ALLOW_SIGN_UP=false
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
networks:
- monitoring
depends_on:
- prometheus
alertmanager:
image: prom/alertmanager:latest
ports:
- "9093:9093"
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- alertmanager_data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://localhost:9093'
- '--web.route-prefix=/'
networks:
- monitoring
node-exporter:
image: prom/node-exporter:latest
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- monitoring
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
ports:
- "8080:8080"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
devices:
- /dev/kmsg:/dev/kmsg
networks:
- monitoring
volumes:
prometheus_data:
grafana_data:
alertmanager_data:
networks:
monitoring:
driver: overlay
attachable: true
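The monitoring stack deploys like any other stack file; for example:
# Deploy onto the Swarm and confirm the services come up
docker stack deploy -c monitoring-stack.yml monitoring
docker stack services monitoring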
Prometheus Configuration:
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "rules/*.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'node-exporter'
static_configs:
- targets: ['node-exporter:9100']
- job_name: 'cadvisor'
static_configs:
- targets: ['cadvisor:8080']
- job_name: 'docker'
static_configs:
- targets: ['host.docker.internal:9323']
  - job_name: 'app'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        port: 3000
    relabel_configs:
      # Scrape only containers carrying the label monitoring=true
      # (the original relabel rewrote __address__ to the literal "true:3000")
      - source_labels: [__meta_docker_container_label_monitoring]
        regex: "true"
        action: keep
      - source_labels: [__meta_docker_container_name]
        target_label: container
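With the docker service-discovery job above, an application container opts in to scraping through its label; a minimal sketch:
# Start a container that the 'app' job will discover and scrape on port 3000
docker run -d --name web-1 --label monitoring=true myapp:latest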
10.4.2 Alert Rules Configuration
Production Environment Alert Rules:
# rules/production.yml
groups:
- name: production.rules
rules:
# Service availability alert
- alert: ServiceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Service {{ $labels.job }} is down"
description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute."
# High CPU usage
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage detected"
description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
# High memory usage
- alert: HighMemoryUsage
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "High memory usage detected"
description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"
# Low disk space
- alert: DiskSpaceLow
expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
for: 5m
labels:
severity: critical
annotations:
summary: "Disk space is running low"
description: "Disk usage is {{ $value }}% on {{ $labels.instance }}"
# Frequent container restarts
- alert: ContainerRestartingOften
expr: changes(container_start_time_seconds{name!=""}[1h]) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "Container restarting frequently"
description: "Container {{ $labels.name }} has restarted {{ $value }} times in the last hour"
# High HTTP response time
- alert: HighResponseTime
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "High HTTP response time"
description: "95th percentile response time is {{ $value }}s"
# High HTTP error rate
- alert: HighErrorRate
expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5
for: 5m
labels:
severity: critical
annotations:
summary: "High HTTP error rate"
description: "HTTP error rate is {{ $value }}%"
# Database connection failure
- alert: DatabaseConnectionFailed
expr: mysql_up == 0 or pg_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Database connection failed"
description: "Cannot connect to database {{ $labels.instance }}"
Alert Manager Configuration:
# alertmanager.yml
global:
smtp_smarthost: 'smtp.gmail.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: 'app-password'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'web.hook'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
- match:
severity: warning
receiver: 'warning-alerts'
receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://webhook-service:5000/alerts'
- name: 'critical-alerts'
email_configs:
- to: 'admin@example.com'
subject: 'Critical Alert: {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Labels: {{ .Labels }}
{{ end }}
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#alerts'
title: 'Critical Alert'
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
- name: 'warning-alerts'
email_configs:
- to: 'team@example.com'
subject: 'Warning Alert: {{ .GroupLabels.alertname }}'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
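Routing and receivers can be exercised end-to-end by posting a synthetic alert to the Alertmanager API; a minimal sketch (label values are illustrative):
# Fire a fake critical alert to verify routing and notification delivery
curl -s -X POST http://localhost:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert","severity":"critical"},"annotations":{"summary":"routing test"}}]'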
10.5 Log Management
10.5.1 Centralized Log Collection
ELK Stack Logging Solution:
# logging-stack.yml
version: '3.8'
services:
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:8.5.0
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
volumes:
- es_data:/usr/share/elasticsearch/data
ports:
- "9200:9200"
networks:
- logging
logstash:
image: docker.elastic.co/logstash/logstash:8.5.0
volumes:
- ./logstash/config:/usr/share/logstash/pipeline:ro
- ./logstash/patterns:/opt/logstash/patterns:ro
ports:
- "5044:5044"
- "12201:12201/udp"
depends_on:
- elasticsearch
networks:
- logging
environment:
LS_JAVA_OPTS: "-Xmx256m -Xms256m"
kibana:
image: docker.elastic.co/kibana/kibana:8.5.0
ports:
- "5601:5601"
environment:
ELASTICSEARCH_HOSTS: http://elasticsearch:9200
depends_on:
- elasticsearch
networks:
- logging
filebeat:
image: docker.elastic.co/beats/filebeat:8.5.0
volumes:
- ./filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
depends_on:
- logstash
networks:
- logging
user: root
volumes:
es_data:
networks:
logging:
driver: overlay
attachable: true
Filebeat Configuration:
# filebeat/filebeat.yml
filebeat.inputs:
- type: container
paths:
- '/var/lib/docker/containers/*/*.log'
processors:
- add_docker_metadata:
host: "unix:///var/run/docker.sock"
- decode_json_fields:
fields: ["message"]
target: ""
overwrite_keys: true
output.logstash:
hosts: ["logstash:5044"]
logging.level: info
logging.to_files: true
logging.files:
path: /var/log/filebeat
name: filebeat
keepfiles: 7
permissions: 0644
Logstash Configuration:
# logstash/config/logstash.conf
input {
beats {
port => 5044
}
gelf {
port => 12201
}
}
filter {
if [container][name] {
mutate {
add_field => { "service_name" => "%{[container][name]}" }
}
}
# Parse Nginx logs
if [service_name] =~ "nginx" {
grok {
match => { "message" => "%{COMBINEDAPACHELOG}" }
}
date {
match => [ "timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ]
}
mutate {
convert => { "response" => "integer" }
convert => { "bytes" => "integer" }
}
}
# Parse application logs
if [service_name] =~ "web" {
json {
source => "message"
}
date {
match => [ "timestamp", "ISO8601" ]
}
}
# Add environment label
mutate {
add_field => { "environment" => "production" }
}
}
output {
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "logs-%{+YYYY.MM.dd}"
}
# Store error logs separately
if [level] == "ERROR" {
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "errors-%{+YYYY.MM.dd}"
}
}
}
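Once logs are flowing, the pipeline can be verified against Elasticsearch directly; for example:
# List the daily indices created by the Logstash output
curl -s 'http://localhost:9200/_cat/indices/logs-*?v'
# Fetch the newest document to confirm fields are parsed as expected
curl -s 'http://localhost:9200/logs-*/_search?size=1&sort=@timestamp:desc'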
10.5.2 Application Logging Best Practices
Structured Log Output:
// Node.js application logging configuration
const winston = require('winston');
const logger = winston.createLogger({
level: process.env.LOG_LEVEL || 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
),
defaultMeta: {
service: 'web-app',
version: process.env.APP_VERSION,
environment: process.env.NODE_ENV
},
transports: [
new winston.transports.Console({
format: process.env.NODE_ENV === 'development'
? winston.format.combine(
winston.format.colorize(),
winston.format.simple()
)
: winston.format.json()
})
]
});
// Usage example
logger.info('User login successful', {
userId: '12345',
ip: req.ip,
userAgent: req.get('User-Agent')
});
logger.error('Database connection failed', {
error: error.message,
stack: error.stack,
query: sqlQuery
});
Docker Logging Driver Configuration:
# docker-compose.yml
version: '3.8'
services:
web:
image: myapp:latest
logging:
driver: "fluentd"
options:
fluentd-address: localhost:24224
tag: myapp.web
labels: "environment,service"
labels:
- "environment=production"
- "service=web"
api:
image: myapi:latest
logging:
driver: "gelf"
options:
gelf-address: "udp://localhost:12201"
tag: "{{.ImageName}}/{{.Name}}/{{.ID}}"
database:
image: postgres:14
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
labels: "environment,service"
10.6 Performance Optimization
10.6.1 Container Performance Tuning
Resource Limit Optimization:
#!/bin/bash
# container-tuning.sh
# Analyze container resource usage
echo "=== Container Resource Analysis ==="
docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}"
# Get detailed container resource information
for container in $(docker ps -q); do
echo "Container: $(docker ps --format '{{.Names}}' --filter id=$container)"
echo "Memory limit: $(docker inspect $container | jq '.[0].HostConfig.Memory')"
echo "CPU limit: $(docker inspect $container | jq '.[0].HostConfig.CpuQuota')"
echo "---"
done
# Optimization recommendation function (requires the bc calculator)
optimize_container() {
local container_name=$1
local cpu_usage=$(docker stats --no-stream --format "{{.CPUPerc}}" $container_name | sed 's/%//')
local mem_usage=$(docker stats --no-stream --format "{{.MemPerc}}" $container_name | sed 's/%//')
echo "Container: $container_name"
echo "CPU Usage: $cpu_usage%"
echo "Memory Usage: $mem_usage%"
if (( $(echo "$cpu_usage > 80" | bc -l) )); then
echo "High CPU usage detected. Consider:"
echo " - Increasing CPU limits"
echo " - Optimizing application code"
echo " - Adding more replicas"
fi
if (( $(echo "$mem_usage > 85" | bc -l) )); then
echo "High memory usage detected. Consider:"
echo " - Increasing memory limits"
echo " - Memory leak investigation"
echo " - Application profiling"
fi
echo ""
}
# Analyze all running containers
for container in $(docker ps --format '{{.Names}}'); do
optimize_container $container
done
Network Performance Optimization:
# docker-compose.performance.yml
version: '3.8'
services:
web:
image: myapp:latest
networks:
- app-network
deploy:
resources:
limits:
memory: 512M
cpus: '1.0'
reservations:
memory: 256M
cpus: '0.5'
# Use host network for performance boost (single server deployment)
# network_mode: host
# Adjust ulimits
ulimits:
nproc: 65535
nofile:
soft: 65535
hard: 65535
nginx:
image: nginx:alpine
volumes:
- ./nginx-optimized.conf:/etc/nginx/nginx.conf:ro
networks:
- app-network
    # Tune kernel network parameters in the container's namespace
sysctls:
- net.core.somaxconn=65535
- net.ipv4.ip_local_port_range=1024 65535
networks:
app-network:
driver: bridge
driver_opts:
com.docker.network.bridge.name: "br-app"
com.docker.network.driver.mtu: 1500
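Whether the sysctl overrides took effect can be checked from inside the container; reading procfs works even in images without the sysctl binary:
# Read the tuned values from the container's network namespace
docker exec <nginx-container> cat /proc/sys/net/core/somaxconn
docker exec <nginx-container> cat /proc/sys/net/ipv4/ip_local_port_range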
Optimized Nginx Configuration:
# nginx-optimized.conf
worker_processes auto;
worker_rlimit_nofile 65535;
events {
worker_connections 65535;
use epoll;
multi_accept on;
}
http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
# Performance optimization
sendfile on;
tcp_nopush on;
tcp_nodelay on;
keepalive_timeout 65;
keepalive_requests 100;
# Buffer optimization
client_body_buffer_size 128k;
client_max_body_size 50m;
client_header_buffer_size 1k;
large_client_header_buffers 4 4k;
output_buffers 1 32k;
postpone_output 1460;
# Gzip compression
gzip on;
gzip_vary on;
gzip_min_length 10240;
gzip_proxied expired no-cache no-store private must-revalidate auth;
gzip_types
text/plain
text/css
text/xml
text/javascript
application/x-javascript
application/xml+rss
application/json;
# Cache configuration
open_file_cache max=200000 inactive=20s;
open_file_cache_valid 30s;
open_file_cache_min_uses 2;
open_file_cache_errors on;
upstream backend {
least_conn;
server web:3000 weight=1 max_fails=3 fail_timeout=30s;
keepalive 32;
}
server {
listen 80 default_server;
location / {
proxy_pass http://backend;
proxy_http_version 1.1;
proxy_set_header Connection "";
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
# Buffer optimization
proxy_buffering on;
proxy_buffer_size 4k;
proxy_buffers 8 4k;
proxy_busy_buffers_size 8k;
}
# Static file caching
location ~* \.(css|js|jpg|jpeg|png|gif|ico|svg)$ {
expires 1y;
add_header Cache-Control "public, immutable";
}
}
}
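The effect of these settings is best confirmed under load; a minimal sketch, assuming the wrk benchmarking tool is installed on a test host:
# 4 threads, 100 connections, 30 seconds against the optimized front end
wrk -t4 -c100 -d30s --latency http://localhost/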
10.6.2 Application Performance Monitoring
APM Integration:
// Node.js APM configuration
const apm = require('elastic-apm-node').start({
serviceName: 'web-app',
serviceVersion: process.env.APP_VERSION,
environment: process.env.NODE_ENV,
serverUrl: process.env.ELASTIC_APM_SERVER_URL,
captureBody: 'all',
captureHeaders: true,
logUncaughtExceptions: true
});
const express = require('express');
const app = express();
// Custom performance metrics
app.use((req, res, next) => {
const span = apm.startSpan('http.request');
const start = process.hrtime();
res.on('finish', () => {
const diff = process.hrtime(start);
const duration = diff[0] * 1e3 + diff[1] * 1e-6;
apm.setCustomContext({
response_time: duration,
status_code: res.statusCode,
method: req.method,
url: req.url
});
if (span) span.end();
});
next();
});
Practical Case Study
Case: E-commerce Website Production Deployment
Scenario Description: Deploy a high-traffic e-commerce website including frontend, API, order service, payment service, databases, etc.
Architecture Design:
Internet
↓
Load Balancer (HAProxy)
↓
Web Tier (Nginx + React App)
↓
API Gateway (Kong/Traefik)
↓
Microservices (Node.js/Go)
↓
Database Tier (PostgreSQL Master/Slave)
↓
Cache Tier (Redis Cluster)
Complete Deployment Configuration:
# production-stack.yml
version: '3.8'
services:
# Load balancer
haproxy:
image: haproxy:2.6-alpine
ports:
- "80:80"
- "443:443"
- "8404:8404" # Stats page
volumes:
- ./haproxy/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro
- ./ssl:/etc/ssl:ro
networks:
- frontend
deploy:
replicas: 2
placement:
constraints: [node.role == manager]
resources:
limits:
memory: 256M
cpus: '0.5'
# Frontend application
web:
image: mystore/web:${VERSION}
networks:
- frontend
deploy:
replicas: 4
placement:
constraints: [node.role == worker]
resources:
limits:
memory: 128M
cpus: '0.25'
reservations:
memory: 64M
cpus: '0.1'
update_config:
parallelism: 2
delay: 10s
failure_action: rollback
restart_policy:
condition: on-failure
max_attempts: 3
# API gateway
api-gateway:
image: kong:3.0-alpine
environment:
KONG_DATABASE: "off"
KONG_DECLARATIVE_CONFIG: /kong/declarative/kong.yml
KONG_PROXY_ACCESS_LOG: /dev/stdout
KONG_ADMIN_ACCESS_LOG: /dev/stdout
KONG_PROXY_ERROR_LOG: /dev/stderr
KONG_ADMIN_ERROR_LOG: /dev/stderr
KONG_ADMIN_LISTEN: 0.0.0.0:8001
volumes:
- ./kong/kong.yml:/kong/declarative/kong.yml:ro
networks:
- frontend
- backend
deploy:
replicas: 3
resources:
limits:
memory: 512M
cpus: '0.5'
# User service
user-service:
image: mystore/user-service:${VERSION}
environment:
      DATABASE_URL: postgresql://user:password@postgres-master:5432/userdb
REDIS_URL: redis://redis-cluster:6379
networks:
- backend
- database
deploy:
replicas: 3
resources:
limits:
memory: 256M
cpus: '0.5'
secrets:
- db_password
- jwt_secret
# Order service
order-service:
image: mystore/order-service:${VERSION}
environment:
      DATABASE_URL: postgresql://order:password@postgres-master:5432/orderdb
MESSAGE_QUEUE: redis://redis-cluster:6379
networks:
- backend
- database
deploy:
replicas: 5 # Order service needs more instances
resources:
limits:
memory: 512M
cpus: '1.0'
    depends_on:  # ignored by `docker stack deploy`; kept for local compose runs
      - postgres-master
      - redis-cluster
# Payment service
payment-service:
image: mystore/payment-service:${VERSION}
environment:
      DATABASE_URL: postgresql://payment:password@postgres-master:5432/paymentdb
ENCRYPTION_KEY_FILE: /run/secrets/encryption_key
networks:
- backend
- database
deploy:
replicas: 2
resources:
limits:
memory: 256M
cpus: '0.5'
placement:
constraints: [node.labels.security == high] # Deploy on high-security nodes
secrets:
- encryption_key
# Database master node
  postgres-master:
    # Bitnami image: the official postgres image does not implement the
    # replication convenience variables used below
    image: bitnami/postgresql:14
    environment:
      POSTGRESQL_USERNAME: postgres
      POSTGRESQL_PASSWORD_FILE: /run/secrets/postgres_password
      POSTGRESQL_REPLICATION_MODE: master
      POSTGRESQL_REPLICATION_USER: replicator
      POSTGRESQL_REPLICATION_PASSWORD_FILE: /run/secrets/replication_password
    volumes:
      - postgres_master_data:/bitnami/postgresql
      - ./postgres/init:/docker-entrypoint-initdb.d:ro
networks:
- database
deploy:
replicas: 1
placement:
constraints: [node.labels.database == master]
resources:
limits:
memory: 2G
cpus: '2.0'
reservations:
memory: 1G
cpus: '1.0'
secrets:
- postgres_password
- replication_password
# Database slave nodes
  postgres-slave:
    image: bitnami/postgresql:14
    environment:
      POSTGRESQL_MASTER_HOST: postgres-master
      POSTGRESQL_REPLICATION_MODE: slave
      POSTGRESQL_REPLICATION_USER: replicator
      POSTGRESQL_REPLICATION_PASSWORD_FILE: /run/secrets/replication_password
    volumes:
      - postgres_slave_data:/bitnami/postgresql
networks:
- database
deploy:
replicas: 2
placement:
constraints: [node.labels.database == slave]
resources:
limits:
memory: 1G
cpus: '1.0'
depends_on:
- postgres-master
secrets:
- replication_password
# Redis cluster
redis-cluster:
image: redis:7-alpine
command: redis-server --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000 --appendonly yes
volumes:
- redis_data:/data
networks:
- database
deploy:
      replicas: 6  # 3 masters + 3 replicas; bootstrap once after deploy with `redis-cli --cluster create`
resources:
limits:
memory: 512M
cpus: '0.5'
# Monitoring components
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"  # published so the health-check script below can query it
    volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
networks:
- monitoring
deploy:
replicas: 1
placement:
constraints: [node.role == manager]
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      GF_SECURITY_ADMIN_PASSWORD_FILE: /run/secrets/grafana_password
volumes:
- grafana_data:/var/lib/grafana
networks:
- monitoring
secrets:
- grafana_password
# Network definitions
networks:
frontend:
driver: overlay
attachable: true
backend:
driver: overlay
internal: true
database:
driver: overlay
internal: true
monitoring:
driver: overlay
attachable: true
# Volume definitions
volumes:
postgres_master_data:
driver: local
driver_opts:
type: none
o: bind
device: /data/postgres/master
postgres_slave_data:
driver: local
driver_opts:
type: none
o: bind
device: /data/postgres/slave
redis_data:
driver: local
prometheus_data:
driver: local
grafana_data:
driver: local
# Secret definitions
secrets:
db_password:
external: true
postgres_password:
external: true
replication_password:
external: true
jwt_secret:
external: true
encryption_key:
external: true
grafana_password:
external: true
Deployment Script:
#!/bin/bash
# deploy-production.sh
set -e
# Configuration variables
STACK_NAME="ecommerce"
VERSION=${1:-latest}
REGISTRY="registry.example.com"
echo "Deploying $STACK_NAME version $VERSION..."
# 1. Create Swarm secrets (all six declared as external in the stack file;
#    printf avoids embedding a trailing newline in the secret value)
echo "Creating secrets..."
printf '%s' "your-db-password"          | docker secret create db_password - 2>/dev/null || true
printf '%s' "your-postgres-password"    | docker secret create postgres_password - 2>/dev/null || true
printf '%s' "your-replication-password" | docker secret create replication_password - 2>/dev/null || true
printf '%s' "your-jwt-secret"           | docker secret create jwt_secret - 2>/dev/null || true
printf '%s' "your-encryption-key"       | docker secret create encryption_key - 2>/dev/null || true
printf '%s' "your-grafana-password"     | docker secret create grafana_password - 2>/dev/null || true
# 2. Set node labels
echo "Setting node labels..."
docker node update --label-add database=master swarm-manager-1
docker node update --label-add database=slave swarm-worker-1
docker node update --label-add database=slave swarm-worker-2
docker node update --label-add security=high swarm-worker-3
# 3. Create data directories (these must exist on every node labeled for the database)
echo "Creating data directories..."
sudo mkdir -p /data/postgres/{master,slave}
sudo chown -R 1001:1001 /data/postgres  # Bitnami PostgreSQL runs as UID 1001
# 4. Deploy service stack
echo "Deploying stack..."
VERSION=$VERSION docker stack deploy -c production-stack.yml $STACK_NAME
# 5. Wait for services to start
echo "Waiting for services to start..."
sleep 30
# 6. Verify deployment
echo "Verifying deployment..."
docker stack services $STACK_NAME
docker service logs --tail 20 ${STACK_NAME}_web
docker service logs --tail 20 ${STACK_NAME}_api-gateway
# 7. Run health checks
echo "Running health checks..."
./health-check.sh
echo "Deployment completed successfully!"
Health Check Script:
#!/bin/bash
# health-check.sh
STACK_NAME="ecommerce"
FAILED=0
echo "=== Health Check Report ==="
# Check service status
echo "1. Service Status:"
for service in $(docker stack services $STACK_NAME --format "{{.Name}}"); do
    # grep -c prints 0 on no match (with a non-zero exit), so no fallback echo is needed
    replicas=$(docker service ps $service --filter "desired-state=running" --format "{{.CurrentState}}" | grep -c "Running")
    desired=$(docker service inspect $service --format "{{.Spec.Mode.Replicated.Replicas}}")
    if [ "$replicas" -eq "$desired" ]; then
        echo "✓ $service: $replicas/$desired replicas running"
    else
        echo "✗ $service: $replicas/$desired replicas running"
        FAILED=1
    fi
done
# Check network connectivity
echo ""
echo "2. Network Connectivity:"
if curl -f -s http://localhost/health > /dev/null; then
echo "✓ Web frontend accessible"
else
echo "✗ Web frontend not accessible"
FAILED=1
fi
# assumes the Kong admin port (8001) is reachable from this host
if curl -f -s http://localhost:8001/status > /dev/null; then
echo "✓ API Gateway accessible"
else
echo "✗ API Gateway not accessible"
FAILED=1
fi
# Check database connection
echo ""
echo "3. Database Connectivity:"
# note: docker exec only reaches tasks scheduled on the node running this script
if docker exec ${STACK_NAME}_postgres-master.1.$(docker service ps ${STACK_NAME}_postgres-master -q | head -1) pg_isready -U postgres; then
echo "✓ PostgreSQL master accessible"
else
echo "✗ PostgreSQL master not accessible"
FAILED=1
fi
# Check Redis cluster
echo ""
echo "4. Cache Status:"
if docker exec ${STACK_NAME}_redis-cluster.1.$(docker service ps ${STACK_NAME}_redis-cluster -q | head -1) redis-cli ping | grep -q PONG; then
echo "✓ Redis cluster accessible"
else
echo "✗ Redis cluster not accessible"
FAILED=1
fi
# Check monitoring services
echo ""
echo "5. Monitoring Status:"
if curl -f -s http://localhost:9090/api/v1/query?query=up > /dev/null; then
echo "✓ Prometheus accessible"
else
echo "✗ Prometheus not accessible"
FAILED=1
fi
echo ""
if [ $FAILED -eq 0 ]; then
echo "All health checks passed!"
exit 0
else
echo "Some health checks failed!"
exit 1
fi
Summary
Deploying Docker containers in production is a systems engineering task that demands attention to several areas:
Core Elements:
- High Availability Architecture: Multi-replica deployment, load balancing, failover
- Resource Management: Proper resource limits and reservations
- Data Security: Persistent storage, regular backups, disaster recovery
- Monitoring and Alerting: Comprehensive monitoring, timely alerts, fault self-healing
- Performance Optimization: Network optimization, caching strategies, resource tuning
Best Practices:
- Adopt Infrastructure as Code (IaC)
- Implement GitOps workflows
- Establish comprehensive CI/CD processes
- Regularly conduct disaster recovery drills
- Keep systems and dependencies updated promptly
With systematic planning and implementation, Docker containerization can deliver stable, efficient, and scalable services in production environments.