Chapter 10: Production Deployment and Operations
Learning Objectives
- Master containerized deployment strategies for production environments
- Learn to implement high availability and load balancing
- Understand container orchestration and cluster management
- Master production monitoring and troubleshooting
- Learn how to optimize the performance of containerized applications
Key Concepts
10.1 Production Architecture Design
10.1.1 Choosing a Deployment Architecture
Single-host deployment:
# docker-compose.prod.yml
version: '3.8'
services:
web:
image: myapp:latest
ports:
- "80:3000"
environment:
- NODE_ENV=production
deploy:
resources:
limits:
memory: 512M
cpus: '1.0'
restart: unless-stopped
nginx:
image: nginx:alpine
ports:
- "443:443"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
- ./ssl:/etc/ssl:ro
depends_on:
- web
restart: unless-stopped
Cluster deployment:
# docker-stack.yml
version: '3.8'
services:
web:
image: myapp:latest
networks:
- web-network
deploy:
replicas: 3
placement:
constraints:
- node.role == worker
resources:
limits:
memory: 256M
cpus: '0.5'
reservations:
memory: 128M
cpus: '0.25'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
update_config:
parallelism: 1
delay: 10s
failure_action: rollback
monitor: 60s
order: start-first
lb:
image: nginx:alpine
ports:
- "80:80"
- "443:443"
networks:
- web-network
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
deploy:
replicas: 2
placement:
constraints:
- node.role == manager
networks:
web-network:
driver: overlay
attachable: true
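Once the Swarm cluster has been initialized (see 10.2.1), a stack file like the one above can be deployed and verified with the standard stack commands; the stack name myapp below is just an example:
# Deploy (or update) the stack described in docker-stack.yml
docker stack deploy -c docker-stack.yml myapp
# List the services and their replica counts
docker stack services myapp
# Inspect where the web replicas were scheduled
docker service ps myapp_web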
10.1.2 Resource Planning
Server resource configuration:
#!/bin/bash
# resource-planning.sh
# Check system resources
echo "=== System Resource Check ==="
echo "CPU cores: $(nproc)"
echo "Memory: $(free -h | grep '^Mem:' | awk '{print $2}')"
echo "Disk space: $(df -h / | tail -1 | awk '{print $4}')"
echo "Docker version: $(docker --version)"
# Calculate resource allocation
TOTAL_MEMORY=$(free -m | grep '^Mem:' | awk '{print $2}')
TOTAL_CPUS=$(nproc)
# Reserve resources for the operating system
SYSTEM_MEMORY=$((TOTAL_MEMORY * 20 / 100))
SYSTEM_CPUS="0.5"
# Resources available to containers
AVAILABLE_MEMORY=$((TOTAL_MEMORY - SYSTEM_MEMORY))
AVAILABLE_CPUS=$((TOTAL_CPUS - 1))
echo "Available for containers:"
echo "Memory: ${AVAILABLE_MEMORY}MB"
echo "CPUs: ${AVAILABLE_CPUS}"
# Generate a resource-limits configuration file
cat > resource-limits.yml << EOF
version: '3.8'
services:
web:
deploy:
resources:
limits:
memory: ${AVAILABLE_MEMORY}m
cpus: '${AVAILABLE_CPUS}'
reservations:
memory: $((AVAILABLE_MEMORY / 2))m
cpus: '0.25'
EOF
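One possible way to use the generated file is as a Compose override; the file names below are the ones assumed by the script above:
# Preview the merged configuration, then apply it
docker compose -f docker-compose.prod.yml -f resource-limits.yml config
docker compose -f docker-compose.prod.yml -f resource-limits.yml up -d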
10.2 High Availability Design
10.2.1 Multi-Replica Deployment
Docker Swarm high-availability setup:
# Initialize the Swarm cluster
docker swarm init --advertise-addr 192.168.1.100
# Get the join command for additional manager nodes
docker swarm join-token manager
# Get the join command for worker nodes
docker swarm join-token worker
# Check cluster status
docker node ls
Highly available service deployment:
#!/bin/bash
# ha-deploy.sh
# Create an overlay network
docker network create -d overlay --attachable ha-network
# Deploy the web service (3 replicas)
docker service create \
--name web-service \
--replicas 3 \
--network ha-network \
--constraint 'node.role==worker' \
--update-parallelism 1 \
--update-delay 10s \
--rollback-parallelism 1 \
--rollback-delay 10s \
--restart-condition on-failure \
--restart-max-attempts 3 \
--limit-memory 256m \
--limit-cpu 0.5 \
myapp:latest
# Deploy the load balancer
docker service create \
--name lb-service \
--replicas 2 \
--network ha-network \
--publish 80:80 \
--publish 443:443 \
--constraint 'node.role==manager' \
--mount type=bind,source=$(pwd)/nginx.conf,target=/etc/nginx/nginx.conf \
nginx:alpine
# Deploy the database (primary/replica replication).
# Note: the MYSQL_REPLICATION_* variables below follow the Bitnami MySQL image's
# conventions; the official mysql image does not read them.
docker service create \
--name db-master \
--replicas 1 \
--network ha-network \
--constraint 'node.labels.db-role==master' \
--mount type=volume,source=db-master-data,target=/bitnami/mysql/data \
--env MYSQL_ROOT_PASSWORD=rootpassword \
--env MYSQL_REPLICATION_MODE=master \
--env MYSQL_REPLICATION_USER=replicator \
--env MYSQL_REPLICATION_PASSWORD=replicatorpassword \
bitnami/mysql:8.0
docker service create \
--name db-slave \
--replicas 2 \
--network ha-network \
--constraint 'node.labels.db-role==slave' \
--env MYSQL_MASTER_ROOT_PASSWORD=rootpassword \
--env MYSQL_REPLICATION_MODE=slave \
--env MYSQL_REPLICATION_USER=replicator \
--env MYSQL_REPLICATION_PASSWORD=replicatorpassword \
--env MYSQL_MASTER_HOST=db-master \
bitnami/mysql:8.0
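With the services running, routine day-2 operations are plain docker service commands; the image tag myapp:v2 below is a placeholder:
# Check placement and health of the replicas
docker service ps web-service
# Scale out under load
docker service scale web-service=5
# Rolling update, honouring the --update-* settings configured above
docker service update --image myapp:v2 web-service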
10.2.2 Load Balancer Configuration
Nginx load balancing:
# nginx.conf
# In Swarm mode a service name resolves to a virtual IP that already balances
# across all replicas, so a single upstream entry per service is sufficient.
upstream web_backend {
least_conn;
server web-service:3000 max_fails=3 fail_timeout=30s;
}
upstream api_backend {
ip_hash;
server api-service:8080 max_fails=3 fail_timeout=30s;
}
server {
listen 80;
server_name example.com;
# Redirect to HTTPS
return 301 https://$server_name$request_uri;
}
server {
listen 443 ssl http2;
server_name example.com;
ssl_certificate /etc/ssl/cert.pem;
ssl_certificate_key /etc/ssl/private.key;
# SSL settings
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE+AESGCM:ECDHE+CHACHA20:DHE+AESGCM:DHE+CHACHA20:!aNULL:!MD5:!DSS;
ssl_prefer_server_ciphers off;
# Health check
location /health {
access_log off;
proxy_pass http://web_backend/health;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Web application
location / {
proxy_pass http://web_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# Connection timeouts
proxy_connect_timeout 30s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
# Buffering
proxy_buffering on;
proxy_buffer_size 4k;
proxy_buffers 8 4k;
}
# API routes
location /api/ {
proxy_pass http://api_backend/;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Static files
location /static/ {
expires 1y;
add_header Cache-Control "public, immutable";
try_files $uri @fallback;
}
location @fallback {
proxy_pass http://web_backend;
}
}
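Before reloading the load balancer it is worth syntax-checking the configuration and smoke-testing the endpoints. A minimal sketch, assuming the nginx.conf and ssl/ directory from the compose file above sit in the current directory and the ports are published on this host:
# Validate the configuration (the referenced certificate files must be mounted too)
docker run --rm \
  -v "$PWD/nginx.conf":/etc/nginx/nginx.conf:ro \
  -v "$PWD/ssl":/etc/ssl:ro \
  nginx:alpine nginx -t
# Verify the HTTP-to-HTTPS redirect and the health endpoint
curl -I http://localhost/
curl -kI https://localhost/health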
HAProxy configuration:
# haproxy.cfg
global
daemon
maxconn 4096
log stdout local0 info
defaults
mode http
timeout connect 5000ms
timeout client 50000ms
timeout server 50000ms
option httplog
option dontlognull
option redispatch
retries 3
frontend web_frontend
bind *:80
bind *:443 ssl crt /etc/ssl/certs/example.com.pem
redirect scheme https if !{ ssl_fc }
# ACL definitions
acl is_api path_beg /api/
acl is_admin path_beg /admin/
# Routing rules
use_backend api_backend if is_api
use_backend admin_backend if is_admin
default_backend web_backend
backend web_backend
balance roundrobin
option httpchk GET /health
server web1 web-service:3000 check inter 3000 fall 3 rise 2
server web2 web-service:3000 check inter 3000 fall 3 rise 2
server web3 web-service:3000 check inter 3000 fall 3 rise 2
backend api_backend
balance leastconn
option httpchk GET /api/health
server api1 api-service:8080 check inter 3000 fall 3 rise 2
server api2 api-service:8080 check inter 3000 fall 3 rise 2
backend admin_backend
balance source
option httpchk GET /admin/health
server admin1 admin-service:9000 check inter 3000 fall 3 rise 2
listen stats
bind *:8404
stats enable
stats uri /stats
stats refresh 30s
stats admin if TRUE
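HAProxy can validate the file above without starting the proxy; the certificate path is the one referenced in the frontend and must exist (mounted here from an assumed local ssl/ directory) for the check to pass:
docker run --rm \
  -v "$PWD/haproxy.cfg":/usr/local/etc/haproxy/haproxy.cfg:ro \
  -v "$PWD/ssl/example.com.pem":/etc/ssl/certs/example.com.pem:ro \
  haproxy:2.6-alpine haproxy -c -f /usr/local/etc/haproxy/haproxy.cfg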
10.3 Data Persistence and Backup
10.3.1 Data Volume Management
Production data volume configuration:
# docker-compose.prod.yml
version: '3.8'
services:
database:
image: postgres:14
volumes:
- db_data:/var/lib/postgresql/data
- db_backup:/backup
- ./init.sql:/docker-entrypoint-initdb.d/init.sql:ro
environment:
POSTGRES_DB: myapp
POSTGRES_USER: myuser
POSTGRES_PASSWORD_FILE: /run/secrets/db_password
secrets:
- db_password
networks:
- db_network
deploy:
placement:
constraints:
- node.labels.storage == ssd
resources:
limits:
memory: 1G
cpus: '1.0'
reservations:
memory: 512M
cpus: '0.5'
redis:
image: redis:7-alpine
volumes:
- redis_data:/data
- ./redis.conf:/etc/redis/redis.conf:ro
command: redis-server /etc/redis/redis.conf
networks:
- db_network
deploy:
resources:
limits:
memory: 256M
cpus: '0.5'
volumes:
db_data:
driver: local
driver_opts:
type: none
o: bind
device: /opt/docker/data/postgres
redis_data:
driver: local
driver_opts:
type: none
o: bind
device: /opt/docker/data/redis
db_backup:
driver: local
driver_opts:
type: none
o: bind
device: /opt/docker/backup
secrets:
db_password:
external: true
networks:
db_network:
driver: overlay
internal: true
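The compose file above expects the bind-mount directories and the external secret to exist before deployment. A short preparation sketch (the password value is a placeholder):
# Create the host directories backing the named volumes
sudo mkdir -p /opt/docker/data/postgres /opt/docker/data/redis /opt/docker/backup
# Create the external secret referenced as db_password
printf '%s' 'change-me' | docker secret create db_password -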
10.3.2 Automated Backup Strategy
Database backup script:
#!/bin/bash
# backup-database.sh
DB_CONTAINER="myapp_database_1"
DB_USER="myuser"
DB_NAME="myapp"
BACKUP_DIR="/opt/docker/backup"
DATE=$(date +"%Y%m%d_%H%M%S")
BACKUP_FILE="backup_${DB_NAME}_${DATE}.sql"
# Create the backup directory
mkdir -p $BACKUP_DIR
# Database dump (the exit status of pg_dump decides success or failure)
echo "Starting database backup..."
if docker exec $DB_CONTAINER pg_dump -U $DB_USER $DB_NAME > $BACKUP_DIR/$BACKUP_FILE; then
# Compress the dump
gzip $BACKUP_DIR/$BACKUP_FILE
# Upload to remote storage
echo "Uploading backup to cloud storage..."
aws s3 cp $BACKUP_DIR/${BACKUP_FILE}.gz s3://myapp-backups/database/
# Remove local backups older than 7 days
find $BACKUP_DIR -name "backup_${DB_NAME}_*.sql.gz" -mtime +7 -delete
# Record the backup result
echo "$(date): Backup completed successfully - ${BACKUP_FILE}.gz" >> /var/log/backup.log
else
echo "$(date): Backup failed" >> /var/log/backup.log
# Send an alert e-mail
echo "Database backup failed on $(hostname)" | mail -s "Backup Alert" admin@example.com
fi
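Backups are only as good as the restore path, so it helps to document and test it alongside the backup script. A minimal restore sketch for the plain-SQL dumps produced above (the file name is an example):
# Restore a compressed pg_dump into the running container
gunzip -c /opt/docker/backup/backup_myapp_20240110_020000.sql.gz | \
  docker exec -i myapp_database_1 psql -U myuser -d myapp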
Full backup solution:
#!/bin/bash
# full-backup.sh
BACKUP_BASE="/opt/docker/backup"
DATE=$(date +"%Y%m%d_%H%M%S")
RETENTION_DAYS=30
# Create the backup directories
mkdir -p $BACKUP_BASE/{database,volumes,configs}
# 1. Database backup
echo "Backing up databases..."
docker exec postgres_container pg_dumpall -U postgres | gzip > $BACKUP_BASE/database/full_dump_$DATE.sql.gz
# 2. Volume backup
echo "Backing up volumes..."
docker run --rm \
-v myapp_db_data:/source:ro \
-v $BACKUP_BASE/volumes:/backup \
alpine tar czf /backup/db_data_$DATE.tar.gz -C /source .
docker run --rm \
-v myapp_redis_data:/source:ro \
-v $BACKUP_BASE/volumes:/backup \
alpine tar czf /backup/redis_data_$DATE.tar.gz -C /source .
# 3. Configuration backup
echo "Backing up configurations..."
tar czf $BACKUP_BASE/configs/configs_$DATE.tar.gz \
/opt/docker/compose \
/opt/docker/nginx \
/opt/docker/ssl
# 4. Upload to cloud storage
echo "Uploading to cloud storage..."
aws s3 sync $BACKUP_BASE s3://myapp-backups/$(date +"%Y/%m/%d")/
# 5. Clean up old backups
echo "Cleaning up old backups..."
find $BACKUP_BASE -type f -mtime +$RETENTION_DAYS -delete
# 6. Verify backup integrity
echo "Verifying backup integrity..."
BACKUP_SIZE=$(du -s $BACKUP_BASE | cut -f1)
if [ $BACKUP_SIZE -lt 1000 ]; then
echo "Warning: Backup size seems too small" | mail -s "Backup Warning" admin@example.com
fi
echo "Backup completed at $(date)"
Scheduled backups with crontab:
# Edit the crontab
crontab -e
# Add the scheduled jobs
# Full backup every day at 2:00 AM
0 2 * * * /opt/scripts/full-backup.sh >> /var/log/backup.log 2>&1
# Incremental backup every 6 hours
0 */6 * * * /opt/scripts/incremental-backup.sh >> /var/log/backup.log 2>&1
# Backup integrity check every Sunday at 3:00 AM
0 3 * * 0 /opt/scripts/backup-verify.sh >> /var/log/backup-verify.log 2>&1
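If a backup run can outlast its interval, overlapping jobs will compete for I/O. One common safeguard, sketched here, is wrapping the cron entry in flock so a new run is skipped while the previous one still holds the lock:
# Skip the run if the previous full backup is still active
0 2 * * * flock -n /var/lock/full-backup.lock /opt/scripts/full-backup.sh >> /var/log/backup.log 2>&1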
10.4 Monitoring and Alerting
10.4.1 Comprehensive Monitoring Stack
Prometheus + Grafana monitoring stack:
# monitoring-stack.yml
version: '3.8'
services:
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./rules:/etc/prometheus/rules:ro
- prometheus_data:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro # required by docker_sd_configs in prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
networks:
- monitoring
deploy:
resources:
limits:
memory: 512M
cpus: '1.0'
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning:ro
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin123
- GF_USERS_ALLOW_SIGN_UP=false
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
networks:
- monitoring
depends_on:
- prometheus
alertmanager:
image: prom/alertmanager:latest
ports:
- "9093:9093"
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- alertmanager_data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://localhost:9093'
- '--web.route-prefix=/'
networks:
- monitoring
node-exporter:
image: prom/node-exporter:latest
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- monitoring
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
ports:
- "8080:8080"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
devices:
- /dev/kmsg:/dev/kmsg
networks:
- monitoring
volumes:
prometheus_data:
grafana_data:
alertmanager_data:
networks:
monitoring:
driver: overlay
attachable: true
Prometheus configuration:
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "rules/*.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'node-exporter'
static_configs:
- targets: ['node-exporter:9100']
- job_name: 'cadvisor'
static_configs:
- targets: ['cadvisor:8080']
- job_name: 'docker'
static_configs:
- targets: ['host.docker.internal:9323']
- job_name: 'app'
docker_sd_configs:
- host: unix:///var/run/docker.sock
port: 3000
relabel_configs:
# Only scrape containers that are explicitly labelled monitoring=true
- source_labels: [__meta_docker_container_label_monitoring]
regex: "true"
action: keep
# The scrape port comes from the docker_sd_configs "port" setting above;
# keep the container name as a label for readability
- source_labels: [__meta_docker_container_name]
target_label: container
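For the Docker service-discovery job above, application containers only need the monitoring=true label; whether a target is actually scraped can then be checked through the Prometheus HTTP API (this sketch assumes jq is installed and reuses the myapp:latest image from earlier):
# Start a container that the 'app' job will discover on port 3000
docker run -d --label monitoring=true --name web-metrics myapp:latest
# List discovered targets and their scrape health
curl -s http://localhost:9090/api/v1/targets | \
  jq '.data.activeTargets[] | {job: .labels.job, instance: .labels.instance, health: .health}'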
10.4.2 Alert Rule Configuration
Production alert rules:
# rules/production.yml
groups:
- name: production.rules
rules:
# Service availability
- alert: ServiceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Service {{ $labels.job }} is down"
description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute."
# High CPU usage
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage detected"
description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
# High memory usage
- alert: HighMemoryUsage
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "High memory usage detected"
description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"
# Low disk space
- alert: DiskSpaceLow
expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
for: 5m
labels:
severity: critical
annotations:
summary: "Disk space is running low"
description: "Disk usage is {{ $value }}% on {{ $labels.instance }}"
# Frequent container restarts (count changes of the start-time gauge rather than
# using increase(), which is only meaningful for counters)
- alert: ContainerRestartingOften
expr: changes(container_start_time_seconds{name!=""}[1h]) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "Container restarting frequently"
description: "Container {{ $labels.name }} has restarted {{ $value }} times in the last hour"
# Slow HTTP responses
- alert: HighResponseTime
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "High HTTP response time"
description: "95th percentile response time is {{ $value }}s"
# High HTTP error rate
- alert: HighErrorRate
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) * 100 > 5
for: 5m
labels:
severity: critical
annotations:
summary: "High HTTP error rate"
description: "HTTP error rate is {{ $value }}%"
# Database connectivity (metrics as exposed by mysqld_exporter / postgres_exporter)
- alert: DatabaseConnectionFailed
expr: mysql_up == 0 or pg_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Database connection failed"
description: "Cannot connect to database {{ $labels.instance }}"
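Rule files are easy to break with indentation mistakes, so validating them before a reload is worthwhile; promtool ships inside the prom/prometheus image:
docker run --rm \
  -v "$PWD/rules":/rules:ro \
  --entrypoint promtool prom/prometheus:latest check rules /rules/production.yml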
Alertmanager configuration:
# alertmanager.yml
global:
smtp_smarthost: 'smtp.gmail.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: 'app-password'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'web.hook'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
- match:
severity: warning
receiver: 'warning-alerts'
receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://webhook-service:5000/alerts'
- name: 'critical-alerts'
email_configs:
- to: 'admin@example.com'
subject: '🚨 Critical Alert: {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Labels: {{ .Labels }}
{{ end }}
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#alerts'
title: 'Critical Alert'
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
- name: 'warning-alerts'
email_configs:
- to: 'team@example.com'
subject: '⚠️ Warning Alert: {{ .GroupLabels.alertname }}'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
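The routing tree can be validated the same way with amtool, which is included in the prom/alertmanager image:
docker run --rm \
  -v "$PWD/alertmanager.yml":/etc/alertmanager/alertmanager.yml:ro \
  --entrypoint amtool prom/alertmanager:latest check-config /etc/alertmanager/alertmanager.yml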
10.5 Log Management
10.5.1 Centralized Log Collection
ELK Stack logging solution:
# logging-stack.yml
version: '3.8'
services:
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:8.5.0
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
volumes:
- es_data:/usr/share/elasticsearch/data
ports:
- "9200:9200"
networks:
- logging
logstash:
image: docker.elastic.co/logstash/logstash:8.5.0
volumes:
- ./logstash/config:/usr/share/logstash/pipeline:ro
- ./logstash/patterns:/opt/logstash/patterns:ro
ports:
- "5044:5044"
- "12201:12201/udp"
depends_on:
- elasticsearch
networks:
- logging
environment:
LS_JAVA_OPTS: "-Xmx256m -Xms256m"
kibana:
image: docker.elastic.co/kibana/kibana:8.5.0
ports:
- "5601:5601"
environment:
ELASTICSEARCH_HOSTS: http://elasticsearch:9200
depends_on:
- elasticsearch
networks:
- logging
filebeat:
image: docker.elastic.co/beats/filebeat:8.5.0
volumes:
- ./filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
depends_on:
- logstash
networks:
- logging
user: root
volumes:
es_data:
networks:
logging:
driver: overlay
attachable: true
Filebeat configuration:
# filebeat/filebeat.yml
filebeat.inputs:
- type: container
paths:
- '/var/lib/docker/containers/*/*.log'
processors:
- add_docker_metadata:
host: "unix:///var/run/docker.sock"
- decode_json_fields:
fields: ["message"]
target: ""
overwrite_keys: true
output.logstash:
hosts: ["logstash:5044"]
logging.level: info
logging.to_files: true
logging.files:
path: /var/log/filebeat
name: filebeat
keepfiles: 7
permissions: 0644
Logstash configuration:
# logstash/config/logstash.conf
input {
beats {
port => 5044
}
gelf {
port => 12201
}
}
filter {
if [container][name] {
mutate {
add_field => { "service_name" => "%{[container][name]}" }
}
}
# Parse Nginx logs
if [service_name] =~ "nginx" {
grok {
match => { "message" => "%{COMBINEDAPACHELOG}" }
}
date {
match => [ "timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ]
}
mutate {
convert => { "response" => "integer" }
convert => { "bytes" => "integer" }
}
}
# Parse application logs
if [service_name] =~ "web" {
json {
source => "message"
}
date {
match => [ "timestamp", "ISO8601" ]
}
}
# Add an environment tag
mutate {
add_field => { "environment" => "production" }
}
}
output {
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "logs-%{+YYYY.MM.dd}"
}
# Store error logs in a separate index
if [level] == "ERROR" {
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "errors-%{+YYYY.MM.dd}"
}
}
}
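Once the stack is up, a quick way to confirm that logs are flowing end to end is to query the indices Logstash writes; index names follow the logs-YYYY.MM.dd pattern configured above, and security is disabled in this single-node Elasticsearch example:
# List the daily log indices and their document counts
curl -s 'http://localhost:9200/_cat/indices/logs-*?v'
# Sample a few recent Nginx entries
curl -s 'http://localhost:9200/logs-*/_search?q=service_name:nginx&size=3&pretty'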
10.5.2 Application Logging Best Practices
Structured log output:
// Logging configuration for a Node.js application
const winston = require('winston');
const logger = winston.createLogger({
level: process.env.LOG_LEVEL || 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
),
defaultMeta: {
service: 'web-app',
version: process.env.APP_VERSION,
environment: process.env.NODE_ENV
},
transports: [
new winston.transports.Console({
format: process.env.NODE_ENV === 'development'
? winston.format.combine(
winston.format.colorize(),
winston.format.simple()
)
: winston.format.json()
})
]
});
// Usage examples (inside a request handler where req is in scope)
logger.info('User login successful', {
userId: '12345',
ip: req.ip,
userAgent: req.get('User-Agent')
});
logger.error('Database connection failed', {
error: error.message,
stack: error.stack,
query: sqlQuery
});
Docker logging driver configuration:
# docker-compose.yml
version: '3.8'
services:
web:
image: myapp:latest
logging:
driver: "fluentd"
options:
fluentd-address: localhost:24224
tag: myapp.web
labels: "environment,service"
labels:
- "environment=production"
- "service=web"
api:
image: myapi:latest
logging:
driver: "gelf"
options:
gelf-address: "udp://localhost:12201"
tag: "{{.ImageName}}/{{.Name}}/{{.ID}}"
database:
image: postgres:14
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
labels: "environment,service"
10.6 Performance Optimization
10.6.1 Container Performance Tuning
Resource limit optimization:
#!/bin/bash
# container-tuning.sh
# Analyze container resource usage
echo "=== Container Resource Analysis ==="
docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}"
# Show the configured resource limits for each container
for container in $(docker ps -q); do
echo "Container: $(docker ps --format '{{.Names}}' --filter id=$container)"
echo "Memory limit: $(docker inspect $container | jq '.[0].HostConfig.Memory')"
echo "CPU limit: $(docker inspect $container | jq '.[0].HostConfig.CpuQuota')"
echo "---"
done
# Optimization advice helper
optimize_container() {
local container_name=$1
local cpu_usage=$(docker stats --no-stream --format "{{.CPUPerc}}" $container_name | sed 's/%//')
local mem_usage=$(docker stats --no-stream --format "{{.MemPerc}}" $container_name | sed 's/%//')
echo "Container: $container_name"
echo "CPU Usage: $cpu_usage%"
echo "Memory Usage: $mem_usage%"
if (( $(echo "$cpu_usage > 80" | bc -l) )); then
echo "⚠️ High CPU usage detected. Consider:"
echo " - Increasing CPU limits"
echo " - Optimizing application code"
echo " - Adding more replicas"
fi
if (( $(echo "$mem_usage > 85" | bc -l) )); then
echo "⚠️ High memory usage detected. Consider:"
echo " - Increasing memory limits"
echo " - Memory leak investigation"
echo " - Application profiling"
fi
echo ""
}
# Analyze all running containers
for container in $(docker ps --format '{{.Names}}'); do
optimize_container $container
done
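When the analysis suggests different limits, they can be applied to a running container without recreating it; the container name and values below are examples only:
# Raise the memory and CPU limits in place
docker update --memory 768m --memory-swap 768m --cpus 1.5 web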
Network performance optimization:
# docker-compose.performance.yml
version: '3.8'
services:
web:
image: myapp:latest
networks:
- app-network
deploy:
resources:
limits:
memory: 512M
cpus: '1.0'
reservations:
memory: 256M
cpus: '0.5'
# Using the host network can improve performance (single-host deployments only)
# network_mode: host
# Raise ulimits
ulimits:
nproc: 65535
nofile:
soft: 65535
hard: 65535
nginx:
image: nginx:alpine
volumes:
- ./nginx-optimized.conf:/etc/nginx/nginx.conf:ro
networks:
- app-network
# Kernel network tuning (namespaced, per-container sysctls)
sysctls:
- net.core.somaxconn=65535
- net.ipv4.ip_local_port_range=1024 65535
networks:
app-network:
driver: bridge
driver_opts:
com.docker.network.bridge.name: "br-app"
com.docker.network.driver.mtu: 1500
Optimized Nginx configuration:
# nginx-optimized.conf
worker_processes auto;
worker_rlimit_nofile 65535;
events {
worker_connections 65535;
use epoll;
multi_accept on;
}
http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
# Performance tuning
sendfile on;
tcp_nopush on;
tcp_nodelay on;
keepalive_timeout 65;
keepalive_requests 100;
# Buffer tuning
client_body_buffer_size 128k;
client_max_body_size 50m;
client_header_buffer_size 1k;
large_client_header_buffers 4 4k;
output_buffers 1 32k;
postpone_output 1460;
# Gzip compression
gzip on;
gzip_vary on;
gzip_min_length 10240;
gzip_proxied expired no-cache no-store private must-revalidate auth;
gzip_types
text/plain
text/css
text/xml
text/javascript
application/x-javascript
application/xml+rss
application/json;
# Open-file cache
open_file_cache max=200000 inactive=20s;
open_file_cache_valid 30s;
open_file_cache_min_uses 2;
open_file_cache_errors on;
upstream backend {
least_conn;
server web:3000 weight=1 max_fails=3 fail_timeout=30s;
keepalive 32;
}
server {
listen 80 default_server;
location / {
proxy_pass http://backend;
proxy_http_version 1.1;
proxy_set_header Connection "";
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
# Proxy buffering
proxy_buffering on;
proxy_buffer_size 4k;
proxy_buffers 8 4k;
proxy_busy_buffers_size 8k;
}
# Static file caching
location ~* \.(css|js|jpg|jpeg|png|gif|ico|svg)$ {
expires 1y;
add_header Cache-Control "public, immutable";
}
}
}
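A simple before/after benchmark helps confirm that the tuning actually pays off. A sketch using ab from the httpd image (an assumption; any load-testing tool works) against an example entry-point address:
# 5000 requests, 100 concurrent, against the load balancer (replace the address)
docker run --rm httpd:alpine ab -n 5000 -c 100 http://192.168.1.100/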
10.6.2 Application Performance Monitoring
APM integration:
// Node.js APM configuration
const apm = require('elastic-apm-node').start({
serviceName: 'web-app',
serviceVersion: process.env.APP_VERSION,
environment: process.env.NODE_ENV,
serverUrl: process.env.ELASTIC_APM_SERVER_URL,
captureBody: 'all',
captureHeaders: true,
logUncaughtExceptions: true
});
const express = require('express');
const app = express();
// Custom performance metrics
app.use((req, res, next) => {
const span = apm.startSpan('http.request');
const start = process.hrtime();
res.on('finish', () => {
const diff = process.hrtime(start);
const duration = diff[0] * 1e3 + diff[1] * 1e-6;
apm.setCustomContext({
response_time: duration,
status_code: res.statusCode,
method: req.method,
url: req.url
});
if (span) span.end();
});
next();
});
Hands-On Case Study
Case: Production Deployment of an E-Commerce Website
Scenario:
Deploy a high-concurrency e-commerce website consisting of a frontend, an API layer, an order service, a payment service, databases, and a cache.
Architecture:
Internet
↓
Load Balancer (HAProxy)
↓
Web Tier (Nginx + React App)
↓
API Gateway (Kong/Traefik)
↓
Microservices (Node.js/Go)
↓
Database Tier (PostgreSQL Master/Slave)
↓
Cache Tier (Redis Cluster)
Full deployment configuration:
# production-stack.yml
version: '3.8'
services:
# Load balancer
haproxy:
image: haproxy:2.6-alpine
ports:
- "80:80"
- "443:443"
- "8404:8404" # Stats页面
volumes:
- ./haproxy/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro
- ./ssl:/etc/ssl:ro
networks:
- frontend
deploy:
replicas: 2
placement:
constraints: [node.role == manager]
resources:
limits:
memory: 256M
cpus: '0.5'
# Frontend application
web:
image: mystore/web:${VERSION}
networks:
- frontend
deploy:
replicas: 4
placement:
constraints: [node.role == worker]
resources:
limits:
memory: 128M
cpus: '0.25'
reservations:
memory: 64M
cpus: '0.1'
update_config:
parallelism: 2
delay: 10s
failure_action: rollback
restart_policy:
condition: on-failure
max_attempts: 3
# API gateway
api-gateway:
image: kong:3.0-alpine
environment:
KONG_DATABASE: "off"
KONG_DECLARATIVE_CONFIG: /kong/declarative/kong.yml
KONG_PROXY_ACCESS_LOG: /dev/stdout
KONG_ADMIN_ACCESS_LOG: /dev/stdout
KONG_PROXY_ERROR_LOG: /dev/stderr
KONG_ADMIN_ERROR_LOG: /dev/stderr
KONG_ADMIN_LISTEN: 0.0.0.0:8001
volumes:
- ./kong/kong.yml:/kong/declarative/kong.yml:ro
networks:
- frontend
- backend
deploy:
replicas: 3
resources:
limits:
memory: 512M
cpus: '0.5'
# User service
user-service:
image: mystore/user-service:${VERSION}
environment:
DATABASE_URL: postgresql://user:password@postgres:5432/userdb
REDIS_URL: redis://redis-cluster:6379
networks:
- backend
- database
deploy:
replicas: 3
resources:
limits:
memory: 256M
cpus: '0.5'
secrets:
- db_password
- jwt_secret
# Order service
order-service:
image: mystore/order-service:${VERSION}
environment:
DATABASE_URL: postgresql://order:password@postgres:5432/orderdb
MESSAGE_QUEUE: redis://redis-cluster:6379
networks:
- backend
- database
deploy:
replicas: 5 # The order service needs more instances
resources:
limits:
memory: 512M
cpus: '1.0'
depends_on:
- postgres
- redis-cluster
# Payment service
payment-service:
image: mystore/payment-service:${VERSION}
environment:
DATABASE_URL: postgresql://payment:password@postgres:5432/paymentdb
ENCRYPTION_KEY_FILE: /run/secrets/encryption_key
networks:
- backend
- database
deploy:
replicas: 2
resources:
limits:
memory: 256M
cpus: '0.5'
placement:
constraints: [node.labels.security == high] # Run only on hardened, high-security nodes
secrets:
- encryption_key
# Database primary node
postgres-master:
image: postgres:14
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
POSTGRES_REPLICATION_MODE: master
POSTGRES_REPLICATION_USER: replicator
POSTGRES_REPLICATION_PASSWORD_FILE: /run/secrets/replication_password
volumes:
- postgres_master_data:/var/lib/postgresql/data
- ./postgres/init:/docker-entrypoint-initdb.d:ro
networks:
- database
deploy:
replicas: 1
placement:
constraints: [node.labels.database == master]
resources:
limits:
memory: 2G
cpus: '2.0'
reservations:
memory: 1G
cpus: '1.0'
secrets:
- postgres_password
- replication_password
# Database replica nodes
postgres-slave:
image: postgres:14
environment:
POSTGRES_MASTER_HOST: postgres-master
POSTGRES_REPLICATION_MODE: slave
POSTGRES_REPLICATION_USER: replicator
POSTGRES_REPLICATION_PASSWORD_FILE: /run/secrets/replication_password
volumes:
- postgres_slave_data:/var/lib/postgresql/data
networks:
- database
deploy:
replicas: 2
placement:
constraints: [node.labels.database == slave]
resources:
limits:
memory: 1G
cpus: '1.0'
depends_on:
- postgres-master
secrets:
- replication_password
# Redis cluster nodes (starting the replicas only creates the nodes; the cluster itself still has to be initialized once with redis-cli --cluster create)
redis-cluster:
image: redis:7-alpine
command: redis-server --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000 --appendonly yes
volumes:
- redis_data:/data
networks:
- database
deploy:
replicas: 6 # 3 masters + 3 replicas
resources:
limits:
memory: 512M
cpus: '0.5'
# Monitoring components
prometheus:
image: prom/prometheus:latest
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
networks:
- monitoring
deploy:
replicas: 1
placement:
constraints: [node.role == manager]
grafana:
image: grafana/grafana:latest
environment:
GF_SECURITY_ADMIN_PASSWORD_FILE: /run/secrets/grafana_password
volumes:
- grafana_data:/var/lib/grafana
networks:
- monitoring
secrets:
- grafana_password
# Network definitions
networks:
frontend:
driver: overlay
attachable: true
backend:
driver: overlay
internal: true
database:
driver: overlay
internal: true
monitoring:
driver: overlay
attachable: true
# Volume definitions
volumes:
postgres_master_data:
driver: local
driver_opts:
type: none
o: bind
device: /data/postgres/master
postgres_slave_data:
driver: local
driver_opts:
type: none
o: bind
device: /data/postgres/slave
redis_data:
driver: local
prometheus_data:
driver: local
grafana_data:
driver: local
# Secret definitions
secrets:
db_password:
external: true
postgres_password:
external: true
replication_password:
external: true
jwt_secret:
external: true
encryption_key:
external: true
grafana_password:
external: true
Deployment script:
#!/bin/bash
# deploy-production.sh
set -e
# Configuration variables
STACK_NAME="ecommerce"
VERSION=${1:-latest}
REGISTRY="registry.example.com"
echo "Deploying $STACK_NAME version $VERSION..."
# 1. Create the Swarm secrets referenced by the stack (values below are placeholders)
echo "Creating secrets..."
echo "your-db-password" | docker secret create db_password - 2>/dev/null || true
echo "your-postgres-password" | docker secret create postgres_password - 2>/dev/null || true
echo "your-replication-password" | docker secret create replication_password - 2>/dev/null || true
echo "your-jwt-secret" | docker secret create jwt_secret - 2>/dev/null || true
echo "your-encryption-key" | docker secret create encryption_key - 2>/dev/null || true
echo "your-grafana-password" | docker secret create grafana_password - 2>/dev/null || true
# 2. Set node labels
echo "Setting node labels..."
docker node update --label-add database=master swarm-manager-1
docker node update --label-add database=slave swarm-worker-1
docker node update --label-add database=slave swarm-worker-2
docker node update --label-add security=high swarm-worker-3
# 3. Create data directories
echo "Creating data directories..."
sudo mkdir -p /data/postgres/{master,slave}
sudo chown -R 999:999 /data/postgres
# 4. Deploy the service stack
echo "Deploying stack..."
VERSION=$VERSION docker stack deploy -c production-stack.yml $STACK_NAME
# 5. Wait for services to start
echo "Waiting for services to start..."
sleep 30
# 6. Verify the deployment
echo "Verifying deployment..."
docker stack services $STACK_NAME
docker service logs ${STACK_NAME}_web
docker service logs ${STACK_NAME}_api-gateway
# 7. Run health checks
echo "Running health checks..."
./health-check.sh
echo "Deployment completed successfully!"
Health check script:
#!/bin/bash
# health-check.sh
STACK_NAME="ecommerce"
FAILED=0
echo "=== Health Check Report ==="
# Check service status
echo "1. Service Status:"
for service in $(docker stack services $STACK_NAME --format "{{.Name}}"); do
replicas=$(docker service ps $service --filter "desired-state=running" --format "{{.CurrentState}}" | grep -c "Running" || echo "0")
desired=$(docker service inspect $service --format "{{.Spec.Mode.Replicated.Replicas}}")
if [ "$replicas" -eq "$desired" ]; then
echo "✅ $service: $replicas/$desired replicas running"
else
echo "❌ $service: $replicas/$desired replicas running"
FAILED=1
fi
done
# Check network connectivity
echo ""
echo "2. Network Connectivity:"
if curl -f -s http://localhost/health > /dev/null; then
echo "✅ Web frontend accessible"
else
echo "❌ Web frontend not accessible"
FAILED=1
fi
if curl -f -s http://localhost:8001/status > /dev/null; then
echo "✅ API Gateway accessible"
else
echo "❌ API Gateway not accessible"
FAILED=1
fi
# Check database connectivity
echo ""
echo "3. Database Connectivity:"
# Must run on the node hosting the task; resolve the container ID locally
if docker exec "$(docker ps -q -f name=${STACK_NAME}_postgres-master | head -1)" pg_isready -U postgres; then
echo "✅ PostgreSQL master accessible"
else
echo "❌ PostgreSQL master not accessible"
FAILED=1
fi
# Check the Redis cluster
echo ""
echo "4. Cache Status:"
if docker exec "$(docker ps -q -f name=${STACK_NAME}_redis-cluster | head -1)" redis-cli ping | grep -q PONG; then
echo "✅ Redis cluster accessible"
else
echo "❌ Redis cluster not accessible"
FAILED=1
fi
# Check monitoring services
echo ""
echo "5. Monitoring Status:"
if curl -f -s http://localhost:9090/api/v1/query?query=up > /dev/null; then
echo "✅ Prometheus accessible"
else
echo "❌ Prometheus not accessible"
FAILED=1
fi
echo ""
if [ $FAILED -eq 0 ]; then
echo "🎉 All health checks passed!"
exit 0
else
echo "💥 Some health checks failed!"
exit 1
fi
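If the checks fail right after a release, Swarm can return the affected services to their previous task definition. A minimal rollback sketch using the stack name from the scripts above:
# Roll a single service back to its previous version
docker service rollback ecommerce_web
# Equivalent form via docker service update
docker service update --rollback ecommerce_api-gateway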
Summary
Deploying Docker containers in production is a substantial piece of systems engineering that has to cover:
Core elements:
- High-availability architecture: multi-replica deployment, load balancing, failover
- Resource management: sensible resource limits and reservations
- Data safety: persistent storage, regular backups, disaster recovery
- Monitoring and alerting: end-to-end monitoring, timely alerts, automated recovery
- Performance optimization: network tuning, caching strategy, resource tuning
Best practices:
- Adopt Infrastructure as Code (IaC)
- Implement a GitOps workflow
- Build a complete CI/CD pipeline
- Rehearse disaster recovery regularly
- Keep the system and its dependencies up to date
With systematic planning and implementation, Docker containerization can provide stable, efficient, and scalable services for production workloads.