Consul 在生产环境中的部署和运维需要考虑高可用性、性能优化、安全性和可维护性等多个方面。
生产环境架构设计
典型架构
shell┌─────────────────┐ │ Load Balancer │ └────────┬────────┘ │ ┌────────────────────┼────────────────────┐ │ │ │ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ │ DC1 │ │ DC2 │ │ DC3 │ │ (Primary)│ │ (Backup) │ │ (Backup) │ └────┬────┘ └────┬────┘ └────┬────┘ │ │ │ ┌────▼────────────────────▼────────────────────▼────┐ │ Consul Server Cluster (3-5 nodes) │ └────────────────────────────────────────────────────┘ │ │ │ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ │ Client 1│ │ Client 2│ │ Client 3│ └─────────┘ └─────────┘ └─────────┘
节点规划
Server 节点
- 数量:3-5 个奇数节点
- 配置:高可用性、高性能
- 部署:跨可用区分布
- 资源:CPU 4 核、内存 8GB、磁盘 100GB SSD
Client 节点
- 数量:根据服务规模
- 配置:轻量级
- 部署:与应用同主机或同可用区
- 资源:CPU 2 核、内存 4GB
部署方案
1. Docker 部署
yaml# docker-compose.yml version: '3.8' services: consul-server1: image: consul:1.15 container_name: consul-server1 hostname: consul-server1 ports: - "8500:8500" - "8600:8600/udp" volumes: - consul-data1:/consul/data command: > agent -server -bootstrap-expect=3 -ui -client=0.0.0.0 -bind=0.0.0.0 -retry-join=consul-server2 -retry-join=consul-server3 -datacenter=dc1 consul-server2: image: consul:1.15 container_name: consul-server2 hostname: consul-server2 volumes: - consul-data2:/consul/data command: > agent -server -bootstrap-expect=3 -bind=0.0.0.0 -retry-join=consul-server1 -retry-join=consul-server3 -datacenter=dc1 consul-server3: image: consul:1.15 container_name: consul-server3 hostname: consul-server3 volumes: - consul-data3:/consul/data command: > agent -server -bootstrap-expect=3 -bind=0.0.0.0 -retry-join=consul-server1 -retry-join=consul-server2 -datacenter=dc1 volumes: consul-data1: consul-data2: consul-data3:
2. Kubernetes 部署
yaml# consul-statefulset.yaml apiVersion: apps/v1 kind: StatefulSet metadata: name: consul spec: serviceName: consul replicas: 3 selector: matchLabels: app: consul template: metadata: labels: app: consul spec: containers: - name: consul image: consul:1.15 ports: - containerPort: 8500 name: http - containerPort: 8600 name: dns protocol: UDP env: - name: CONSUL_BIND_INTERFACE value: eth0 - name: CONSUL_GOSSIP_ENCRYPTION_KEY valueFrom: secretKeyRef: name: consul-gossip-key key: key command: - consul - agent - -server - -bootstrap-expect=3 - -ui - -client=0.0.0.0 - -data-dir=/consul/data - -retry-join=consul-0.consul.default.svc.cluster.local - -retry-join=consul-1.consul.default.svc.cluster.local - -retry-join=consul-2.consul.default.svc.cluster.local volumeMounts: - name: consul-data mountPath: /consul/data volumeClaimTemplates: - metadata: name: consul-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: storage: 10Gi
3. Ansible 部署
yaml# consul.yml --- - hosts: consul_servers become: yes vars: consul_version: "1.15.0" consul_datacenter: "dc1" consul_encrypt_key: "{{ vault_consul_encrypt_key }}" tasks: - name: Download Consul get_url: url: "https://releases.hashicorp.com/consul/{{ consul_version }}/consul_{{ consul_version }}_linux_amd64.zip" dest: /tmp/consul.zip - name: Install Consul unarchive: src: /tmp/consul.zip dest: /usr/local/bin remote_src: yes - name: Create Consul user user: name: consul system: yes shell: /bin/false - name: Create Consul directories file: path: "{{ item }}" state: directory owner: consul group: consul loop: - /etc/consul.d - /var/consul - name: Configure Consul template: src: consul.hcl.j2 dest: /etc/consul.d/consul.hcl owner: consul group: consul notify: restart consul - name: Create Consul systemd service copy: content: | [Unit] Description=Consul After=network.target [Service] User=consul Group=consul ExecStart=/usr/local/bin/consul agent -config-dir=/etc/consul.d [Install] WantedBy=multi-user.target dest: /etc/systemd/system/consul.service notify: restart consul - name: Start Consul systemd: name: consul state: started enabled: yes handlers: - name: restart consul systemd: name: consul state: restarted
配置优化
性能优化
hcl# 性能优化配置 datacenter = "dc1" data_dir = "/var/consul" server = true bootstrap_expect = 3 # 网络优化 bind_addr = "0.0.0.0" advertise_addr = "{{ GetPrivateInterfaces | attr \"address\" }}" client_addr = "0.0.0.0" # Raft 优化(raft_multiplier 位于 performance 块;生产环境建议设为 1,数值越大 Leader 故障检测越慢) raft_protocol = 3 performance { raft_multiplier = 1 } # Gossip 优化(LAN gossip 参数位于 gossip_lan 块) gossip_lan { gossip_interval = "200ms" probe_interval = "1s" } # 快照优化 raft_snapshot_interval = "30s" raft_snapshot_threshold = 8192 # 连接优化 limits { http_max_conns_per_client = 1000 rpc_max_conns_per_client = 1000 }
安全配置
hcl# TLS 配置 verify_incoming = true verify_outgoing = true verify_server_hostname = true ca_file = "/etc/consul/tls/ca.crt" cert_file = "/etc/consul/tls/consul.crt" key_file = "/etc/consul/tls/consul.key" # Gossip 加密 encrypt = "{{ vault_consul_encrypt_key }}" encrypt_verify_incoming = true encrypt_verify_outgoing = true # ACL 配置(Consul 1.11+ 中 master token 已更名为 initial_management) acl = { enabled = true default_policy = "deny" down_policy = "extend-cache" enable_token_persistence = true tokens = { initial_management = "{{ vault_consul_master_token }}" agent = "{{ vault_consul_agent_token }}" } } # 审计日志(仅 Consul Enterprise 支持) audit { enabled = true sink "file" { path = "/var/log/consul/audit.log" format = "json" delivery_mode = "async" } }
监控和告警
Prometheus 监控
yaml# prometheus.yml scrape_configs: - job_name: 'consul' consul_sd_configs: - server: 'localhost:8500' services: ['consul'] relabel_configs: - source_labels: [__meta_consul_service_metadata_prometheus_scrape] action: keep regex: true
Grafana 仪表板
json{ "dashboard": { "title": "Consul Monitoring", "panels": [ { "title": "Cluster Members", "targets": [ { "expr": "consul_memberlist_member_count" } ] }, { "title": "Service Count", "targets": [ { "expr": "consul_catalog_services" } ] }, { "title": "Health Check Status", "targets": [ { "expr": "consul_health_check_status" } ] } ] } }
告警规则
yaml# alerting_rules.yml groups: - name: consul_alerts rules: - alert: ConsulDown expr: up{job="consul"} == 0 for: 1m labels: severity: critical annotations: summary: "Consul instance down" description: "Consul instance {{ $labels.instance }} is down" - alert: ConsulLeaderMissing expr: consul_raft_leader == 0 for: 1m labels: severity: critical annotations: summary: "Consul leader missing" description: "Consul cluster has no leader" - alert: ConsulServiceUnhealthy expr: consul_health_service_status{status="passing"} == 0 for: 5m labels: severity: warning annotations: summary: "Service unhealthy" description: "Service {{ $labels.service }} is unhealthy"
备份和恢复
备份策略
bash#!/bin/bash # backup_consul.sh BACKUP_DIR="/backup/consul" DATE=$(date +%Y%m%d_%H%M%S) CONSUL_DIR="/var/consul" # 创建备份目录 mkdir -p ${BACKUP_DIR} # 备份 Raft 快照(官方推荐方式,可在集群运行时安全执行) consul snapshot save ${BACKUP_DIR}/consul_${DATE}.snap # 备份 Consul 数据目录(注意:集群运行时直接打包数据目录可能不一致,应以 snapshot 为准) tar -czf ${BACKUP_DIR}/consul_${DATE}.tar.gz ${CONSUL_DIR} # 备份 KV 数据 consul kv export > ${BACKUP_DIR}/kv_${DATE}.json # 删除 7 天前的备份 find ${BACKUP_DIR} -name "consul_*.snap" -mtime +7 -delete find ${BACKUP_DIR} -name "consul_*.tar.gz" -mtime +7 -delete find ${BACKUP_DIR} -name "kv_*.json" -mtime +7 -delete echo "Backup completed: ${BACKUP_DIR}/consul_${DATE}.snap"
恢复流程
bash#!/bin/bash # restore_consul.sh BACKUP_FILE=$1 KV_FILE=$2 if [ -z "$BACKUP_FILE" ] || [ -z "$KV_FILE" ]; then echo "Usage: $0 <backup_file> <kv_file>" exit 1 fi # 停止 Consul systemctl stop consul # 恢复数据 tar -xzf ${BACKUP_FILE} -C / # 启动 Consul systemctl start consul # 恢复 KV 数据 consul kv import < ${KV_FILE} echo "Restore completed"
故障排查
常见问题
- Leader 选举失败
bash# 检查 Raft 状态 consul operator raft list-peers # 检查网络连接 consul members -wan
- 服务注册失败
bash# 检查 Agent 状态 consul info # 检查 ACL 权限(-id 接受 token 的 accessor ID) consul acl token read -id <accessor-id>
- 健康检查失败
bash# 检查健康检查状态(Consul CLI 没有 consul health check 子命令,需使用 HTTP API) curl http://localhost:8500/v1/health/state/critical # 查看健康检查日志 journalctl -u consul | grep "health check"
最佳实践
- 高可用部署:至少 3 个 Server 节点,跨可用区分布
- 定期备份:每日备份,保留 7-30 天
- 监控告警:监控关键指标,设置合理告警阈值
- 安全加固:启用 TLS、ACL、审计日志
- 性能调优:根据负载调整配置参数
- 文档完善:维护详细的运维文档和应急预案
Consul 在生产环境中的稳定运行需要综合考虑架构设计、部署方案、配置优化、监控告警和故障处理等多个方面。