Jiebanke (解班客) Project Operations and Maintenance Documentation
1. Operations Overview
1.1 Operations Goals
- High availability: keep the system running stably 24×7, with availability of at least 99.9%
- High performance: keep the system responsive, with API response times under 500 ms
- Security and reliability: protect data and defend against security threats
- Fast recovery: locate and recover from failures quickly
1.2 Operations Architecture
```mermaid
graph TB
    A[Monitoring and Alerting] --> B[Log Analysis]
    A --> C[Performance Monitoring]
    A --> D[Business Monitoring]
    B --> E[ELK Stack]
    C --> F[Prometheus + Grafana]
    D --> G[Custom Monitoring]
    H[Ops Automation] --> I[CI/CD Pipeline]
    H --> J[Automated Deployment]
    H --> K[Automated Backups]
    L[Incident Handling] --> M[Alert Response]
    L --> N[Fault Localization]
    L --> O[Fast Recovery]
```
1.3 Operations Team Responsibilities
| Role | Responsibilities | Required Skills |
|---|---|---|
| Ops Engineer | System monitoring, incident handling, routine maintenance | Linux, Docker, monitoring tools |
| DBA | Database administration, performance tuning, backup and recovery | MySQL, Redis, database optimization |
| Security Engineer | Security hardening, vulnerability scanning, security audits | Network security, penetration testing |
| Architect | Architecture optimization, capacity planning, technology selection | System architecture, performance tuning |
2. System Monitoring
2.1 Monitoring Hierarchy
```mermaid
graph TD
    A[System Monitoring] --> B[Infrastructure Monitoring]
    A --> C[Application Monitoring]
    A --> D[Business Monitoring]
    B --> B1[Server Resources]
    B --> B2[Network Status]
    B --> B3[Storage Capacity]
    C --> C1[Application Performance]
    C --> C2[API Response Times]
    C --> C3[Error Rates]
    D --> D1[User Behavior]
    D --> D2[Business Metrics]
    D --> D3[Revenue Data]
```
2.2 Prometheus Configuration
```yaml
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "alert_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # System metrics
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']
    scrape_interval: 5s

  # Application metrics
  - job_name: 'jiebanke-api'
    static_configs:
      - targets: ['localhost:3000']
    metrics_path: '/metrics'
    scrape_interval: 10s

  # Database metrics
  - job_name: 'mysql-exporter'
    static_configs:
      - targets: ['localhost:9104']

  # Redis metrics
  - job_name: 'redis-exporter'
    static_configs:
      - targets: ['localhost:9121']

  # Nginx metrics
  - job_name: 'nginx-exporter'
    static_configs:
      - targets: ['localhost:9113']
```
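The `jiebanke-api` job above assumes the Node service itself serves Prometheus metrics at `/metrics` on port 3000. A minimal sketch of what that could look like with `prom-client` and Express (the setup here is illustrative, not the project's confirmed code):

```javascript
// Minimal sketch: exposing /metrics from the Node API with prom-client.
const express = require('express');
const client = require('prom-client');

const app = express();
client.collectDefaultMetrics(); // process CPU, heap, event-loop lag, etc.

app.get('/metrics', async (req, res) => {
  res.set('Content-Type', client.register.contentType);
  res.end(await client.register.metrics());
});

app.listen(3000);
```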
2.3 Alert Rule Configuration
```yaml
# alert_rules.yml
groups:
  - name: system_alerts
    rules:
      # CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "Instance {{ $labels.instance }} CPU usage is above 80%, current value: {{ $value }}%"

      # Memory usage
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Instance {{ $labels.instance }} memory usage is above 85%, current value: {{ $value }}%"

      # Disk space
      - alert: HighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space"
          description: "Instance {{ $labels.instance }} disk usage is above 90%, current value: {{ $value }}%"

  - name: application_alerts
    rules:
      # API latency
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Slow API responses"
          description: "95th percentile request latency exceeds 1 second, current value: {{ $value }}s"

      # Error rate
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "High error rate"
          description: "5xx error rate exceeds 5%, current value: {{ $value }}"

      # Database connections
      - alert: HighDatabaseConnections
        expr: mysql_global_status_threads_connected / mysql_global_variables_max_connections > 0.8
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High database connection count"
          description: "Database connections exceed 80% of max_connections"
```
2.4 Grafana Dashboard
```json
{
  "dashboard": {
    "title": "Jiebanke System Monitoring",
    "panels": [
      {
        "title": "CPU Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
            "legendFormat": "CPU usage"
          }
        ]
      },
      {
        "title": "Memory Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
            "legendFormat": "Memory usage"
          }
        ]
      },
      {
        "title": "API QPS",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[1m])",
            "legendFormat": "{{ method }} {{ handler }}"
          }
        ]
      },
      {
        "title": "API Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "50th percentile"
          }
        ]
      }
    ]
  }
}
```
3. Log Management
3.1 Log Collection Architecture
```mermaid
graph LR
    A[Application Logs] --> B[Filebeat]
    C[Nginx Logs] --> B
    D[System Logs] --> B
    B --> E[Logstash]
    E --> F[Elasticsearch]
    F --> G[Kibana]
    H[Log Alerting] --> I[ElastAlert]
    I --> J[DingTalk / Email]
```
3.2 Logstash Configuration
```conf
# logstash.conf
input {
  beats {
    port => 5044
  }
}

filter {
  if [fields][log_type] == "nginx_access" {
    grok {
      match => {
        "message" => "%{NGINXACCESS}"
      }
    }
    date {
      match => [ "timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ]
    }
    mutate {
      convert => { "response_time" => "float" }
      convert => { "bytes" => "integer" }
    }
  }

  if [fields][log_type] == "application" {
    json {
      source => "message"
    }
    date {
      match => [ "timestamp", "ISO8601" ]
    }
  }

  # Note: with current Beats versions, multiline stack traces are usually
  # joined in Filebeat (multiline.* settings) rather than in a Logstash filter.
  if [fields][log_type] == "error" {
    multiline {
      pattern => "^\d{4}-\d{2}-\d{2}"
      negate => true
      what => "previous"
    }
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "jiebanke-logs-%{+YYYY.MM.dd}"
  }

  if [level] == "ERROR" {
    email {
      to => "ops@jiebanke.com"
      subject => "Application error alert"
      body => "Error message: %{message}"
    }
  }
}
```
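The `application` branch above parses each log line as a JSON document with an ISO8601 `timestamp` field. A minimal sketch of a logger that produces compatible output, assuming the API uses winston (the transport path follows the log directory used elsewhere in this document):

```javascript
// Minimal sketch: one JSON object per line, as the Logstash json filter expects.
const winston = require('winston');

const logger = winston.createLogger({
  level: 'info',
  format: winston.format.combine(
    winston.format.timestamp(),             // adds an ISO8601 "timestamp" field
    winston.format.errors({ stack: true }), // serialize Error stacks
    winston.format.json()
  ),
  transports: [
    new winston.transports.File({ filename: '/opt/jiebanke/logs/api/app.log' })
  ]
});

// Note: winston emits lowercase levels ("error"), while the output block above
// matches [level] == "ERROR"; the casing must be aligned on one side.
logger.info('order created', { orderId: 42 });
logger.error('db timeout', { code: 'ETIMEDOUT' });
```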
3.3 Log Analysis Script
```bash
#!/bin/bash
# log-analysis.sh - log analysis script

LOG_DIR="/opt/jiebanke/logs"
DATE=$(date +%Y-%m-%d)

echo "=== Jiebanke Log Analysis Report ($DATE) ==="

# Nginx access log
echo "1. Traffic statistics"
echo "Total requests: $(wc -l < $LOG_DIR/nginx/access.log)"
echo "Unique IPs: $(awk '{print $1}' $LOG_DIR/nginx/access.log | sort -u | wc -l)"
echo "Status code breakdown:"
awk '{print $9}' $LOG_DIR/nginx/access.log | sort | uniq -c | sort -nr

# Response times
echo -e "\n2. Response time analysis"
echo "Average response time: $(awk '{sum+=$10; count++} END {print sum/count}' $LOG_DIR/nginx/access.log)ms"
echo "10 slowest requests:"
sort -k10 -nr $LOG_DIR/nginx/access.log | head -10 | awk '{print $7, $10"ms"}'

# Error logs
echo -e "\n3. Error statistics"
echo "Application errors: $(grep -c "ERROR" $LOG_DIR/api/app.log)"
echo "Database errors: $(grep -c "database" $LOG_DIR/api/error.log)"

# User behavior
echo -e "\n4. User behavior analysis"
echo "Most requested APIs:"
awk '$7 ~ /^\/api/ {print $7}' $LOG_DIR/nginx/access.log | sort | uniq -c | sort -nr | head -10

echo -e "\n5. Suspicious IP analysis"
echo "Top IPs by request volume:"
awk '{print $1}' $LOG_DIR/nginx/access.log | sort | uniq -c | sort -nr | head -10
```
4. Performance Optimization
4.1 Database Optimization
```sql
-- Slow query analysis
SELECT
    query_time,
    lock_time,
    rows_sent,
    rows_examined,
    sql_text
FROM mysql.slow_log
WHERE start_time >= DATE_SUB(NOW(), INTERVAL 1 DAY)
ORDER BY query_time DESC
LIMIT 10;

-- Index usage analysis
SELECT
    table_schema,
    table_name,
    index_name,
    cardinality,
    sub_part,
    packed,
    nullable,
    index_type
FROM information_schema.statistics
WHERE table_schema = 'jiebanke'
ORDER BY cardinality DESC;

-- Table size analysis
SELECT
    table_name,
    ROUND(((data_length + index_length) / 1024 / 1024), 2) AS 'Size (MB)',
    table_rows
FROM information_schema.tables
WHERE table_schema = 'jiebanke'
ORDER BY (data_length + index_length) DESC;
```
4.2 Cache Optimization
```javascript
// Redis cache configuration
const redisConfig = {
  // Connection pool
  pool: {
    min: 5,
    max: 20,
    acquireTimeoutMillis: 30000,
    createTimeoutMillis: 30000,
    destroyTimeoutMillis: 5000,
    idleTimeoutMillis: 30000,
    reapIntervalMillis: 1000,
    createRetryIntervalMillis: 200
  },
  // Cache policy
  cache: {
    // User profiles: 30 minutes
    user: { ttl: 1800 },
    // Trip details: 1 hour
    trip: { ttl: 3600 },
    // Hot data: 6 hours
    hot: { ttl: 21600 },
    // System configuration: 24 hours
    config: { ttl: 86400 }
  }
};

// Cache warmup script
async function warmupCache() {
  console.log('Starting cache warmup...');

  // Warm up popular trips
  const hotTrips = await Trip.findAll({
    where: { status: 'active' },
    order: [['view_count', 'DESC']],
    limit: 100
  });
  for (const trip of hotTrips) {
    await redis.setex(`trip:${trip.id}`, 3600, JSON.stringify(trip));
  }

  // Warm up system configuration
  const configs = await Config.findAll();
  for (const config of configs) {
    await redis.setex(`config:${config.key}`, 86400, config.value);
  }

  console.log('Cache warmup complete');
}
```
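For reads, the TTLs above would typically be applied with a cache-aside pattern: check Redis first, fall back to MySQL on a miss, then populate the cache. A minimal sketch reusing the `redis` client and the Sequelize `Trip` model from the warmup script (`getTrip` itself is illustrative, not confirmed project code):

```javascript
// Minimal sketch: cache-aside read using the TTLs from redisConfig.
async function getTrip(id) {
  const key = `trip:${id}`;

  const cached = await redis.get(key);
  if (cached) return JSON.parse(cached); // cache hit

  const trip = await Trip.findByPk(id);  // cache miss: load from MySQL
  if (trip) {
    await redis.setex(key, redisConfig.cache.trip.ttl, JSON.stringify(trip));
  }
  return trip;
}
```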
4.3 Application Performance Optimization
```javascript
// Performance monitoring middleware
const performanceMonitor = (req, res, next) => {
  const start = Date.now();

  res.on('finish', () => {
    const duration = Date.now() - start;
    const memUsage = process.memoryUsage();

    // Record metrics
    prometheus.httpRequestDuration
      .labels(req.method, req.route?.path || req.path, res.statusCode)
      .observe(duration / 1000);
    prometheus.memoryUsage.set(memUsage.heapUsed);

    // Slow request warning
    if (duration > 1000) {
      logger.warn('Slow request detected', {
        method: req.method,
        path: req.path,
        duration,
        userAgent: req.get('User-Agent')
      });
    }
  });

  next();
};

// Database connection pool tuning
const dbConfig = {
  pool: {
    max: 20,
    min: 5,
    acquire: 30000,
    idle: 10000
  },
  benchmark: true, // required so Sequelize passes the timing argument below
  logging: (sql, timing) => {
    if (timing > 1000) {
      logger.warn('Slow query detected', { sql, timing });
    }
  }
};
```
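The middleware above references `prometheus.httpRequestDuration` and `prometheus.memoryUsage` without defining them. A plausible definition with `prom-client` (the metric names and buckets are assumptions; the histogram name is chosen to match the `http_request_duration_seconds_bucket` series used in the alert rules):

```javascript
// Minimal sketch: the metrics object consumed by performanceMonitor.
const client = require('prom-client');

const prometheus = {
  httpRequestDuration: new client.Histogram({
    name: 'http_request_duration_seconds',
    help: 'HTTP request latency in seconds',
    labelNames: ['method', 'route', 'status'],
    buckets: [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5]
  }),
  memoryUsage: new client.Gauge({
    name: 'nodejs_heap_used_bytes',
    help: 'V8 heap used in bytes'
  })
};

module.exports = prometheus;
```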
5. Backup Strategy
5.1 Backup Schedule
```mermaid
gantt
    title Backup Schedule
    dateFormat HH:mm
    axisFormat %H:%M

    section Database Backups
    Full backup         :db-full, 02:00, 1h
    Incremental backup  :db-inc, 06:00, 30m
    Incremental backup  :db-inc2, 12:00, 30m
    Incremental backup  :db-inc3, 18:00, 30m

    section File Backups
    Application files   :file-app, 03:00, 30m
    Log files           :file-log, 04:00, 30m
    Configuration files :file-config, 05:00, 15m
```
5.2 Automated Backup Script
```bash
#!/bin/bash
# auto-backup.sh - automated backup script
set -e

# Configuration
BACKUP_ROOT="/opt/backup"
DATE=$(date +%Y%m%d_%H%M%S)
RETENTION_DAYS=30
DB_NAME="jiebanke"
DB_USER="backup_user"
DB_PASSWORD="backup_password"

# Create backup directories
mkdir -p "$BACKUP_ROOT"/{database,files,logs}

# Database backup
backup_database() {
    echo "Starting database backup..."

    # Full backup
    mysqldump -u"$DB_USER" -p"$DB_PASSWORD" \
        --single-transaction \
        --routines \
        --triggers \
        --events \
        --hex-blob \
        "$DB_NAME" | gzip > "$BACKUP_ROOT/database/full_${DATE}.sql.gz"

    # Binary log backup
    mysql -u"$DB_USER" -p"$DB_PASSWORD" -e "FLUSH LOGS;"
    cp /var/lib/mysql/mysql-bin.* "$BACKUP_ROOT/database/" 2>/dev/null || true

    echo "Database backup complete"
}

# File backup
backup_files() {
    echo "Starting file backup..."

    # Application files
    tar -czf "$BACKUP_ROOT/files/app_${DATE}.tar.gz" \
        -C /opt/jiebanke \
        --exclude='node_modules' \
        --exclude='logs' \
        --exclude='tmp' \
        .

    # Configuration files
    tar -czf "$BACKUP_ROOT/files/config_${DATE}.tar.gz" \
        /etc/nginx \
        /etc/mysql \
        /etc/redis

    echo "File backup complete"
}

# Log backup
backup_logs() {
    echo "Starting log backup..."

    # Compress yesterday's logs
    find /opt/jiebanke/logs -name "*.log" -mtime +1 -exec gzip {} \;

    # Archive log files
    tar -czf "$BACKUP_ROOT/logs/logs_${DATE}.tar.gz" \
        /opt/jiebanke/logs \
        /var/log/nginx \
        /var/log/mysql

    echo "Log backup complete"
}

# Remove expired backups
cleanup_old_backups() {
    echo "Removing expired backups..."
    find "$BACKUP_ROOT" -type f -mtime +$RETENTION_DAYS -delete
    echo "Expired backup cleanup complete"
}

# Verify backups
verify_backup() {
    echo "Verifying backup integrity..."

    # Verify database backup
    if [ -f "$BACKUP_ROOT/database/full_${DATE}.sql.gz" ]; then
        gunzip -t "$BACKUP_ROOT/database/full_${DATE}.sql.gz"
        echo "Database backup verified"
    else
        echo "Database backup verification failed"
        exit 1
    fi

    # Verify file backup
    if [ -f "$BACKUP_ROOT/files/app_${DATE}.tar.gz" ]; then
        tar -tzf "$BACKUP_ROOT/files/app_${DATE}.tar.gz" > /dev/null
        echo "File backup verified"
    else
        echo "File backup verification failed"
        exit 1
    fi
}

# Send backup report
send_backup_report() {
    local status=$1
    local message="Backup job $status - $(date)"

    # DingTalk notification
    curl -X POST "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN" \
        -H "Content-Type: application/json" \
        -d "{
            \"msgtype\": \"text\",
            \"text\": {
                \"content\": \"$message\"
            }
        }"
}

# Main flow
main() {
    echo "Starting backup job - $(date)"
    trap 'send_backup_report "failed"' ERR

    backup_database
    backup_files
    backup_logs
    verify_backup
    cleanup_old_backups

    send_backup_report "succeeded"
    echo "Backup job complete - $(date)"
}

main "$@"
```
5.3 Backup Recovery Procedure
```bash
#!/bin/bash
# restore.sh - data recovery script
set -e

BACKUP_ROOT="/opt/backup"
DB_NAME="jiebanke"
DB_USER="root"
DB_PASSWORD="your_password"

# Database restore
restore_database() {
    local backup_file=$1
    echo "Starting database restore..."

    # Stop application services
    pm2 stop all

    # Create a scratch database for the restore
    mysql -u"$DB_USER" -p"$DB_PASSWORD" -e "DROP DATABASE IF EXISTS ${DB_NAME}_restore;"
    mysql -u"$DB_USER" -p"$DB_PASSWORD" -e "CREATE DATABASE ${DB_NAME}_restore;"

    # Load the dump
    gunzip -c "$backup_file" | mysql -u"$DB_USER" -p"$DB_PASSWORD" "${DB_NAME}_restore"

    # Verify data integrity
    local table_count=$(mysql -u"$DB_USER" -p"$DB_PASSWORD" -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='${DB_NAME}_restore';" -N)

    if [ "$table_count" -gt 0 ]; then
        echo "Database restore succeeded, table count: $table_count"

        # Swap in the restored database. MySQL cannot rename a whole database
        # in one statement, so move each table with RENAME TABLE.
        local ts
        ts=$(date +%Y%m%d_%H%M%S)
        mysql -u"$DB_USER" -p"$DB_PASSWORD" -e "CREATE DATABASE ${DB_NAME}_backup_${ts};"
        for table in $(mysql -u"$DB_USER" -p"$DB_PASSWORD" -N -e "SHOW TABLES FROM ${DB_NAME};"); do
            mysql -u"$DB_USER" -p"$DB_PASSWORD" -e "RENAME TABLE ${DB_NAME}.${table} TO ${DB_NAME}_backup_${ts}.${table};"
        done
        for table in $(mysql -u"$DB_USER" -p"$DB_PASSWORD" -N -e "SHOW TABLES FROM ${DB_NAME}_restore;"); do
            mysql -u"$DB_USER" -p"$DB_PASSWORD" -e "RENAME TABLE ${DB_NAME}_restore.${table} TO ${DB_NAME}.${table};"
        done

        # Restart application services
        pm2 start all
        echo "Database restore complete"
    else
        echo "Database restore failed"
        exit 1
    fi
}

# File restore
restore_files() {
    local backup_file=$1
    local restore_path="/opt/jiebanke_restore"
    echo "Starting file restore..."

    # Create restore directory
    mkdir -p "$restore_path"

    # Extract files
    tar -xzf "$backup_file" -C "$restore_path"
    echo "Files restored to: $restore_path"
}

# Usage
case "$1" in
    database)
        if [ -z "$2" ]; then
            echo "Please specify a backup file path"
            echo "Usage: $0 database /path/to/backup.sql.gz"
            exit 1
        fi
        restore_database "$2"
        ;;
    files)
        if [ -z "$2" ]; then
            echo "Please specify a backup file path"
            echo "Usage: $0 files /path/to/backup.tar.gz"
            exit 1
        fi
        restore_files "$2"
        ;;
    *)
        echo "Usage: $0 {database|files} backup_file"
        exit 1
        ;;
esac
```
6. Incident Handling
6.1 Incident Response Process
```mermaid
graph TD
    A[Alert Fired] --> B[Incident Confirmation]
    B --> C[Impact Assessment]
    C --> D[Emergency Response]
    D --> E[Root Cause Localization]
    E --> F[Fix]
    F --> G[Service Recovery]
    G --> H[Postmortem]

    D --> D1[Service Degradation]
    D --> D2[Traffic Failover]
    D --> D3[Emergency Notification]
```
6.2 Common Incident Runbook
| Incident Type | Symptoms | Steps | Expected Recovery Time |
|---|---|---|---|
| Service unresponsive | API timeouts, connection failures | 1. Check process status 2. Restart the service 3. Check logs | 5-10 min |
| Database connection failure | Database errors, connection timeouts | 1. Check database status 2. Check the connection pool 3. Restart the database | 10-15 min |
| Out of memory | OOM errors, service crashes | 1. Free memory 2. Restart the service 3. Add memory | 15-30 min |
| Disk full | Write failures, log errors | 1. Clean up logs 2. Remove temp files 3. Expand the disk | 10-20 min |
| Network failure | Connection timeouts, packet loss | 1. Check network connectivity 2. Restart network services 3. Contact the network provider | 30-60 min |
6.3 Incident Handling Script
```bash
#!/bin/bash
# emergency-fix.sh - emergency incident handling script

# Service health check
check_service_health() {
    echo "Checking service health..."

    # API service
    if ! curl -f http://localhost:3000/health > /dev/null 2>&1; then
        echo "API service unhealthy, attempting restart..."
        pm2 restart jiebanke-api
        sleep 10

        if curl -f http://localhost:3000/health > /dev/null 2>&1; then
            echo "API service restarted successfully"
        else
            echo "API service restart failed, manual intervention required"
            return 1
        fi
    fi

    # Database connectivity
    if ! mysql -u root -p"$DB_PASSWORD" -e "SELECT 1" > /dev/null 2>&1; then
        echo "Database connection failed"
        return 1
    fi

    # Redis connectivity
    if ! redis-cli ping > /dev/null 2>&1; then
        echo "Redis connection failed, attempting restart..."
        systemctl restart redis
        sleep 5
    fi

    echo "Service health check complete"
}

# Free system resources
cleanup_system() {
    echo "Cleaning up system resources..."

    # Truncate oversized log files
    find /opt/jiebanke/logs -name "*.log" -size +100M -exec truncate -s 50M {} \;

    # Remove temp files
    find /tmp -name "jiebanke-*" -mtime +1 -delete

    # Flush cache
    echo "FLUSHDB" | redis-cli

    # Restart services to release memory
    pm2 restart all

    echo "System resource cleanup complete"
}

# Incident recovery
emergency_recovery() {
    local issue_type=$1

    case "$issue_type" in
        "high_cpu")
            echo "Handling high CPU usage..."
            # Restart services
            pm2 restart all
            ;;
        "high_memory")
            echo "Handling high memory usage..."
            cleanup_system
            ;;
        "disk_full")
            echo "Handling low disk space..."
            # Clean up logs
            find /var/log -name "*.log" -mtime +7 -delete
            find /opt/jiebanke/logs -name "*.log.gz" -mtime +30 -delete
            ;;
        "service_down")
            echo "Handling service outage..."
            check_service_health
            ;;
        *)
            echo "Unknown incident type: $issue_type"
            return 1
            ;;
    esac
}

# Send incident notification
send_alert() {
    local message=$1
    local level=$2

    # DingTalk alert
    curl -X POST "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN" \
        -H "Content-Type: application/json" \
        -d "{
            \"msgtype\": \"text\",
            \"text\": {
                \"content\": \"[$level] $message\"
            },
            \"at\": {
                \"isAtAll\": true
            }
        }"
}

# Main
main() {
    local action=$1

    case "$action" in
        "check")
            check_service_health
            ;;
        "cleanup")
            cleanup_system
            ;;
        "recover")
            emergency_recovery "$2"
            ;;
        "alert")
            send_alert "$2" "$3"
            ;;
        *)
            echo "Usage: $0 {check|cleanup|recover|alert}"
            echo "  check                    - check service health"
            echo "  cleanup                  - free system resources"
            echo "  recover <type>           - incident recovery"
            echo "  alert <message> <level>  - send an alert"
            exit 1
            ;;
    esac
}

main "$@"
```
7. Security Operations
7.1 Security Checklist
- OS patch updates
- Firewall rule review
- SSL certificate validity (automatable; see the sketch after this list)
- Password policy review
- Access log audits
- Vulnerability scanning
- Backup data encryption
- Network security monitoring
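Most of these checks are covered by the scripts in this document; the SSL certificate item can be scripted as well. A minimal Node sketch using the built-in `tls` module (the hostname is a placeholder):

```javascript
// Minimal sketch: warn when a certificate is close to expiry.
const tls = require('tls');

function checkCertExpiry(host, warnDays = 14) {
  const socket = tls.connect(443, host, { servername: host }, () => {
    const cert = socket.getPeerCertificate();
    const daysLeft = (new Date(cert.valid_to) - Date.now()) / 86400000;
    console.log(`${host}: certificate expires in ${daysLeft.toFixed(1)} days`);
    if (daysLeft < warnDays) {
      console.warn(`${host}: renew the certificate soon`);
    }
    socket.end();
  });
  socket.on('error', (err) => console.error(`${host}: ${err.message}`));
}

checkCertExpiry('example.com');
```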
7.2 Security Hardening Script
```bash
#!/bin/bash
# security-hardening.sh - security hardening script

# OS hardening
system_hardening() {
    echo "Starting system hardening..."

    # Disable unnecessary services
    systemctl disable telnet
    systemctl disable rsh
    systemctl disable rlogin

    # Tighten file permissions
    chmod 600 /etc/shadow
    chmod 600 /etc/gshadow
    chmod 644 /etc/passwd
    chmod 644 /etc/group

    # Configure the firewall
    firewall-cmd --permanent --add-service=http
    firewall-cmd --permanent --add-service=https
    firewall-cmd --permanent --add-port=3000/tcp
    firewall-cmd --reload

    echo "System hardening complete"
}

# Application security checks
application_security_check() {
    echo "Starting application security checks..."

    # Lock down sensitive files
    find /opt/jiebanke -name "*.key" -exec chmod 600 {} \;
    find /opt/jiebanke -name "*.pem" -exec chmod 600 {} \;

    # Scan config files for secrets
    grep -r "password\|secret\|key" /opt/jiebanke/config/ || true

    # List open ports
    netstat -tlnp | grep LISTEN

    echo "Application security checks complete"
}

# Log security audit
security_audit() {
    echo "Starting security audit..."

    # Failed login attempts
    grep "Failed password" /var/log/secure | tail -20

    # Anomalous requests (4xx/5xx)
    awk '$9 ~ /4[0-9][0-9]|5[0-9][0-9]/ {print $1, $7, $9}' /var/log/nginx/access.log | sort | uniq -c | sort -nr | head -20

    # SQL injection attempts
    grep -i "union\|select\|drop\|insert" /var/log/nginx/access.log | head -10

    echo "Security audit complete"
}

main() {
    case "$1" in
        "harden")
            system_hardening
            application_security_check
            ;;
        "audit")
            security_audit
            ;;
        "all")
            system_hardening
            application_security_check
            security_audit
            ;;
        *)
            echo "Usage: $0 {harden|audit|all}"
            exit 1
            ;;
    esac
}

main "$@"
```
8. Capacity Planning
8.1 Resource Usage Trend Analysis
```python
#!/usr/bin/env python3
# capacity-planning.py - capacity planning analysis

from datetime import datetime

import numpy as np
import pandas as pd


class CapacityPlanner:
    def __init__(self):
        self.metrics_data = {}

    def load_metrics(self, metric_type, days=30):
        """Load metric data."""
        # In production this should pull from Prometheus or another
        # monitoring system; synthetic data is generated here as an example.
        dates = pd.date_range(end=datetime.now(), periods=days * 24, freq='H')

        if metric_type == 'cpu':
            data = np.random.normal(45, 15, len(dates))
        elif metric_type == 'memory':
            data = np.random.normal(60, 20, len(dates))
        elif metric_type == 'disk':
            data = np.random.normal(30, 10, len(dates))
        else:
            data = np.random.normal(50, 10, len(dates))

        return pd.DataFrame({'timestamp': dates, 'value': data})

    def predict_usage(self, metric_type, days_ahead=30):
        """Predict the resource usage trend."""
        df = self.load_metrics(metric_type)

        # Simple linear regression
        x = np.arange(len(df))
        y = df['value'].values

        # Fit the trend line
        z = np.polyfit(x, y, 1)
        trend = np.poly1d(z)

        # Project future usage
        future_x = np.arange(len(df), len(df) + days_ahead * 24)
        future_values = trend(future_x)

        return {
            'current_avg': np.mean(y[-7 * 24:]),  # average over the last 7 days
            'predicted_avg': np.mean(future_values),
            'trend_slope': z[0],
            'max_predicted': np.max(future_values)
        }

    def generate_report(self):
        """Generate the capacity planning report."""
        metrics = ['cpu', 'memory', 'disk']
        report = {}
        for metric in metrics:
            report[metric] = self.predict_usage(metric)
        return report

    def check_capacity_alerts(self, report):
        """Check for capacity alerts."""
        alerts = []
        for metric, data in report.items():
            if data['max_predicted'] > 80:
                alerts.append(f"{metric} usage is projected to exceed 80%, current trend: {data['trend_slope']:.2f}%/hour")
            elif data['max_predicted'] > 70:
                alerts.append(f"{metric} usage is projected to exceed 70%, worth watching")
        return alerts


if __name__ == "__main__":
    planner = CapacityPlanner()
    report = planner.generate_report()
    alerts = planner.check_capacity_alerts(report)

    print("=== Capacity Planning Report ===")
    for metric, data in report.items():
        print(f"{metric.upper()}:")
        print(f"  Current average usage: {data['current_avg']:.1f}%")
        print(f"  Predicted average usage: {data['predicted_avg']:.1f}%")
        print(f"  Predicted peak usage: {data['max_predicted']:.1f}%")
        print(f"  Growth trend: {data['trend_slope']:.2f}%/hour")
        print()

    if alerts:
        print("=== Capacity Alerts ===")
        for alert in alerts:
            print(f"⚠️ {alert}")
    else:
        print("✅ Capacity is sufficient; no scaling needed")
```
9. Operations Automation
9.1 Automated Operations Workflow
```yaml
# .github/workflows/ops-automation.yml
name: Ops Automation

on:
  schedule:
    - cron: '0 2 * * *'  # daily at 02:00 (GitHub Actions cron runs in UTC)
  workflow_dispatch:

jobs:
  daily-maintenance:
    runs-on: ubuntu-latest
    steps:
      - name: System health check
        run: |
          curl -f ${{ secrets.HEALTH_CHECK_URL }} || exit 1

      - name: Clean up log files
        run: |
          ssh ${{ secrets.SERVER_HOST }} "find /opt/jiebanke/logs -name '*.log' -size +100M -exec truncate -s 50M {} \;"

      - name: Database optimization
        run: |
          ssh ${{ secrets.SERVER_HOST }} "mysql -u root -p${{ secrets.DB_PASSWORD }} -e 'OPTIMIZE TABLE jiebanke.users, jiebanke.trips;'"

      - name: Cache warmup
        run: |
          curl -X POST ${{ secrets.API_URL }}/admin/cache/warmup

      - name: Send ops report
        run: |
          curl -X POST ${{ secrets.DINGTALK_WEBHOOK }} \
            -H "Content-Type: application/json" \
            -d '{"msgtype": "text", "text": {"content": "Daily maintenance tasks completed"}}'
```
9.2 Ops Toolkit
```bash
#!/bin/bash
# ops-toolkit.sh - ops toolkit

# Quick diagnosis
quick_diagnosis() {
    echo "=== Quick System Diagnosis ==="

    # Load average
    echo "System load:"
    uptime

    # Memory usage
    echo -e "\nMemory usage:"
    free -h

    # Disk usage
    echo -e "\nDisk usage:"
    df -h

    # Network connections
    echo -e "\nNetwork connections:"
    netstat -an | grep :3000

    # Process status
    echo -e "\nProcess status:"
    pm2 status

    # Database status
    echo -e "\nDatabase status:"
    mysql -u root -p"$DB_PASSWORD" -e "SHOW PROCESSLIST;" | head -10
}

# Performance analysis
performance_analysis() {
    echo "=== Performance Analysis ==="

    # Top 10 by CPU
    echo "Top 10 processes by CPU:"
    ps aux --sort=-%cpu | head -11

    # Top 10 by memory
    echo -e "\nTop 10 processes by memory:"
    ps aux --sort=-%mem | head -11

    # IO wait
    echo -e "\nIO wait:"
    iostat -x 1 3

    # Network traffic
    echo -e "\nNetwork traffic:"
    iftop -t -s 10
}

# Log analysis
log_analysis() {
    local log_type=$1
    local lines=${2:-100}

    case "$log_type" in
        "error")
            echo "Recent error logs:"
            tail -n $lines /opt/jiebanke/logs/error.log
            ;;
        "access")
            echo "Access log statistics:"
            tail -n $lines /var/log/nginx/access.log | awk '{print $1}' | sort | uniq -c | sort -nr | head -10
            ;;
        "slow")
            echo "Slow query log:"
            tail -n $lines /var/log/mysql/slow.log
            ;;
        *)
            echo "Supported log types: error, access, slow"
            ;;
    esac
}

# Service management
service_management() {
    local action=$1
    local service=$2

    case "$action" in
        "restart")
            echo "Restarting service: $service"
            if [ "$service" = "api" ]; then
                pm2 restart jiebanke-api
            elif [ "$service" = "nginx" ]; then
                systemctl restart nginx
            elif [ "$service" = "mysql" ]; then
                systemctl restart mysql
            elif [ "$service" = "redis" ]; then
                systemctl restart redis
            else
                echo "Unknown service: $service"
            fi
            ;;
        "status")
            echo "Service status: $service"
            if [ "$service" = "api" ]; then
                pm2 status jiebanke-api
            else
                systemctl status $service
            fi
            ;;
        *)
            echo "Supported actions: restart, status"
            ;;
    esac
}

# Main menu
show_menu() {
    echo "=== Jiebanke Ops Toolkit ==="
    echo "1. Quick diagnosis"
    echo "2. Performance analysis"
    echo "3. Log analysis"
    echo "4. Service management"
    echo "5. Exit"
    echo -n "Select an option: "
}

# Main
main() {
    if [ $# -eq 0 ]; then
        # Interactive mode
        while true; do
            show_menu
            read choice

            case $choice in
                1) quick_diagnosis ;;
                2) performance_analysis ;;
                3)
                    echo -n "Log type (error/access/slow): "
                    read log_type
                    log_analysis $log_type
                    ;;
                4)
                    echo -n "Action (restart/status): "
                    read action
                    echo -n "Service name: "
                    read service
                    service_management $action $service
                    ;;
                5) exit 0 ;;
                *) echo "Invalid selection" ;;
            esac

            echo -e "\nPress Enter to continue..."
            read
        done
    else
        # Command-line mode
        case "$1" in
            "diagnosis") quick_diagnosis ;;
            "performance") performance_analysis ;;
            "log") log_analysis "$2" "$3" ;;
            "service") service_management "$2" "$3" ;;
            *)
                echo "Usage: $0 [diagnosis|performance|log|service]"
                exit 1
                ;;
        esac
    fi
}

main "$@"
```
10. Summary
This document covers the full scope of operations management for the Jiebanke project, including:
10.1 Core Operations Capabilities
- Monitoring and alerting: full-stack system monitoring with intelligent alerting
- Log management: centralized log collection and analysis
- Performance optimization: database, cache, and application tuning
- Backup and recovery: automated backups and fast recovery mechanisms
- Incident handling: standardized incident response and handling procedures
10.2 Degree of Automation
- Monitoring automation: automatic monitoring, alerting, and reporting
- Backup automation: scheduled backups, verification, and cleanup
- Deployment automation: CI/CD integration and automated deployment
- Ops automation: automated routine maintenance tasks
10.3 Operations Best Practices
- Prevention first: use monitoring and early warning to head off failures
- Fast response: maintain a well-defined incident response process
- Continuous improvement: review and refine ops processes regularly
- Documentation driven: keep operations documentation and the knowledge base complete
Following this plan keeps the Jiebanke project highly available, performant, and secure.