refactor: 替换项目中的"yudao"为"AIOTAGRO",并清理相关配置文件
This commit is contained in:
574
docs/运维文档.md
Normal file
574
docs/运维文档.md
Normal file
@@ -0,0 +1,574 @@
|
||||
# 运维文档
|
||||
|
||||
## 运维概述
|
||||
|
||||
AIOTAGRO 管理系统运维文档涵盖系统监控、性能优化、故障处理、安全维护等日常运维工作。本文档为运维团队提供完整的操作指南和最佳实践。
|
||||
|
||||
## 系统监控
|
||||
|
||||
### 监控指标
|
||||
|
||||
#### 1. 应用性能指标
|
||||
|
||||
| 指标 | 阈值 | 说明 | 监控工具 |
|
||||
|------|------|------|----------|
|
||||
| 响应时间 | < 2秒 | 页面加载时间 | Prometheus |
|
||||
| 错误率 | < 1% | HTTP 错误率 | Grafana |
|
||||
| 吞吐量 | > 1000 RPM | 请求处理能力 | New Relic |
|
||||
| 可用性 | > 99.9% | 系统可用性 | Uptime Robot |
|
||||
|
||||
#### 2. 服务器资源指标
|
||||
|
||||
| 指标 | 阈值 | 说明 | 监控工具 |
|
||||
|------|------|------|----------|
|
||||
| CPU 使用率 | < 80% | CPU 负载 | Node Exporter |
|
||||
| 内存使用率 | < 85% | 内存使用 | cAdvisor |
|
||||
| 磁盘使用率 | < 90% | 磁盘空间 | Disk Usage |
|
||||
| 网络流量 | 无限制 | 网络带宽 | NetData |
|
||||
|
||||
### 监控配置
|
||||
|
||||
#### 1. Prometheus 配置
|
||||
|
||||
```yaml
|
||||
# prometheus.yml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'aiotagro-frontend'
|
||||
static_configs:
|
||||
- targets: ['localhost:3000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'nginx'
|
||||
static_configs:
|
||||
- targets: ['localhost:9113']
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'node-exporter'
|
||||
static_configs:
|
||||
- targets: ['localhost:9100']
|
||||
scrape_interval: 30s
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ['localhost:9093']
|
||||
|
||||
rule_files:
|
||||
- "alerts.yml"
|
||||
```
|
||||
|
||||
#### 2. 告警规则
|
||||
|
||||
```yaml
|
||||
# alerts.yml
|
||||
groups:
|
||||
- name: aiotagro-frontend
|
||||
rules:
|
||||
- alert: HighErrorRate
|
||||
expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "高错误率警报"
|
||||
description: "错误率超过 5%,当前值: {{ $value }}"
|
||||
|
||||
- alert: HighResponseTime
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "高响应时间警报"
|
||||
description: "95% 响应时间超过 2 秒,当前值: {{ $value }}"
|
||||
|
||||
- alert: ServiceDown
|
||||
expr: up{job="aiotagro-frontend"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "服务宕机警报"
|
||||
description: "AIOTAGRO 前端服务已宕机"
|
||||
```
|
||||
|
||||
### 监控仪表板
|
||||
|
||||
#### 1. Grafana 配置
|
||||
|
||||
```json
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "AIOTAGRO 监控面板",
|
||||
"panels": [
|
||||
{
|
||||
"title": "响应时间",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "95% 响应时间"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "错误率",
|
||||
"type": "singlestat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
|
||||
"format": "percent"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 性能优化
|
||||
|
||||
### 1. 前端优化
|
||||
|
||||
#### 代码分割配置
|
||||
|
||||
```javascript
|
||||
// vite.config.js
|
||||
export default defineConfig({
|
||||
build: {
|
||||
rollupOptions: {
|
||||
output: {
|
||||
manualChunks: {
|
||||
vendor: ['vue', 'vue-router', 'pinia'],
|
||||
ui: ['ant-design-vue', '@ant-design/icons-vue'],
|
||||
utils: ['lodash', 'dayjs', 'axios'],
|
||||
charts: ['echarts']
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
```
|
||||
|
||||
#### 缓存策略优化
|
||||
|
||||
```nginx
|
||||
# 静态资源缓存
|
||||
location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ {
|
||||
expires 1y;
|
||||
add_header Cache-Control "public, immutable";
|
||||
add_header Vary Accept-Encoding;
|
||||
|
||||
# 启用预压缩静态文件(brotli_static 需要安装 ngx_brotli 模块,gzip_static 需要 ngx_http_gzip_static_module)
|
||||
brotli_static on;
|
||||
gzip_static on;
|
||||
}
|
||||
|
||||
# API 响应缓存
|
||||
location /api/ {
|
||||
proxy_cache api_cache;
|
||||
proxy_cache_valid 200 302 5m;
|
||||
proxy_cache_valid 404 1m;
|
||||
proxy_cache_use_stale error timeout updating http_500 http_502 http_503 http_504;
|
||||
add_header X-Cache-Status $upstream_cache_status;
|
||||
}
|
||||
```
|
||||
|
||||
### 2. 服务器优化
|
||||
|
||||
#### Nginx 性能调优
|
||||
|
||||
```nginx
|
||||
# /etc/nginx/nginx.conf
|
||||
worker_processes auto;
|
||||
worker_cpu_affinity auto;
|
||||
worker_rlimit_nofile 100000;
|
||||
|
||||
events {
|
||||
worker_connections 4096;
|
||||
use epoll;
|
||||
multi_accept on;
|
||||
}
|
||||
|
||||
http {
|
||||
# 基础配置
|
||||
sendfile on;
|
||||
tcp_nopush on;
|
||||
tcp_nodelay on;
|
||||
keepalive_timeout 65;
|
||||
keepalive_requests 1000;
|
||||
|
||||
# 缓冲区优化
|
||||
client_body_buffer_size 128k;
|
||||
client_max_body_size 100m;
|
||||
client_header_buffer_size 1k;
|
||||
large_client_header_buffers 4 4k;
|
||||
output_buffers 1 32k;
|
||||
postpone_output 1460;
|
||||
|
||||
# Gzip 压缩
|
||||
gzip on;
|
||||
gzip_vary on;
|
||||
gzip_min_length 1024;
|
||||
gzip_comp_level 6;
|
||||
gzip_types
|
||||
text/plain
|
||||
text/css
|
||||
text/xml
|
||||
text/javascript
|
||||
application/json
|
||||
application/javascript
|
||||
application/rss+xml
|
||||
application/atom+xml
|
||||
image/svg+xml;
|
||||
}
|
||||
```
|
||||
|
||||
#### 系统内核优化
|
||||
|
||||
```bash
|
||||
# /etc/sysctl.conf
|
||||
# 网络优化
|
||||
net.core.somaxconn = 65535
|
||||
net.core.netdev_max_backlog = 65536
|
||||
net.ipv4.tcp_max_syn_backlog = 65536
|
||||
net.ipv4.tcp_syncookies = 1
|
||||
net.ipv4.tcp_tw_reuse = 1
|
||||
# net.ipv4.tcp_tw_recycle = 0  # 该参数已在 Linux 4.12 中移除,新内核上保留此行会导致 sysctl -p 报错;仅旧内核需要显式关闭
|
||||
net.ipv4.tcp_fin_timeout = 30
|
||||
net.ipv4.tcp_keepalive_time = 1200
|
||||
|
||||
# 内存优化
|
||||
vm.swappiness = 10
|
||||
vm.dirty_ratio = 60
|
||||
vm.dirty_background_ratio = 2
|
||||
```
|
||||
|
||||
## 故障处理
|
||||
|
||||
### 1. 常见故障及解决方案
|
||||
|
||||
#### 服务不可用
|
||||
|
||||
**症状**: 网站无法访问,返回 502/503 错误
|
||||
**解决方案**:
|
||||
```bash
|
||||
# 检查服务状态
|
||||
sudo systemctl status nginx
|
||||
sudo systemctl status node-exporter
|
||||
|
||||
# 检查端口占用
|
||||
sudo netstat -tlnp | grep :80
|
||||
sudo netstat -tlnp | grep :3000
|
||||
|
||||
# 重启服务
|
||||
sudo systemctl restart nginx
|
||||
```
|
||||
|
||||
#### 性能下降
|
||||
|
||||
**症状**: 响应时间变慢,CPU/内存使用率高
|
||||
**解决方案**:
|
||||
```bash
|
||||
# 查看系统资源
|
||||
top
|
||||
htop
|
||||
iotop
|
||||
|
||||
# 检查 Nginx 状态
|
||||
sudo nginx -t
|
||||
sudo tail -f /var/log/nginx/error.log
|
||||
|
||||
# 清理缓存
|
||||
sync && echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
|
||||
```
|
||||
|
||||
#### 磁盘空间不足
|
||||
|
||||
**症状**: 写入失败,系统告警
|
||||
**解决方案**:
|
||||
```bash
|
||||
# 检查磁盘使用
|
||||
df -h
|
||||
du -sh /var/log/nginx/
|
||||
|
||||
# 清理日志文件
|
||||
sudo find /var/log/nginx -name "*.log" -mtime +7 -delete
|
||||
sudo truncate -s 0 /var/log/nginx/error.log
|
||||
|
||||
# 清理备份文件
|
||||
find /opt/aiotagro/backups -name "*.tar.gz" -mtime +30 -delete
|
||||
```
|
||||
|
||||
### 2. 故障排查流程
|
||||
|
||||
#### 快速诊断脚本
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# diagnose.sh
|
||||
|
||||
echo "=== AIOTAGRO 系统诊断 ==="
|
||||
echo "检查时间: $(date)"
|
||||
|
||||
echo -e "\n1. 系统资源检查"
|
||||
echo "CPU 使用率: $(top -bn1 | grep "Cpu(s)" | awk '{print $2}')%"
|
||||
echo "内存使用率: $(free | grep Mem | awk '{printf "%.2f%%", $3/$2 * 100}')"
|
||||
echo "磁盘使用率: $(df -h / | awk 'NR==2 {print $5}')"
|
||||
|
||||
echo -e "\n2. 服务状态检查"
|
||||
services=("nginx" "node-exporter" "prometheus")
|
||||
for service in "${services[@]}"; do
|
||||
status=$(systemctl is-active $service)
|
||||
echo "$service: $status"
|
||||
done
|
||||
|
||||
echo -e "\n3. 端口检查"
|
||||
ports=("80" "3000" "9100" "9090")
|
||||
for port in "${ports[@]}"; do
|
||||
if netstat -tln | grep ":$port " > /dev/null; then
|
||||
echo "端口 $port: 正常"
|
||||
else
|
||||
echo "端口 $port: 异常"
|
||||
fi
|
||||
done
|
||||
|
||||
echo -e "\n4. 日志检查"
|
||||
if sudo tail -n 10 /var/log/nginx/error.log | grep -i error; then
|
||||
echo "发现 Nginx 错误日志"
|
||||
else
|
||||
echo "Nginx 错误日志正常"
|
||||
fi
|
||||
|
||||
echo -e "\n诊断完成"
|
||||
```
|
||||
|
||||
## 安全维护
|
||||
|
||||
### 1. 安全扫描
|
||||
|
||||
#### 漏洞扫描配置
|
||||
|
||||
```yaml
|
||||
# .github/workflows/security-scan.yml
|
||||
name: Security Scan
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 2 * * 1' # 每周一凌晨2点
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
security-scan:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Run security audit
|
||||
run: |
|
||||
npm audit --audit-level moderate
|
||||
pnpm audit
|
||||
|
||||
- name: Run SAST scan
|
||||
uses: github/codeql-action/init@v2
|
||||
with:
|
||||
languages: javascript
|
||||
|
||||
- name: Run SAST analysis
|
||||
uses: github/codeql-action/analyze@v2
|
||||
|
||||
- name: Run dependency check
|
||||
uses: dependency-check/Dependency-Check_Action@main
|
||||
with:
|
||||
project: 'AIOTAGRO Frontend'
|
||||
path: '.'
|
||||
format: 'HTML'
|
||||
|
||||
- name: Upload report
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: security-reports
|
||||
path: reports/
|
||||
```
|
||||
|
||||
### 2. 安全更新
|
||||
|
||||
#### 自动更新脚本
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# security-update.sh
|
||||
|
||||
set -e
|
||||
|
||||
echo "开始安全更新..."
|
||||
|
||||
# 更新系统包
|
||||
sudo apt update
|
||||
sudo apt upgrade -y
|
||||
|
||||
# 更新 Node.js 依赖
|
||||
pnpm update --latest
|
||||
|
||||
# 运行安全审计
|
||||
pnpm audit || true  # 发现漏洞时 pnpm audit 返回非零退出码,避免 set -e 在修复步骤之前中止脚本
|
||||
|
||||
# 修复安全漏洞
|
||||
if pnpm audit | grep -q "high"; then
|
||||
echo "发现高危漏洞,尝试修复..."
|
||||
pnpm audit --fix && pnpm install
|
||||
fi
|
||||
|
||||
# 重新构建应用
|
||||
pnpm build:antd
|
||||
|
||||
# 重启服务
|
||||
sudo systemctl restart nginx
|
||||
|
||||
echo "安全更新完成"
|
||||
```
|
||||
|
||||
## 备份和恢复
|
||||
|
||||
### 1. 自动化备份
|
||||
|
||||
#### 完整备份脚本
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# full-backup.sh
|
||||
|
||||
set -e
|
||||
|
||||
BACKUP_DIR="/opt/aiotagro/backups"
|
||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||
BACKUP_FILE="aiotagro_full_backup_$TIMESTAMP.tar.gz"
|
||||
|
||||
echo "开始完整备份..."
|
||||
|
||||
# 创建备份目录
|
||||
mkdir -p "$BACKUP_DIR"
|
||||
|
||||
# 备份应用代码
|
||||
echo "备份应用代码..."
|
||||
tar -czf "$BACKUP_DIR/app_$TIMESTAMP.tar.gz" \
|
||||
--exclude=node_modules \
|
||||
--exclude=dist \
|
||||
--exclude=.git \
|
||||
/opt/aiotagro/frontend
|
||||
|
||||
# 备份配置文件
|
||||
echo "备份配置文件..."
|
||||
tar -czf "$BACKUP_DIR/config_$TIMESTAMP.tar.gz" \
|
||||
/etc/nginx \
|
||||
/etc/systemd/system/aiotagro.service
|
||||
|
||||
# 备份数据库(如果有)
|
||||
# echo "备份数据库..."
|
||||
# pg_dump -U postgres aiotagro > "$BACKUP_DIR/db_$TIMESTAMP.sql"
|
||||
|
||||
# 创建完整备份包
|
||||
echo "创建完整备份包..."
|
||||
tar -czf "$BACKUP_DIR/$BACKUP_FILE" -C "$BACKUP_DIR" \
|
||||
"app_$TIMESTAMP.tar.gz" \
|
||||
"config_$TIMESTAMP.tar.gz"
|
||||
|
||||
# 清理临时文件
|
||||
rm -f "$BACKUP_DIR/app_$TIMESTAMP.tar.gz" \
|
||||
"$BACKUP_DIR/config_$TIMESTAMP.tar.gz"
|
||||
|
||||
# 上传到云存储(可选)
|
||||
# echo "上传到云存储..."
|
||||
# aws s3 cp "$BACKUP_DIR/$BACKUP_FILE" s3://aiotagro-backups/
|
||||
|
||||
# 清理旧备份(保留最近30天)
|
||||
find "$BACKUP_DIR" -name "aiotagro_full_backup_*.tar.gz" -mtime +30 -delete
|
||||
|
||||
echo "备份完成: $BACKUP_DIR/$BACKUP_FILE"
|
||||
```
|
||||
|
||||
### 2. 灾难恢复
|
||||
|
||||
#### 恢复流程
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# disaster-recovery.sh
|
||||
|
||||
set -e
|
||||
|
||||
BACKUP_FILE="$1"
|
||||
|
||||
if [ -z "$BACKUP_FILE" ]; then
|
||||
echo "用法: $0 <备份文件>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "开始灾难恢复..."
|
||||
|
||||
# 停止服务
|
||||
sudo systemctl stop nginx
|
||||
|
||||
# 解压备份文件
|
||||
tar -xzf "$BACKUP_FILE" -C /tmp/
|
||||
|
||||
# 恢复应用代码
|
||||
tar -xzf /tmp/app_*.tar.gz -C /
|
||||
|
||||
# 恢复配置文件
|
||||
tar -xzf /tmp/config_*.tar.gz -C /
|
||||
|
||||
# 重新加载系统配置
|
||||
sudo systemctl daemon-reload
|
||||
|
||||
# 启动服务
|
||||
sudo systemctl start nginx
|
||||
|
||||
# 验证恢复
|
||||
sleep 5
|
||||
if curl -f http://localhost/ > /dev/null 2>&1; then
|
||||
echo "恢复成功"
|
||||
else
|
||||
echo "恢复失败,请检查日志"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 清理临时文件
|
||||
rm -rf /tmp/app_*.tar.gz /tmp/config_*.tar.gz
|
||||
|
||||
echo "灾难恢复完成"
|
||||
```
|
||||
|
||||
## 日常维护
|
||||
|
||||
### 1. 维护检查清单
|
||||
|
||||
#### 每日检查
|
||||
- [ ] 系统资源使用情况
|
||||
- [ ] 服务运行状态
|
||||
- [ ] 错误日志检查
|
||||
- [ ] 备份状态验证
|
||||
- [ ] 安全告警检查
|
||||
|
||||
#### 每周检查
|
||||
- [ ] 性能指标分析
|
||||
- [ ] 安全漏洞扫描
|
||||
- [ ] 日志文件清理
|
||||
- [ ] 备份文件验证
|
||||
- [ ] 系统更新检查
|
||||
|
||||
#### 每月检查
|
||||
- [ ] 容量规划评估
|
||||
- [ ] 安全审计
|
||||
- [ ] 性能优化评估
|
||||
- [ ] 灾难恢复演练
|
||||
- [ ] 文档更新
|
||||
|
||||
通过以上运维配置和流程,AIOTAGRO 管理系统可以实现稳定、安全、高效的运维管理。
|
||||
Reference in New Issue
Block a user