From 2c854cc6afc7b268ed34e43fa67501c8aca46eee Mon Sep 17 00:00:00 2001
From: chengkai3
Date: Sun, 12 Apr 2026 21:19:24 +0800
Subject: [PATCH] ci: harden deploy timeout and compose pull retry

---
# Review notes. They sit after the "---" separator and before the first
# "diff --git" line, so they are commentary only and touch no hunk.
#
# * Layout: this patch had been flattened onto two physical lines. The line
#   breaks below are restored from the hunk headers' line counts. Leading
#   indentation of the YAML / Markdown lines and the position of the single
#   blank context line in the first hunk are reconstructed -- TODO confirm
#   against the target files before applying.
# * main.yml, first hunk: adds `timeout: 120s` and `command_timeout: 45m` to
#   the step inputs, and exports DOCKER_CLIENT_TIMEOUT and COMPOSE_HTTP_TIMEOUT,
#   each keeping an already-set non-empty value and otherwise defaulting to 600.
#   The memory/2026-04-12.md notes in this same patch give the motivation: the
#   SSH action reported `Run Command Timeout` while `docker compose pull` was
#   still downloading layers.
# * main.yml, second hunk: replaces the single compose `pull` with
#   pull_with_retry. It runs the pull up to 3 times, sleeping attempt * 20
#   seconds between tries (20s after the first failure, 40s after the second),
#   and returns 1 after the third failure. Both the [warn] and [error] messages
#   are written to stdout.
# * pull_with_retry is invoked as a plain command; assuming this hunk is part of
#   the same script that starts with `set -euo pipefail` (first hunk), a return
#   of 1 stops the script before the `up -d` line -- TODO confirm.
# * ${COMPOSE_CMD} is expanded unquoted, so a multi-word value is split into a
#   command plus arguments. Presumably intentional for a two-word value assigned
#   outside this hunk; only the "docker-compose" fallback is visible here.
 .github/workflows/main.yml | 24 +++++++++++++++++++++++-
 MEMORY.md                  |  1 +
 memory/2026-04-12.md       | 16 ++++++++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9685ae6..6333dd9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -110,11 +110,15 @@ jobs:
           host: ${{ secrets.SERVER_HOST || vars.SERVER_HOST }}
           username: ${{ secrets.SERVER_USER || vars.SERVER_USER }}
           port: ${{ secrets.SERVER_PORT || vars.SERVER_PORT || 22 }}
+          timeout: 120s
+          command_timeout: 45m
           key: ${{ secrets.SERVER_SSH_KEY }}
           password: ${{ secrets.SERVER_PASSWORD }}
           envs: DEPLOY_PATH,API_IMAGE,WEB_IMAGE,IMAGE_TAG,NEXT_PUBLIC_API_BASE_URL,GHCR_USERNAME,GHCR_TOKEN
           script: |
             set -euo pipefail
+            export DOCKER_CLIENT_TIMEOUT="${DOCKER_CLIENT_TIMEOUT:-600}"
+            export COMPOSE_HTTP_TIMEOUT="${COMPOSE_HTTP_TIMEOUT:-600}"
 
             DEPLOY_DIR="${DEPLOY_PATH:-/opt/fquiz}"
             mkdir -p "${DEPLOY_DIR}"
@@ -223,6 +227,24 @@ jobs:
               COMPOSE_CMD="docker-compose"
             fi
 
-            ${COMPOSE_CMD} --env-file .env --env-file .images.env -f docker-compose.prod.yml pull
+            pull_with_retry() {
+              local max_retries=3
+              local attempt=1
+              while true; do
+                if ${COMPOSE_CMD} --env-file .env --env-file .images.env -f docker-compose.prod.yml pull; then
+                  break
+                fi
+                if [ "${attempt}" -ge "${max_retries}" ]; then
+                  echo "[error] docker compose pull failed after ${max_retries} attempts."
+                  return 1
+                fi
+                local sleep_seconds=$((attempt * 20))
+                echo "[warn] docker compose pull failed (attempt ${attempt}/${max_retries}), retrying in ${sleep_seconds}s..."
+                sleep "${sleep_seconds}"
+                attempt=$((attempt + 1))
+              done
+            }
+
+            pull_with_retry
             ${COMPOSE_CMD} --env-file .env --env-file .images.env -f docker-compose.prod.yml up -d --remove-orphans
             ${COMPOSE_CMD} --env-file .env --env-file .images.env -f docker-compose.prod.yml ps
diff --git a/MEMORY.md b/MEMORY.md
index 9f224a9..fbfe0e1 100644
--- a/MEMORY.md
+++ b/MEMORY.md
@@ -39,3 +39,4 @@
 - SQLAlchemy 关联加载选项(`selectinload/joinedload`)避免在模块导入期以全局常量初始化,优先在函数内惰性构建,防止导入顺序导致 mapper 提前配置失败。
 - `app.models` 包初始化需预加载全部模型模块,确保字符串关系(如 `"AuditLog"`)在启动阶段可解析。
 - 部署 compose 中 DB 镜像应通过 `POSTGRES_IMAGE` 可配置,默认使用镜像站(`docker.m.daocloud.io/library/postgres:16-alpine`)以降低 Docker Hub 网络抖动风险。
+- GitHub Actions 使用 `appleboy/ssh-action` 部署时,慢网环境需显式设置 `command_timeout`(建议 `45m`)并为 `docker compose pull` 增加重试,避免出现 `Run Command Timeout` 直接中断发布。
diff --git a/memory/2026-04-12.md b/memory/2026-04-12.md
index 67a85e9..1b8cd79 100644
--- a/memory/2026-04-12.md
+++ b/memory/2026-04-12.md
@@ -107,3 +107,19 @@
   - `docker compose build api --build-arg PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple --build-arg PIP_DEFAULT_TIMEOUT=600 --build-arg PIP_RETRIES=30`
   - `docker compose up -d api` 后状态为 `Up ... (healthy)`。
   - `curl http://127.0.0.1:8000/health` 返回 `{"status":"ok","service":"fquiz-api","version":"0.1.0"}`。
+
+## 追加修复(GitHub 发布 Run Command Timeout)
+
+- 触发问题:
+  - 发布阶段日志持续停留在 `docker compose pull` 的 layer 下载进度。
+  - `appleboy/ssh-action` 最终报错:`Run Command Timeout`,作业退出码 `1`。
+- 根因:
+  - 远端拉取镜像速度慢时,SSH Action 的命令执行超时先触发,未等到 `docker compose pull` 自然完成。
+- 处理:
+  - 更新 `.github/workflows/main.yml` 部署步骤:
+    - 为 `appleboy/ssh-action` 增加 `timeout: 120s` 与 `command_timeout: 45m`。
+    - 脚本内新增 `DOCKER_CLIENT_TIMEOUT=600`、`COMPOSE_HTTP_TIMEOUT=600` 默认值。
+    - 新增 `pull_with_retry`(最多 3 次)包装 `docker compose pull`,网络抖动时自动重试。
+- 验证建议:
+  - 推送触发 `main` 发布,观察部署日志不再在固定时长点报 `Run Command Timeout`。
+  - 远端 `docker compose ps` 应显示 `db/api/web` 均为 `Up`(或 `healthy`)。