diff --git a/deploy/systemd/jenkins-agent@.service b/deploy/systemd/jenkins-agent@.service new file mode 100644 index 00000000..45d16c2a --- /dev/null +++ b/deploy/systemd/jenkins-agent@.service @@ -0,0 +1,25 @@ +[Unit] +Description=Jenkins inbound agent %i +Wants=network-online.target +After=network-online.target +StartLimitIntervalSec=0 + +[Service] +Type=simple +User=root +Group=root +EnvironmentFile=/etc/jenkins-agent/%i.env +# "-" prefix: do not fail with CHDIR when this directory is absent, e.g. when the installer was given a different --workdir. +WorkingDirectory=-/var/lib/jenkins/agent/%i +ExecStart=/usr/local/bin/jenkins-inbound-agent-start %i +Restart=always +RestartSec=10 +KillSignal=SIGINT +TimeoutStopSec=30 + +# The production pipelines still perform privileged operations: server provisioning and writing systemd / Nginx configuration. +# Before the agent is dropped to the jenkins user, the pipeline commands must first be narrowed to an exact sudo allowlist. +PrivateTmp=true + +[Install] +WantedBy=multi-user.target diff --git a/docs/technical/PRODUCTION_DEPLOYMENT_PLAN_2026-05-02.md b/docs/technical/PRODUCTION_DEPLOYMENT_PLAN_2026-05-02.md index 4ba80518..ae8e6639 100644 --- a/docs/technical/PRODUCTION_DEPLOYMENT_PLAN_2026-05-02.md +++ b/docs/technical/PRODUCTION_DEPLOYMENT_PLAN_2026-05-02.md @@ -202,10 +202,43 @@ Jenkins 可运行在 Windows 或其他机器上,本机 Windows 只作为人工 - Jenkins Job 参数不暴露真实节点名、IP 或带 IP 的标签。 - 生产机已作为独立 Linux Jenkins agent 接入,节点名使用脱敏名称 `genarrative-release-deploy-01`,调度标签只使用 `linux` 与 `genarrative-release-deploy`。 -- 生产机真实连接地址只允许保存在 Jenkins 节点 SSH launcher 的 `host` 字段中,不能写入节点名、调度标签、Job 参数默认值或文档推荐命令。 +- 生产机 agent 启动方式统一改为 inbound agent + systemd 自守护,不再依赖 Jenkins controller 通过 SSH launcher 长期拉起。SSH 只作为首次登录和安装 systemd 服务的运维通道。 +- 生产机真实连接地址只允许保存在 Jenkins 节点连接配置或人工运维 SSH 配置中,不能写入节点名、调度标签、Job 参数默认值或文档推荐命令。 - 发布 Job 通过 `DEPLOY_TARGET` 选择逻辑部署目标,再在 Jenkinsfile 内部映射到 Linux-only 脱敏调度表达式:`development -> linux && genarrative-build`,`release -> linux && genarrative-release-deploy`。 - 用途:服务器配置、发布静态网站、发布 `api-server`、发布 SpacetimeDB 模块、数据库导入导出、维护模式切换。 +### Jenkins inbound agent 自恢复 + +发布 agent 必须由目标 Linux 机器主动连接 Jenkins controller,并由 systemd 托管: + +- Jenkins 节点 Launch method 使用 inbound agent,优先启用 WebSocket。这样目标机只需要能访问 Jenkins Web 地址,不依赖
controller 每次 SSH 拉起 agent。 +- 目标机安装 `deploy/systemd/jenkins-agent@.service`、`scripts/deploy/jenkins-inbound-agent-start.sh` 与 `scripts/deploy/install-jenkins-inbound-agent.sh`。 +- systemd 服务名采用 `jenkins-agent@<agent-name>.service`,例如 `jenkins-agent@genarrative-release-deploy-01.service`。 +- systemd 自身 `WorkingDirectory` 保持 `/var/lib/jenkins/agent/<agent-name>`;Jenkins remoting `-workDir` 可继续使用旧 SSH agent 的 `/root/jenkins-agent`,避免迁移时 workspace 和缓存路径漂移。 +- inbound secret 只能放在目标机 `/etc/jenkins-agent/<agent-name>.secret` 或等价 Secret Text 注入位置,不能提交到 Git,也不能写入 Jenkinsfile 默认参数。 +- systemd unit 使用 `Restart=always` 和 `RestartSec=10`;agent Java 进程退出、网络短断或机器重启后由 systemd 自动恢复,不需要人工盯着 Jenkins 页面手动重启。 +- 当前 `Genarrative-Server-Provision` 仍负责 systemd、Nginx、`/opt/genarrative`、`/etc/genarrative` 等特权写入,因此 inbound agent 默认仍按现有 root 执行口径迁移。若后续改为 `jenkins` 用户运行 agent,必须先把生产流水线需要的特权命令收敛为精确 `NOPASSWD` sudoers 白名单。 + +如果 Jenkins controller 只运行在本地 Windows,不直接对目标机暴露公网地址,需要在本地控制机启动 `scripts/deploy/jenkins-agent-reverse-tunnel.ps1`。该脚本通过同一条 SSH 会话把远端 `127.0.0.1:18080` 转到本地 Jenkins Web `127.0.0.1:8080`,把远端 `127.0.0.1:50000` 转到本地 Jenkins inbound TCP agent port `127.0.0.1:50000`,并在隧道断开后自动重试。此时远端 agent 的 `JENKINS_URL` 固定写 `http://127.0.0.1:18080/`,不写本地 Windows 的 `127.0.0.1:8080`。 + +本地反向隧道脚本不内置目标机地址;注册 Windows 计划任务时必须显式传入 `-RemoteHost <host>`,真实 IP 或主机名只保存在本地计划任务配置中,不提交到 Git。 + +首次迁移示例: + +```bash +sudo install -m 0600 /tmp/genarrative-release-deploy-01.secret /etc/jenkins-agent/genarrative-release-deploy-01.secret +sudo scripts/deploy/install-jenkins-inbound-agent.sh \ + --agent-name genarrative-release-deploy-01 \ + --jenkins-url http://127.0.0.1:18080/ \ + --secret-file /etc/jenkins-agent/genarrative-release-deploy-01.secret \ + --workdir /root/jenkins-agent \ + --java-bin /usr/bin/java +sudo systemctl status jenkins-agent@genarrative-release-deploy-01.service --no-pager -l +journalctl -u jenkins-agent@genarrative-release-deploy-01.service -f +``` + +如果 Jenkins controller 暂时仍配置为 SSH launcher,只能作为过渡方案使用:需要把 SSH launch timeout 拉长、增加
retry 和 retry wait、固定 Java 路径,并确认 `ssh user@host 'java -version'` 稳定返回。最终仍要切到 inbound + systemd,避免 SSH 连接卡住时阻塞发布队列。 + ### Git 仓库访问 Jenkins controller 与 Linux agent 看到的 Git 服务地址不同,必须拆成两层配置: @@ -538,6 +571,7 @@ WASM_SOURCE="${CARGO_TARGET_DIR}/wasm32-unknown-unknown/release/spacetime_module - [x] `deploy/systemd/spacetimedb.service` - [x] `deploy/systemd/genarrative-api.service` +- [x] `deploy/systemd/jenkins-agent@.service` - [x] `deploy/nginx/genarrative.conf` - [x] `deploy/nginx/genarrative-dev-http.conf` - [x] `deploy/nginx/snippets/genarrative-maintenance.conf` @@ -545,6 +579,9 @@ WASM_SOURCE="${CARGO_TARGET_DIR}/wasm32-unknown-unknown/release/spacetime_module - [x] `scripts/deploy/maintenance-on.sh` - [x] `scripts/deploy/maintenance-off.sh` - [x] `scripts/deploy/maintenance-status.sh` +- [x] `scripts/deploy/jenkins-agent-reverse-tunnel.ps1` +- [x] `scripts/deploy/jenkins-inbound-agent-start.sh` +- [x] `scripts/deploy/install-jenkins-inbound-agent.sh` - [x] `scripts/build-production-release.sh` - [x] `scripts/jenkins-checkout-source.sh` - [x] `scripts/deploy/production-web-deploy.sh` diff --git a/scripts/build-production-release.sh b/scripts/build-production-release.sh index ce6f3c19..adf46c33 100644 --- a/scripts/build-production-release.sh +++ b/scripts/build-production-release.sh @@ -370,10 +370,15 @@ mkdir -p "${TARGET_DIR}/scripts" "${TARGET_DIR}/deploy" cp "${SCRIPT_DIR}/deploy/maintenance-on.sh" "${TARGET_DIR}/scripts/maintenance-on.sh" cp "${SCRIPT_DIR}/deploy/maintenance-off.sh" "${TARGET_DIR}/scripts/maintenance-off.sh" cp "${SCRIPT_DIR}/deploy/maintenance-status.sh" "${TARGET_DIR}/scripts/maintenance-status.sh" +cp "${SCRIPT_DIR}/deploy/jenkins-inbound-agent-start.sh" "${TARGET_DIR}/scripts/jenkins-inbound-agent-start.sh" +cp "${SCRIPT_DIR}/deploy/install-jenkins-inbound-agent.sh" "${TARGET_DIR}/scripts/install-jenkins-inbound-agent.sh" +cp "${SCRIPT_DIR}/deploy/jenkins-agent-reverse-tunnel.ps1" "${TARGET_DIR}/scripts/jenkins-agent-reverse-tunnel.ps1" 
chmod +x \ "${TARGET_DIR}/scripts/maintenance-on.sh" \ "${TARGET_DIR}/scripts/maintenance-off.sh" \ - "${TARGET_DIR}/scripts/maintenance-status.sh" + "${TARGET_DIR}/scripts/maintenance-status.sh" \ + "${TARGET_DIR}/scripts/jenkins-inbound-agent-start.sh" \ + "${TARGET_DIR}/scripts/install-jenkins-inbound-agent.sh" copy_required_file "${SCRIPT_DIR}/spacetime-export-migration-json.mjs" "${TARGET_DIR}/scripts/database-export.mjs" "数据库导出脚本" copy_required_file "${SCRIPT_DIR}/spacetime-import-migration-json.mjs" "${TARGET_DIR}/scripts/database-import.mjs" "数据库导入脚本" @@ -398,7 +403,7 @@ cat >"${TARGET_DIR}/README.md" <&2 <<'EOF' +用法: + sudo scripts/deploy/install-jenkins-inbound-agent.sh \ + --agent-name genarrative-release-deploy-01 \ + --jenkins-url http://:8080/ \ + --secret-file /path/to/inbound-agent.secret + +可选参数: + --run-user systemd 运行用户,默认 root;当前生产流水线仍需要特权操作。 + --run-group systemd 运行用户组,默认跟随 --run-user。 + --workdir agent 工作目录,默认 /var/lib/jenkins/agent/。 + --jar-path agent.jar 落盘路径,默认 /opt/jenkins-agent/agent.jar。 + --java-bin Java 命令路径,默认 java;需要固定 JDK 时传绝对路径。 + --no-websocket 不使用 WebSocket inbound 连接。 + --no-enable 只安装 unit,不执行 systemctl enable。 + --no-start 只安装 unit,不立即启动服务。 + --dry-run 只打印操作,不写入系统。 + +密钥来源: + 优先使用 --secret-file;如果未传入,则读取环境变量 JENKINS_AGENT_SECRET; + 如果目标机已存在 /etc/jenkins-agent/.secret,则保留原密钥。 +EOF +} + +AGENT_NAME="" +JENKINS_URL_VALUE="" +SECRET_FILE="" +RUN_USER="root" +RUN_GROUP="" +WORKDIR="" +JAR_PATH="/opt/jenkins-agent/agent.jar" +JAVA_BIN="java" +USE_WEBSOCKET="true" +ENABLE_SERVICE="true" +START_SERVICE="true" +DRY_RUN="false" + +while [[ $# -gt 0 ]]; do + case "$1" in + --agent-name) + AGENT_NAME="${2:?缺少 --agent-name 的值}" + shift 2 + ;; + --jenkins-url) + JENKINS_URL_VALUE="${2:?缺少 --jenkins-url 的值}" + shift 2 + ;; + --secret-file) + SECRET_FILE="${2:?缺少 --secret-file 的值}" + shift 2 + ;; + --run-user) + RUN_USER="${2:?缺少 --run-user 的值}" + shift 2 + ;; + --run-group) + RUN_GROUP="${2:?缺少 --run-group 的值}" + shift 2 + ;; + --workdir) + 
WORKDIR="${2:?缺少 --workdir 的值}"
+      shift 2
+      ;;
+    --jar-path)
+      JAR_PATH="${2:?缺少 --jar-path 的值}"
+      shift 2
+      ;;
+    --java-bin)
+      JAVA_BIN="${2:?缺少 --java-bin 的值}"
+      shift 2
+      ;;
+    --no-websocket)
+      USE_WEBSOCKET="false"
+      shift
+      ;;
+    --no-enable)
+      ENABLE_SERVICE="false"
+      shift
+      ;;
+    --no-start)
+      START_SERVICE="false"
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN="true"
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "[jenkins-agent-install] 未知参数: $1" >&2
+      usage
+      exit 2
+      ;;
+  esac
+done
+
+# --agent-name and --jenkins-url are the only mandatory options.
+if [[ -z "${AGENT_NAME}" || -z "${JENKINS_URL_VALUE}" ]]; then
+  usage
+  exit 2
+fi
+
+# The agent name is used as the systemd instance name and as a file-name
+# component under /etc/jenkins-agent. Fail early on anything outside the
+# systemd unit-name character set (which also rules out "/" and whitespace)
+# instead of failing later in systemctl after files were already written.
+if [[ ! "${AGENT_NAME}" =~ ^[A-Za-z0-9_.:-]+$ ]]; then
+  echo "[jenkins-agent-install] 非法的 --agent-name(只允许字母、数字、_ . : -): ${AGENT_NAME}" >&2
+  exit 2
+fi
+
+# The group defaults to the run user's name.
+if [[ -z "${RUN_GROUP}" ]]; then
+  RUN_GROUP="${RUN_USER}"
+fi
+
+if [[ -z "${WORKDIR}" ]]; then
+  WORKDIR="/var/lib/jenkins/agent/${AGENT_NAME}"
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+START_SOURCE="${SCRIPT_DIR}/jenkins-inbound-agent-start.sh"
+UNIT_SOURCE="${REPO_ROOT}/deploy/systemd/jenkins-agent@.service"
+# build-production-release.sh ships this script as <package>/scripts/, one
+# level shallower than scripts/deploy/ in the repository, so REPO_ROOT then
+# points above the package. Fall back to the package-relative location.
+# NOTE(review): assumes the package keeps the unit under deploy/systemd/ — confirm.
+if [[ ! -f "${UNIT_SOURCE}" && -f "${SCRIPT_DIR}/../deploy/systemd/jenkins-agent@.service" ]]; then
+  UNIT_SOURCE="${SCRIPT_DIR}/../deploy/systemd/jenkins-agent@.service"
+fi
+CONFIG_DIR="/etc/jenkins-agent"
+CONFIG_FILE="${CONFIG_DIR}/${AGENT_NAME}.env"
+SECRET_TARGET="${CONFIG_DIR}/${AGENT_NAME}.secret"
+SERVICE_NAME="jenkins-agent@${AGENT_NAME}.service"
+
+#######################################
+# Log a command and run it unless --dry-run is active.
+# Globals:   DRY_RUN (read)
+# Arguments: the command and its arguments
+# Outputs:   the shell-quoted command line on stdout, prefixed with "+"
+# Returns:   the command's exit status, or 0 in dry-run mode
+#######################################
+run_cmd() {
+  # %q keeps argument boundaries visible for values containing spaces.
+  printf '+'
+  printf ' %q' "$@"
+  printf '\n'
+  if [[ "${DRY_RUN}" != "true" ]]; then
+    "$@"
+  fi
+}
+
+#######################################
+# Install stdin as a file with the given mode and ownership.
+# Globals:   DRY_RUN (read)
+# Arguments: $1 - target path
+#            $2 - file mode
+#            $3 - owner
+#            $4 - group
+# Outputs:   the (abbreviated) install command on stdout
+# Returns:   0 on success, the failing command's status otherwise
+#######################################
+write_file() {
+  local target="$1"
+  local mode="$2"
+  local owner="$3"
+  local group="$4"
+  local temp_file
+  local status=0
+
+  temp_file="$(mktemp)"
+  # Capture failures instead of letting set -e abort here: the temp file may
+  # hold the agent secret and must be removed on every path.
+  cat >"${temp_file}" || status=$?
+  echo "+ install -m ${mode} ${temp_file} ${target}"
+  if [[ "${status}" -eq 0 && "${DRY_RUN}" != "true" ]]; then
+    install -m "${mode}" -o "${owner}" -g "${group}" "${temp_file}" "${target}" || status=$?
+  fi
+  rm -f "${temp_file}"
+  return "${status}"
+}
+
+if [[ ! -f "${START_SOURCE}" ]]; then
+  echo "[jenkins-agent-install] 缺少启动脚本: ${START_SOURCE}" >&2
+  exit 1
+fi
+
+if [[ ! -f "${UNIT_SOURCE}" ]]; then
+  echo "[jenkins-agent-install] 缺少 systemd 模板: ${UNIT_SOURCE}" >&2
+  exit 1
+fi
+
+if [[ "${RUN_USER}" != "root" ]] && !
id "${RUN_USER}" >/dev/null 2>&1; then + run_cmd useradd --system --create-home --home-dir "/var/lib/${RUN_USER}" --shell /bin/bash "${RUN_USER}" +fi + +run_cmd mkdir -p "${CONFIG_DIR}" "$(dirname "${JAR_PATH}")" "${WORKDIR}" +run_cmd chmod 0755 "${CONFIG_DIR}" "$(dirname "${JAR_PATH}")" + +if [[ "${DRY_RUN}" != "true" ]]; then + chown -R "${RUN_USER}:${RUN_GROUP}" "$(dirname "${JAR_PATH}")" "${WORKDIR}" +fi + +run_cmd install -m 0755 "${START_SOURCE}" /usr/local/bin/jenkins-inbound-agent-start + +UNIT_TMP="$(mktemp)" +sed \ + -e "s|^User=.*|User=${RUN_USER}|" \ + -e "s|^Group=.*|Group=${RUN_GROUP}|" \ + "${UNIT_SOURCE}" >"${UNIT_TMP}" +run_cmd install -m 0644 "${UNIT_TMP}" /etc/systemd/system/jenkins-agent@.service +rm -f "${UNIT_TMP}" + +write_file "${CONFIG_FILE}" 0644 root root <&2 + exit 1 + fi + run_cmd install -m 0600 -o "${RUN_USER}" -g "${RUN_GROUP}" "${SECRET_FILE}" "${SECRET_TARGET}" +elif [[ -n "${JENKINS_AGENT_SECRET:-}" ]]; then + write_file "${SECRET_TARGET}" 0600 "${RUN_USER}" "${RUN_GROUP}" <&2 + exit 1 +fi + +run_cmd systemctl daemon-reload + +if [[ "${ENABLE_SERVICE}" == "true" ]]; then + run_cmd systemctl enable "${SERVICE_NAME}" +fi + +if [[ "${START_SERVICE}" == "true" ]]; then + run_cmd systemctl restart "${SERVICE_NAME}" + run_cmd systemctl status "${SERVICE_NAME}" --no-pager -l +fi + +echo "[jenkins-agent-install] 完成: ${SERVICE_NAME}" diff --git a/scripts/deploy/jenkins-agent-reverse-tunnel.ps1 b/scripts/deploy/jenkins-agent-reverse-tunnel.ps1 new file mode 100644 index 00000000..c62a95c5 --- /dev/null +++ b/scripts/deploy/jenkins-agent-reverse-tunnel.ps1 @@ -0,0 +1,50 @@ +param( + [string]$RemoteHost = "", + [string]$RemoteUser = "root", + [string]$SshKeyPath = "$env:USERPROFILE\.ssh\dsk.pem", + [string]$LocalJenkinsHost = "127.0.0.1", + [int]$LocalJenkinsPort = 8080, + [int]$LocalAgentPort = 50000, + [int]$RemoteJenkinsPort = 18080, + [int]$RemoteAgentPort = 50000, + [int]$RestartDelaySeconds = 10 +) + +$ErrorActionPreference = "Stop" + 
+function Write-Log {  # Prefix a message with a local timestamp and emit it.
+    param([string]$Message)
+    Write-Output "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] $Message"
+}
+
+if (-not $RemoteHost) {
+    throw "RemoteHost is required."
+}
+
+if (-not (Test-Path -LiteralPath $SshKeyPath)) {
+    throw "SSH key not found: $SshKeyPath"
+}
+
+$ssh = (Get-Command ssh.exe -ErrorAction Stop).Source
+# Built only after validation, so an empty host never yields a bare "user@" target.
+$remote = "$RemoteUser@$RemoteHost"
+# Named $sshArgs because $args is a PowerShell automatic variable; built once, it never changes.
+$sshArgs = @(
+    "-i", $SshKeyPath,
+    "-o", "StrictHostKeyChecking=accept-new",
+    "-o", "ExitOnForwardFailure=yes",
+    "-o", "ServerAliveInterval=30",
+    "-o", "ServerAliveCountMax=3",
+    "-N",
+    "-R", "127.0.0.1:${RemoteJenkinsPort}:${LocalJenkinsHost}:${LocalJenkinsPort}",
+    "-R", "127.0.0.1:${RemoteAgentPort}:${LocalJenkinsHost}:${LocalAgentPort}",
+    $remote
+)
+
+# Supervise the tunnel forever: whenever ssh exits, log its status and reconnect after a delay.
+while ($true) {
+    Write-Log "Starting Jenkins agent reverse tunnel: $remote"
+    & $ssh @sshArgs
+    Write-Log "Reverse tunnel exited, exitCode=$LASTEXITCODE; retrying in ${RestartDelaySeconds}s."
+    Start-Sleep -Seconds $RestartDelaySeconds
+}
diff --git a/scripts/deploy/jenkins-inbound-agent-start.sh b/scripts/deploy/jenkins-inbound-agent-start.sh
new file mode 100644
index 00000000..fecc6f17
--- /dev/null
+++ b/scripts/deploy/jenkins-inbound-agent-start.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat >&2 <<'EOF'
+用法:
+  jenkins-inbound-agent-start
+
+说明:
+  该脚本由 systemd 调用,读取 /etc/jenkins-agent/.env,
+  下载 Jenkins agent.jar,并通过 inbound WebSocket 连接 Jenkins controller。
+EOF
+}
+
+AGENT_INSTANCE="${1:-}"
+if [[ -z "${AGENT_INSTANCE}" ]]; then
+  usage
+  exit 2
+fi
+
+CONFIG_FILE="${JENKINS_AGENT_CONFIG_FILE:-/etc/jenkins-agent/${AGENT_INSTANCE}.env}"
+if [[ !
-r "${CONFIG_FILE}" ]]; then
+  echo "[jenkins-agent] 配置文件不可读: ${CONFIG_FILE}" >&2
+  exit 1
+fi
+
+# set -a exports every assignment made by the env file, so the agent JVM inherits it.
+set -a
+# shellcheck disable=SC1090
+source "${CONFIG_FILE}"
+set +a
+
+JENKINS_AGENT_NAME="${JENKINS_AGENT_NAME:-${AGENT_INSTANCE}}"
+JENKINS_AGENT_WORKDIR="${JENKINS_AGENT_WORKDIR:-/var/lib/jenkins/agent/${JENKINS_AGENT_NAME}}"
+JENKINS_AGENT_JAR="${JENKINS_AGENT_JAR:-/opt/jenkins-agent/agent.jar}"
+JENKINS_AGENT_SECRET_FILE="${JENKINS_AGENT_SECRET_FILE:-/etc/jenkins-agent/${JENKINS_AGENT_NAME}.secret}"
+JENKINS_AGENT_USE_WEBSOCKET="${JENKINS_AGENT_USE_WEBSOCKET:-true}"
+JENKINS_AGENT_JAVA_BIN="${JENKINS_AGENT_JAVA_BIN:-java}"
+
+if [[ -z "${JENKINS_URL:-}" ]]; then
+  echo "[jenkins-agent] JENKINS_URL 不能为空。" >&2
+  exit 1
+fi
+
+# A file-based secret is handed over as "-secret @file": agent.jar reads the file itself,
+# so the value never appears in the JVM argv. A secret set via env is still passed inline.
+if [[ -n "${JENKINS_AGENT_SECRET:-}" ]]; then
+  secret_arg="${JENKINS_AGENT_SECRET}"
+elif [[ ! -r "${JENKINS_AGENT_SECRET_FILE}" ]]; then
+  echo "[jenkins-agent] 未提供 JENKINS_AGENT_SECRET,且密钥文件不可读: ${JENKINS_AGENT_SECRET_FILE}" >&2
+  exit 1
+elif [[ -z "$(tr -d '\r\n' <"${JENKINS_AGENT_SECRET_FILE}")" ]]; then
+  echo "[jenkins-agent] Jenkins inbound agent secret 不能为空。" >&2
+  exit 1
+else
+  secret_arg="@${JENKINS_AGENT_SECRET_FILE}"
+fi
+
+mkdir -p "$(dirname "${JENKINS_AGENT_JAR}")" "${JENKINS_AGENT_WORKDIR}"
+
+AGENT_JAR_URL="${JENKINS_URL%/}/jnlpJars/agent.jar"
+AGENT_JAR_TMP="${JENKINS_AGENT_JAR}.tmp"
+
+# Download under a temporary name and rename, so a failed transfer never replaces the jar.
+echo "[jenkins-agent] 下载 agent.jar: ${AGENT_JAR_URL}"
+curl -fsSL --retry 5 --retry-delay 5 "${AGENT_JAR_URL}" -o "${AGENT_JAR_TMP}"
+mv "${AGENT_JAR_TMP}" "${JENKINS_AGENT_JAR}"
+
+agent_args=(
+  "${JENKINS_AGENT_JAVA_BIN}" -jar "${JENKINS_AGENT_JAR}"
+  -url "${JENKINS_URL}" -secret "${secret_arg}"
+  -name "${JENKINS_AGENT_NAME}" -workDir "${JENKINS_AGENT_WORKDIR}"
+)
+if [[ "${JENKINS_AGENT_USE_WEBSOCKET}" == "true" ]]; then
+  agent_args+=(-webSocket)
+fi
+
+# exec replaces this shell with the JVM, so the service's main process is the agent itself.
+echo "[jenkins-agent] 启动 inbound agent: ${JENKINS_AGENT_NAME}"
+exec "${agent_args[@]}"