From 487efff9c4a05a4931e79ace7144c6e1e0a2e44f Mon Sep 17 00:00:00 2001 From: kdletters <61648117+kdletters@users.noreply.github.com> Date: Thu, 21 May 2026 15:27:19 +0800 Subject: [PATCH] fix(deploy): ensure release tracking outbox path --- .hermes/shared-memory/pitfalls.md | 8 ++ ...发运维】本地开发验证与生产运维-2026-05-15.md | 11 ++ scripts/deploy/production-api-deploy.sh | 111 ++++++++++++++++++ scripts/jenkins-server-provision.sh | 37 ++++++ 4 files changed, 167 insertions(+) diff --git a/.hermes/shared-memory/pitfalls.md b/.hermes/shared-memory/pitfalls.md index 0399e186..5aedfcf7 100644 --- a/.hermes/shared-memory/pitfalls.md +++ b/.hermes/shared-memory/pitfalls.md @@ -46,6 +46,14 @@ - 验证:普通 route 请求在 SpacetimeDB 不可用时仍能返回,恢复后 sealed 文件会继续被清理。 - 关联:`server-rs/crates/api-server/src/tracking_outbox.rs`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 +## release tracking outbox 权限错误先查 env 缺失 + +- 现象:release 机器 `journalctl -u genarrative-api.service` 每秒刷 `tracking outbox 定时封存 active 文件失败 error=Permission denied (os error 13)` 和 `tracking outbox 批量写入 SpacetimeDB 失败`。 +- 原因:旧 `/etc/genarrative/api-server.env` 没有 `GENARRATIVE_TRACKING_OUTBOX_DIR` 时,api-server 会回退到本地开发默认相对路径 `server-rs/.data/tracking-outbox`;systemd 工作目录是只读发布目录 `/opt/genarrative/releases/`,`genarrative` 用户不能在其中创建 `server-rs`。 +- 处理:补齐 `GENARRATIVE_TRACKING_OUTBOX_DIR=/var/lib/genarrative/tracking-outbox` 及 batch/flush/max 配置,创建并授权 `/var/lib/genarrative/tracking-outbox` 给 `genarrative:genarrative`,再重启 `genarrative-api.service`。Server-Provision 与 API-Deploy 会保留旧 env 但自动补缺这些运行态路径。 +- 验证:`tr '\0' '\n' < /proc/$(systemctl show genarrative-api.service -p MainPID --value)/environ | grep GENARRATIVE_TRACKING_OUTBOX_DIR` 应指向 `/var/lib/genarrative/tracking-outbox`;重启后当前 PID 不再出现 `Permission denied (os error 13)`。 +- 关联:`scripts/deploy/production-api-deploy.sh`、`scripts/jenkins-server-provision.sh`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 + ## 汪汪声浪入口不要再回到独立配置阶段 - 现象:汪汪声浪入口如果继续切换到独立配置阶段,会和拼图、抓大鹅的创作页内嵌结构不一致,用户会感觉入口跳页。 diff --git a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md index 15b1381b..6ba7a1b4 100644 --- a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md +++ b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md @@ -260,6 +260,17 @@ GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES=268435456 outbox 采用 NDJSON 文件保存原始事件。达到 `BATCH_SIZE` 时会立刻把当前 active 文件原子封存为 sealed 文件,并马上切到新的 active 继续写入;后台 worker 异步 flush sealed 文件,HTTP 请求线程不等待 SpacetimeDB。`FLUSH_INTERVAL_MS` 只负责兜底封存长时间未满批的 active 文件。SpacetimeDB 批量 procedure 返回成功后删除 sealed 文件,失败则保留文件并重试。`MAX_BYTES` 是磁盘保护阈值,不是 flush 阈值;超过后低价值 route tracking 可以被丢弃并记录日志 / 指标,关键同步事件不进入该丢弃路径。sealed 文件若出现无法解析的坏行,会重命名为 `corrupt-*` 隔离并记录 `genarrative.tracking_outbox.files.corrupt` 指标,避免一个坏文件阻塞后续批量入库。该机制提供至少一次投递语义,依赖 `tracking_event.event_id` 幂等跳过重复事件。 +release 机器如果日志每秒刷 `tracking outbox ... Permission denied (os error 13)`,先检查 `/etc/genarrative/api-server.env` 是否缺少 `GENARRATIVE_TRACKING_OUTBOX_DIR`。缺少时 `api-server` 会回退到本地开发默认相对路径 `server-rs/.data/tracking-outbox`,而 systemd 的工作目录是只读发布目录 `/opt/genarrative/releases/`,`genarrative` 用户无法在其中创建 `server-rs`。修复顺序: + +```bash +install -d -o genarrative -g genarrative -m 0750 /var/lib/genarrative/tracking-outbox +grep -n '^GENARRATIVE_TRACKING_OUTBOX' /etc/genarrative/api-server.env +systemctl restart genarrative-api.service +journalctl -u genarrative-api.service --since '30 seconds ago' --no-pager | grep -E 'tracking outbox|Permission denied|os error 13' +``` + +`Genarrative-Server-Provision` 和 `Genarrative-Api-Deploy` 会在保留旧 `/etc/genarrative/api-server.env` 的前提下补齐缺失的 tracking outbox 与 auth-store 运行态路径,并确保 `/var/lib/genarrative/tracking-outbox`、`/var/lib/genarrative/auth` 归属 `genarrative:genarrative`。 + 常用检查思路: ```sql diff --git a/scripts/deploy/production-api-deploy.sh b/scripts/deploy/production-api-deploy.sh index 55601b18..992604cd 100644 --- a/scripts/deploy/production-api-deploy.sh +++ b/scripts/deploy/production-api-deploy.sh @@ -120,6 +120,115 @@ PY fi } +read_env_value() { + local file_path="$1" + local key="$2" + + if [[ ! -f "${file_path}" ]]; then + return 0 + fi + + local python_script=' +import sys +from pathlib import Path + +path = Path(sys.argv[1]) +key = sys.argv[2] +if not path.exists(): + raise SystemExit(0) +for raw_line in path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + current_key, value = line.split("=", 1) + if current_key == key: + value = value.strip() + if len(value) >= 2 and value[0] == value[-1] and value[0] in ("\"", "'\''"): + value = value[1:-1] + print(value) + raise SystemExit(0) +' + + if [[ -r "${file_path}" ]]; then + python3 -c "${python_script}" "${file_path}" "${key}" + else + if ! sudo -n true >/dev/null 2>&1; then + echo "[production-api-deploy] 当前用户无权读取 ${file_path},且 sudo -n 不可用;无法检查运行态环境变量。" >&2 + exit 1 + fi + sudo -n python3 -c "${python_script}" "${file_path}" "${key}" + fi +} + +ensure_env_value() { + local file_path="$1" + local key="$2" + local default_value="$3" + local current_value + + current_value="$(read_env_value "${file_path}" "${key}")" + if [[ -n "${current_value}" ]]; then + return + fi + + echo "[production-api-deploy] 补齐 api-server 环境变量: ${key} -> ${file_path}" + write_env_value "${file_path}" "${key}" "${default_value}" +} + +run_privileged() { + if [[ "$(id -u)" -eq 0 ]]; then + "$@" + return + fi + if ! sudo -n true >/dev/null 2>&1; then + echo "[production-api-deploy] 当前用户不是 root,且 sudo -n 不可用;无法执行: $*" >&2 + exit 1 + fi + sudo -n "$@" +} + +ensure_runtime_dir() { + local path="$1" + local mode="$2" + + if [[ -z "${path}" ]]; then + return + fi + if [[ "${path}" != /* ]]; then + echo "[production-api-deploy] 运行态目录必须使用绝对路径,避免写入只读发布目录: ${path}" >&2 + exit 1 + fi + + echo "[production-api-deploy] 确保运行态目录可写: ${path}" + run_privileged install -d -o genarrative -g genarrative -m "${mode}" "${path}" +} + +ensure_runtime_env_and_dirs() { + local api_env_file="$1" + local tracking_enabled tracking_outbox_dir auth_store_path auth_store_dir + + # 旧生产环境文件会被 server-provision 保留,不一定包含新增的运行态写入路径。 + # 发布前只补缺省值,不覆盖线上已经定制过的目录或开关。 + ensure_env_value "${api_env_file}" "GENARRATIVE_TRACKING_OUTBOX_ENABLED" "true" + ensure_env_value "${api_env_file}" "GENARRATIVE_TRACKING_OUTBOX_DIR" "/var/lib/genarrative/tracking-outbox" + ensure_env_value "${api_env_file}" "GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE" "500" + ensure_env_value "${api_env_file}" "GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS" "1000" + ensure_env_value "${api_env_file}" "GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES" "268435456" + ensure_env_value "${api_env_file}" "GENARRATIVE_AUTH_STORE_PATH" "/var/lib/genarrative/auth/auth-store.json" + + tracking_enabled="$(read_env_value "${api_env_file}" "GENARRATIVE_TRACKING_OUTBOX_ENABLED")" + tracking_outbox_dir="$(read_env_value "${api_env_file}" "GENARRATIVE_TRACKING_OUTBOX_DIR")" + if [[ "$(printf "%s" "${tracking_enabled}" | tr '[:upper:]' '[:lower:]')" != "false" ]]; then + ensure_runtime_dir "${tracking_outbox_dir}" "0750" + fi + + auth_store_path="$(read_env_value "${api_env_file}" "GENARRATIVE_AUTH_STORE_PATH")" + if [[ -n "${auth_store_path}" ]]; then + auth_store_dir="$(dirname "${auth_store_path}")" + ensure_runtime_dir "${auth_store_dir}" "0750" + fi +} + SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" SOURCE_DIR="" VERSION="" @@ -243,6 +352,8 @@ if [[ -n "${SPACETIME_SERVER_URL}" ]]; then write_env_value "${API_ENV_FILE}" "GENARRATIVE_SPACETIME_SERVER_URL" "${SPACETIME_SERVER_URL}" fi +ensure_runtime_env_and_dirs "${API_ENV_FILE}" + mkdir -p "$(dirname "${CURRENT_LINK}")" ln -sfn "${RELEASE_DIR}" "${CURRENT_LINK}" diff --git a/scripts/jenkins-server-provision.sh b/scripts/jenkins-server-provision.sh index 0064d904..e54b42d0 100755 --- a/scripts/jenkins-server-provision.sh +++ b/scripts/jenkins-server-provision.sh @@ -292,6 +292,42 @@ write_env_value() { chown root:root "${file}" } +ensure_env_value() { + local file="$1" + local key="$2" + local default_value="$3" + local current_value + + current_value="$(read_env_value "${file}" "${key}")" + if [[ -n "${current_value}" ]]; then + return + fi + + echo "[server-provision] 补齐 api-server 环境变量: ${key} -> ${file}" + if [[ "${DRY_RUN}" != "true" ]]; then + write_env_value "${file}" "${key}" "${default_value}" + fi +} + +ensure_api_runtime_env_defaults() { + if [[ "${DRY_RUN}" == "true" ]]; then + echo "+ ensure api-server runtime env defaults in ${API_ENV_FILE}" + return + fi + if [[ ! -f "${API_ENV_FILE}" ]]; then + echo "[server-provision] 环境文件不存在,无法补齐 api-server 运行态目录变量: ${API_ENV_FILE}" >&2 + exit 1 + fi + + # 已存在的生产 env 会被保留,不会整文件覆盖;这里仅补后续版本新增的运行态写入路径。 + ensure_env_value "${API_ENV_FILE}" "GENARRATIVE_TRACKING_OUTBOX_ENABLED" "true" + ensure_env_value "${API_ENV_FILE}" "GENARRATIVE_TRACKING_OUTBOX_DIR" "/var/lib/genarrative/tracking-outbox" + ensure_env_value "${API_ENV_FILE}" "GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE" "500" + ensure_env_value "${API_ENV_FILE}" "GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS" "1000" + ensure_env_value "${API_ENV_FILE}" "GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES" "268435456" + ensure_env_value "${API_ENV_FILE}" "GENARRATIVE_AUTH_STORE_PATH" "/var/lib/genarrative/auth/auth-store.json" +} + parse_json_string_field() { local json="$1" local key="$2" @@ -673,6 +709,7 @@ if [[ ! -f "${API_ENV_FILE}" ]]; then else echo "[server-provision] 已存在环境文件,保留不覆盖: ${API_ENV_FILE}" fi +ensure_api_runtime_env_defaults if [[ "${ENABLE_OTELCOL:-true}" == "true" ]]; then sync_otelcol_install