From 3f8180a246464b3738e610d036774d95911f212a Mon Sep 17 00:00:00 2001 From: kikootwo Date: Fri, 13 Feb 2026 14:03:21 -0500 Subject: [PATCH] Add server readiness check & init retries Wait for the Next.js server and DB to be healthy before initializing services in docker/unified/app-start.sh. Adds a health probe with configurable timeout and retries, backoff retries for the /api/init call, improved logging, and error handling when the server process exits. In src/lib/services/scheduler.service.ts, make re-encryption of notification backends non-fatal by catching and logging errors, and make creation of default scheduled jobs robust by creating each job independently with per-job error handling and logging. Summary counts are logged for created/failed defaults so failures don't block the scheduler from starting. --- docker/unified/app-start.sh | 73 ++++++++++++++++++++++++--- src/lib/services/scheduler.service.ts | 47 +++++++++++++---- 2 files changed, 103 insertions(+), 17 deletions(-) diff --git a/docker/unified/app-start.sh b/docker/unified/app-start.sh index 1104d8f..ebf9b74 100644 --- a/docker/unified/app-start.sh +++ b/docker/unified/app-start.sh @@ -53,14 +53,75 @@ start_server() { start_server SERVER_PID=$! -echo "[App] Waiting for server to be ready..." -sleep 5 +# ============================================================================= +# WAIT FOR SERVER READINESS +# ============================================================================= +# The health endpoint (/api/health) checks both the Next.js server AND database +# connectivity. We must wait for both before initializing scheduled jobs. -# Initialize application services (creates default scheduled jobs) -echo "[App] Initializing application services..." 
-curl -sf http://localhost:3030/api/init || echo "[App] Warning: Failed to initialize services (may already be initialized)" +HEALTH_URL="http://localhost:3030/api/health" +INIT_URL="http://localhost:3030/api/init" +READY_TIMEOUT=${APP_READY_TIMEOUT:-60} +INIT_RETRIES=${APP_INIT_RETRIES:-5} -echo "[App] Server ready with PID $SERVER_PID" +echo "[App] Waiting for server to be ready (timeout: ${READY_TIMEOUT}s)..." + +READY=false +for i in $(seq 1 "$READY_TIMEOUT"); do + # Check if the server process is still alive + if ! kill -0 "$SERVER_PID" 2>/dev/null; then + echo "[App] ERROR: Server process (PID $SERVER_PID) exited unexpectedly" + exit 1 + fi + + if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then + READY=true + echo "[App] Server is healthy (took ${i}s)" + break + fi + + # Log progress every 10 seconds + if [ $((i % 10)) -eq 0 ]; then + echo "[App] Still waiting for server... (${i}/${READY_TIMEOUT}s)" + fi + + sleep 1 +done + +if [ "$READY" = "false" ]; then + echo "[App] ERROR: Server did not become healthy within ${READY_TIMEOUT}s" + echo "[App] The scheduler will not be initialized - scheduled jobs may be missing" + echo "[App] Check server logs above for errors (database connection, port conflict, etc.)" +else + # ========================================================================= + # INITIALIZE APPLICATION SERVICES + # ========================================================================= + # Creates default scheduled jobs, runs credential migration, etc. + # Retry with backoff to handle transient failures during startup. + + echo "[App] Initializing application services..." 
+ + INIT_SUCCESS=false + for attempt in $(seq 1 "$INIT_RETRIES"); do + HTTP_CODE=$(curl -sf -o /dev/null -w "%{http_code}" "$INIT_URL" 2>/dev/null) || HTTP_CODE="000" + + if [ "$HTTP_CODE" = "200" ]; then + INIT_SUCCESS=true + echo "[App] Services initialized successfully" + break + fi + + echo "[App] Init attempt $attempt/$INIT_RETRIES failed (HTTP $HTTP_CODE), retrying in ${attempt}s..." + sleep "$attempt" + done + + if [ "$INIT_SUCCESS" = "false" ]; then + echo "[App] ERROR: Failed to initialize services after $INIT_RETRIES attempts" + echo "[App] Scheduled jobs may be missing - check application logs for details" + fi +fi + +echo "[App] Server running with PID $SERVER_PID" # Verify the process is running with correct UID:GID (for debugging) if [ -f "/proc/$SERVER_PID/status" ]; then diff --git a/src/lib/services/scheduler.service.ts b/src/lib/services/scheduler.service.ts index 964fda0..b95ae81 100644 --- a/src/lib/services/scheduler.service.ts +++ b/src/lib/services/scheduler.service.ts @@ -51,12 +51,18 @@ export class SchedulerService { logger.info('Initializing scheduler service...'); // Re-encrypt any notification backends with plaintext sensitive fields - await getNotificationService().reEncryptUnprotectedBackends(); + try { + await getNotificationService().reEncryptUnprotectedBackends(); + } catch (error) { + logger.error('Failed to re-encrypt notification backends (non-fatal)', { + error: error instanceof Error ? error.message : String(error), + }); + } // Create default jobs if they don't exist await this.ensureDefaultJobs(); - // Load and schedule all enabled jobs + // Load and schedule all enabled jobs (works with whatever jobs exist in DB) await this.scheduleAllJobs(); // Check and trigger overdue jobs @@ -66,7 +72,8 @@ export class SchedulerService { } /** - * Ensure default jobs exist in database + * Ensure default jobs exist in database. + * Each job is created independently so a single failure doesn't block the rest. 
*/ private async ensureDefaultJobs(): Promise<void> { const defaults = [ @@ -128,18 +135,36 @@ export class SchedulerService { }, ]; - for (const defaultJob of defaults) { - const existing = await prisma.scheduledJob.findFirst({ - where: { type: defaultJob.type }, - }); + let created = 0; + let failed = 0; - if (!existing) { - await prisma.scheduledJob.create({ - data: defaultJob, + for (const defaultJob of defaults) { + try { + const existing = await prisma.scheduledJob.findFirst({ + where: { type: defaultJob.type }, + }); + + if (!existing) { + await prisma.scheduledJob.create({ + data: defaultJob, + }); + created++; + logger.info(`Created default job: ${defaultJob.name} (enabled: ${defaultJob.enabled})`); + } + } catch (error) { + failed++; + logger.error(`Failed to create default job: ${defaultJob.name}`, { + type: defaultJob.type, + error: error instanceof Error ? error.message : String(error), }); - logger.info(`Created default job: ${defaultJob.name} (disabled by default)`); } } + + if (failed > 0) { + logger.warn(`Default jobs: ${created} created, ${failed} failed — failed jobs will be retried on next restart`); + } else if (created > 0) { + logger.info(`Default jobs: ${created} created`); + } } /**