perf: concurrent worker with claim/execute split + graceful shutdown

- JobRepository::claim_next() — claims the next queued job atomically: a single
  UPDATE sets status = 'processing' on the row picked by a
  SELECT ... FOR UPDATE SKIP LOCKED subquery, so no job is claimed twice
- ExecutePipelineHandler skips start() for already-claimed jobs
- Sweep spawns N concurrent tasks via JoinSet: claims are fast and sequential,
  execution is slow and concurrent
- Graceful shutdown: stop claiming, await all in-flight JoinSet tasks
- WORKER_CONCURRENCY env var (default: number of CPU cores)
- DB_MAX_CONNECTIONS env var (default: 20; previously hard-coded to 10)
- VolumeFileResolver impl for InMemoryFileStorage (test fix)
This commit is contained in:
2026-06-01 02:14:44 +02:00
parent 0077caa743
commit c251a5c41f
14 changed files with 178 additions and 56 deletions

View File

@@ -1,8 +1,12 @@
pub type PgPool = sqlx::PgPool;
pub async fn connect(url: &str) -> anyhow::Result<PgPool> {
let max_conn: u32 = std::env::var("DB_MAX_CONNECTIONS")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(20);
let pool = sqlx::postgres::PgPoolOptions::new()
.max_connections(10)
.max_connections(max_conn)
.connect(url)
.await?;
Ok(pool)

View File

@@ -145,7 +145,29 @@ impl JobRepository for PostgresJobRepository {
started_at, completed_at, error_message
FROM jobs WHERE status = 'queued'
ORDER BY priority DESC, created_at ASC
LIMIT 1",
LIMIT 1
FOR UPDATE SKIP LOCKED",
)
.fetch_optional(&self.pool)
.await
.map_pg()?;
Ok(row.map(Into::into))
}
async fn claim_next(&self) -> Result<Option<Job>, DomainError> {
let row = sqlx::query_as::<_, JobRow>(
"UPDATE jobs SET status = 'processing', started_at = NOW()
WHERE job_id = (
SELECT job_id FROM jobs
WHERE status = 'queued'
ORDER BY priority DESC, created_at ASC
LIMIT 1
FOR UPDATE SKIP LOCKED
)
RETURNING job_id, job_type, target_asset_id, batch_id, status, priority,
payload, result_data, retry_count, max_retries, created_at,
started_at, completed_at, error_message",
)
.fetch_optional(&self.pool)
.await