perf: concurrent worker with claim/execute split + graceful shutdown
- JobRepository::claim_next(): atomic SELECT ... FOR UPDATE SKIP LOCKED plus UPDATE status=processing in a single query, so no job is claimed twice
- ExecutePipelineHandler skips start() for already-claimed jobs
- Sweep spawns N concurrent tasks via JoinSet; claims are fast and sequential, execution is slow and concurrent
- Graceful shutdown: stop claiming, then await all in-flight JoinSet tasks
- WORKER_CONCURRENCY env (default: number of CPU cores)
- DB_MAX_CONNECTIONS env (default: 20; previously hardcoded to 10)
- VolumeFileResolver impl for InMemoryFileStorage (test fix)
This commit is contained in:
@@ -1,16 +1,16 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use application::catalog::DeleteAssetHandler;
|
||||
use application::processing::{EnqueueJobHandler, ProcessNextJobHandler};
|
||||
use application::processing::{EnqueueJobHandler, ExecutePipelineHandler};
|
||||
use domain::ports::{AssetRepository, JobRepository};
|
||||
|
||||
use crate::config::WorkerConfig;
|
||||
use crate::factories::{
|
||||
Repos, build_enqueue_handler, build_plugin_registry, build_process_next_handler,
|
||||
Repos, build_enqueue_handler, build_executor, build_plugin_registry,
|
||||
};
|
||||
|
||||
pub struct WorkerServices {
|
||||
pub process_next: Arc<ProcessNextJobHandler>,
|
||||
pub executor: Arc<ExecutePipelineHandler>,
|
||||
pub enqueue: Arc<EnqueueJobHandler>,
|
||||
pub job_repo: Arc<dyn JobRepository>,
|
||||
pub asset_repo: Arc<dyn AssetRepository>,
|
||||
@@ -57,11 +57,7 @@ pub async fn build(config: &WorkerConfig) -> anyhow::Result<WorkerServices> {
|
||||
event_pub.clone(),
|
||||
));
|
||||
|
||||
let process_next = Arc::new(build_process_next_handler(
|
||||
&repos,
|
||||
registry,
|
||||
event_pub.clone(),
|
||||
));
|
||||
let executor = Arc::new(build_executor(&repos, registry, event_pub.clone()));
|
||||
let job_repo: Arc<dyn JobRepository> = repos.job.clone();
|
||||
let asset_repo: Arc<dyn AssetRepository> = repos.asset.clone();
|
||||
let enqueue = Arc::new(build_enqueue_handler(&repos, event_pub.clone()));
|
||||
@@ -80,7 +76,7 @@ pub async fn build(config: &WorkerConfig) -> anyhow::Result<WorkerServices> {
|
||||
let event_consumer = adapters_event_transport::EventConsumerAdapter::new(consumer_source);
|
||||
|
||||
Ok(WorkerServices {
|
||||
process_next,
|
||||
executor,
|
||||
enqueue,
|
||||
job_repo,
|
||||
asset_repo,
|
||||
|
||||
@@ -5,6 +5,7 @@ pub struct WorkerConfig {
|
||||
pub fallback_sweep_secs: u64,
|
||||
pub storage_path: String,
|
||||
pub trash_retention_days: u64,
|
||||
pub concurrency: usize,
|
||||
}
|
||||
|
||||
impl WorkerConfig {
|
||||
@@ -22,6 +23,10 @@ impl WorkerConfig {
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.unwrap_or(30),
|
||||
concurrency: std::env::var("WORKER_CONCURRENCY")
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.unwrap_or_else(|| std::thread::available_parallelism().map(|n| n.get()).unwrap_or(4)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,9 +2,10 @@ use std::sync::Arc;
|
||||
|
||||
use futures::StreamExt;
|
||||
use tokio::sync::watch;
|
||||
use tracing::{error, info, warn};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{error, info};
|
||||
|
||||
use application::processing::{EnqueueJobCommand, ProcessNextJobCommand, ProcessNextJobHandler};
|
||||
use application::processing::EnqueueJobCommand;
|
||||
use domain::entities::JobType;
|
||||
use domain::events::DomainEvent;
|
||||
use domain::ports::{EventConsumer, JobRepository};
|
||||
@@ -25,11 +26,12 @@ fn enqueue_cmd(job_type: JobType, priority: u32, asset_id: SystemId) -> EnqueueJ
|
||||
pub async fn run(services: WorkerServices, mut shutdown: watch::Receiver<bool>) {
|
||||
info!("event loop: listening for NATS events");
|
||||
let mut stream = services.event_consumer.consume();
|
||||
let mut in_flight = JoinSet::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = shutdown.changed() => {
|
||||
info!("event loop: shutting down");
|
||||
info!("event loop: shutdown, waiting for {} in-flight jobs", in_flight.len());
|
||||
break;
|
||||
}
|
||||
msg = stream.next() => {
|
||||
@@ -69,7 +71,11 @@ pub async fn run(services: WorkerServices, mut shutdown: watch::Receiver<bool>)
|
||||
}
|
||||
DomainEvent::JobEnqueued { job_id, job_type, .. } => {
|
||||
info!(job_id = %job_id, job_type = %job_type, "JobEnqueued → process");
|
||||
drain_one(&services.process_next).await;
|
||||
crate::sweep::spawn_one(
|
||||
&services.job_repo,
|
||||
&services.executor,
|
||||
&mut in_flight,
|
||||
);
|
||||
}
|
||||
other => {
|
||||
tracing::debug!(event = ?other, "unhandled event, acked");
|
||||
@@ -78,6 +84,9 @@ pub async fn run(services: WorkerServices, mut shutdown: watch::Receiver<bool>)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while in_flight.join_next().await.is_some() {}
|
||||
info!("event loop: all in-flight jobs finished");
|
||||
}
|
||||
|
||||
async fn handle_job_completed(
|
||||
@@ -97,16 +106,3 @@ async fn handle_job_completed(
|
||||
}
|
||||
}
|
||||
|
||||
async fn drain_one(handler: &Arc<ProcessNextJobHandler>) {
|
||||
match handler.execute(ProcessNextJobCommand).await {
|
||||
Ok(Some(job)) => {
|
||||
info!(job_id = %job.job_id, status = ?job.status, "processed job");
|
||||
}
|
||||
Ok(None) => {
|
||||
warn!("JobEnqueued but no queued job found");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = %e, "error processing job");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,4 +4,4 @@ mod processing;
|
||||
|
||||
pub use infra::Repos;
|
||||
pub use plugins::build_plugin_registry;
|
||||
pub use processing::{build_enqueue_handler, build_process_next_handler};
|
||||
pub use processing::{build_enqueue_handler, build_executor};
|
||||
|
||||
@@ -1,24 +1,22 @@
|
||||
use application::processing::{EnqueueJobHandler, ExecutePipelineHandler, ProcessNextJobHandler};
|
||||
use application::processing::{EnqueueJobHandler, ExecutePipelineHandler};
|
||||
use domain::ports::{EventPublisher, PluginRegistry};
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::Repos;
|
||||
|
||||
pub fn build_process_next_handler(
|
||||
pub fn build_executor(
|
||||
repos: &Repos,
|
||||
registry: Arc<dyn PluginRegistry>,
|
||||
event_pub: Arc<dyn EventPublisher>,
|
||||
) -> ProcessNextJobHandler {
|
||||
let execute_pipeline = Arc::new(ExecutePipelineHandler::new(
|
||||
) -> ExecutePipelineHandler {
|
||||
ExecutePipelineHandler::new(
|
||||
repos.job.clone(),
|
||||
repos.batch.clone(),
|
||||
repos.pipeline.clone(),
|
||||
repos.plugin.clone(),
|
||||
registry,
|
||||
event_pub,
|
||||
));
|
||||
|
||||
ProcessNextJobHandler::new(repos.job.clone(), execute_pipeline)
|
||||
)
|
||||
}
|
||||
|
||||
pub fn build_enqueue_handler(
|
||||
|
||||
@@ -22,7 +22,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.init();
|
||||
|
||||
let config = config::WorkerConfig::from_env();
|
||||
info!("Worker starting");
|
||||
info!(concurrency = config.concurrency, "Worker starting");
|
||||
|
||||
let services = bootstrap::build(&config).await?;
|
||||
|
||||
@@ -48,8 +48,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
});
|
||||
|
||||
let sweep_interval = Duration::from_secs(config.fallback_sweep_secs);
|
||||
tokio::spawn(sweep::run(
|
||||
services.process_next.clone(),
|
||||
let sweep_handle = tokio::spawn(sweep::run(
|
||||
services.job_repo.clone(),
|
||||
services.executor.clone(),
|
||||
config.concurrency,
|
||||
sweep_interval,
|
||||
shutdown_rx.clone(),
|
||||
));
|
||||
@@ -62,6 +64,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
));
|
||||
|
||||
event_loop::run(services, shutdown_rx).await;
|
||||
let _ = sweep_handle.await;
|
||||
|
||||
info!("worker shutdown complete");
|
||||
Ok(())
|
||||
|
||||
@@ -2,42 +2,97 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::sync::watch;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{error, info};
|
||||
|
||||
use application::catalog::DeleteAssetHandler;
|
||||
use application::processing::{ProcessNextJobCommand, ProcessNextJobHandler};
|
||||
use domain::ports::AssetRepository;
|
||||
use application::processing::{ExecutePipelineCommand, ExecutePipelineHandler};
|
||||
use domain::ports::{AssetRepository, JobRepository};
|
||||
|
||||
pub async fn run(
|
||||
handler: Arc<ProcessNextJobHandler>,
|
||||
job_repo: Arc<dyn JobRepository>,
|
||||
executor: Arc<ExecutePipelineHandler>,
|
||||
concurrency: usize,
|
||||
interval: Duration,
|
||||
mut shutdown: watch::Receiver<bool>,
|
||||
) {
|
||||
info!(every_secs = interval.as_secs(), "sweep task started");
|
||||
let mut in_flight = JoinSet::new();
|
||||
info!(every_secs = interval.as_secs(), concurrency, "sweep task started");
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = shutdown.changed() => {
|
||||
info!("sweep task: shutting down");
|
||||
info!("sweep: shutdown, waiting for {} in-flight jobs", in_flight.len());
|
||||
break;
|
||||
}
|
||||
_ = tokio::time::sleep(interval) => {}
|
||||
}
|
||||
info!("sweep: draining queued jobs");
|
||||
loop {
|
||||
match handler.execute(ProcessNextJobCommand).await {
|
||||
Ok(Some(job)) => {
|
||||
info!(job_id = %job.job_id, status = ?job.status, "sweep: processed job");
|
||||
}
|
||||
Ok(None) => break,
|
||||
Err(e) => {
|
||||
error!(error = %e, "sweep: error processing job");
|
||||
break;
|
||||
}
|
||||
drain(&job_repo, &executor, concurrency, &mut in_flight).await;
|
||||
}
|
||||
|
||||
while in_flight.join_next().await.is_some() {}
|
||||
info!("sweep: all in-flight jobs finished");
|
||||
}
|
||||
|
||||
async fn drain(
|
||||
job_repo: &Arc<dyn JobRepository>,
|
||||
executor: &Arc<ExecutePipelineHandler>,
|
||||
concurrency: usize,
|
||||
in_flight: &mut JoinSet<()>,
|
||||
) {
|
||||
loop {
|
||||
while in_flight.len() >= concurrency {
|
||||
if in_flight.join_next().await.is_none() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let job = match job_repo.claim_next().await {
|
||||
Ok(Some(j)) => j,
|
||||
Ok(None) => break,
|
||||
Err(e) => {
|
||||
error!(error = %e, "sweep: error claiming job");
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
info!(job_id = %job.job_id, job_type = ?job.job_type, "sweep: claimed");
|
||||
let exec = executor.clone();
|
||||
in_flight.spawn(async move {
|
||||
let job_id = job.job_id;
|
||||
match exec.execute(ExecutePipelineCommand { job_id }).await {
|
||||
Ok(j) => info!(job_id = %j.job_id, status = ?j.status, "sweep: done"),
|
||||
Err(e) => error!(job_id = %job_id, error = %e, "sweep: failed"),
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn_one(
|
||||
job_repo: &Arc<dyn JobRepository>,
|
||||
executor: &Arc<ExecutePipelineHandler>,
|
||||
in_flight: &mut JoinSet<()>,
|
||||
) {
|
||||
let repo = job_repo.clone();
|
||||
let exec = executor.clone();
|
||||
in_flight.spawn(async move {
|
||||
let job = match repo.claim_next().await {
|
||||
Ok(Some(j)) => j,
|
||||
Ok(None) => return,
|
||||
Err(e) => {
|
||||
error!(error = %e, "error claiming job");
|
||||
return;
|
||||
}
|
||||
};
|
||||
let job_id = job.job_id;
|
||||
match exec.execute(ExecutePipelineCommand { job_id }).await {
|
||||
Ok(j) => info!(job_id = %j.job_id, status = ?j.status, "done"),
|
||||
Err(e) => error!(job_id = %job_id, error = %e, "failed"),
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
pub async fn purge_trash(
|
||||
asset_repo: Arc<dyn AssetRepository>,
|
||||
delete_handler: Arc<DeleteAssetHandler>,
|
||||
|
||||
Reference in New Issue
Block a user