diff --git a/Cargo.lock b/Cargo.lock index 6d2724c..1289ba0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,15 @@ dependencies = [ "uuid", ] +[[package]] +name = "adapters-exif" +version = "0.1.0" +dependencies = [ + "bytes", + "domain", + "nom-exif", +] + [[package]] name = "adapters-nats" version = "0.1.0" @@ -119,6 +128,15 @@ dependencies = [ "uuid", ] +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + [[package]] name = "async-nats" version = "0.48.0" @@ -836,6 +854,17 @@ dependencies = [ "version_check", ] +[[package]] +name = "geo-types" +version = "0.7.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94776032c45f950d30a13af6113c2ad5625316c9abfbccee4dd5a6695f8fe0f5" +dependencies = [ + "approx", + "num-traits", + "serde", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -1232,6 +1261,16 @@ version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" +[[package]] +name = "iso6709parse" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5090db9c6a716d1f4eeb729957e889e9c28156061c825cbccd44950cf0f3c66" +dependencies = [ + "geo-types", + "nom", +] + [[package]] name = "itertools" version = "0.13.0" @@ -1385,6 +1424,12 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "mio" version = "1.1.1" @@ -1428,6 +1473,32 @@ dependencies = [ "signatory", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom-exif" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d759833b65510dc55d774e34b7ef6665ffd293eae44844e189a9da2bea53d47a" +dependencies = [ + "bytes", + "chrono", + "iso6709parse", + "nom", + "regex", + "serde", + "thiserror", + "tracing", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -3563,6 +3634,7 @@ dependencies = [ name = "worker" version = "0.1.0" dependencies = [ + "adapters-exif", "adapters-nats", "adapters-postgres", "adapters-storage", diff --git a/Cargo.toml b/Cargo.toml index 7943174..e12b254 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ members = [ "crates/adapters/event-payload", "crates/adapters/event-transport", "crates/adapters/nats", + "crates/adapters/exif", "crates/presentation", "crates/bootstrap", "crates/worker", @@ -46,6 +47,7 @@ adapters-storage = { path = "crates/adapters/storage" } event-payload = { path = "crates/adapters/event-payload" } event-transport = { path = "crates/adapters/event-transport" } adapters-nats = { path = "crates/adapters/nats" } +adapters-exif = { path = "crates/adapters/exif" } async-nats = "0.48" async-stream = "0.3" presentation = { path = "crates/presentation" } diff --git a/crates/adapters/exif/Cargo.toml b/crates/adapters/exif/Cargo.toml new file mode 100644 index 0000000..dc0b38d --- /dev/null +++ b/crates/adapters/exif/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "adapters-exif" +version = "0.1.0" +edition = "2024" + +[dependencies] +domain = { workspace = true } +bytes = { workspace = true } +nom-exif = { version = "2.5", features = ["serde"] } diff --git a/crates/adapters/exif/src/lib.rs b/crates/adapters/exif/src/lib.rs new file mode 100644 index 0000000..74fe814 --- /dev/null +++ b/crates/adapters/exif/src/lib.rs @@ -0,0 +1,80 @@ +use bytes::Bytes; +use domain::{ + errors::DomainError, + ports::MetadataExtractorPort, + value_objects::{MetadataValue, StructuredData}, +}; +use nom_exif::{ExifIter, MediaParser, MediaSource, TrackInfo}; +use std::io::Cursor; + +pub struct NomExifExtractor; + +impl MetadataExtractorPort for NomExifExtractor { + fn extract(&self, bytes: &Bytes) -> Result { + if bytes.is_empty() { + return Ok(StructuredData::new()); + } + + let ms = match MediaSource::seekable(Cursor::new(bytes.as_ref())) { + Ok(ms) => ms, + Err(_) => return Ok(StructuredData::new()), + }; + + let mut parser = MediaParser::new(); + let mut data = StructuredData::new(); + + if ms.has_exif() { + let iter: ExifIter = match parser.parse(ms) { + Ok(iter) => iter, + Err(_) => return Ok(data), + }; + + for mut entry in iter { + let tag_name = match entry.tag() { + Some(t) => t.to_string(), + None => continue, + }; + + if tag_name.starts_with("Unknown(") { + continue; + } + + let value = match entry.take_result() { + Ok(v) => v.to_string(), + Err(_) => continue, + }; + + if is_noisy_value(&value) { + continue; + } + + data.insert(tag_name, MetadataValue::String(value)); + } + } else { + let track_info = match parser.parse::<_, _, TrackInfo>(ms) { + Ok(info) => info, + Err(_) => return Ok(data), + }; + + for (key, val) in track_info { + data.insert( + format!("track:{}", key), + MetadataValue::String(val.to_string()), + ); + } + } + + Ok(data) + } +} + +fn is_noisy_value(v: &str) -> bool { + v.starts_with("U16Array") + || v.starts_with("U32Array") + || v.starts_with("U8Array") + || v.starts_with("URationalArray") + || v.starts_with("Undefined") +} + +#[cfg(test)] +mod tests; diff --git a/crates/adapters/exif/src/tests.rs b/crates/adapters/exif/src/tests.rs new file mode 100644 index 0000000..5f0190d --- /dev/null +++ b/crates/adapters/exif/src/tests.rs @@ -0,0 +1,19 @@ +use crate::NomExifExtractor; +use bytes::Bytes; +use domain::ports::MetadataExtractorPort; + +#[test] +fn empty_bytes_returns_empty_data() { + let extractor = NomExifExtractor; + let result = extractor.extract(&Bytes::new()); + assert!(result.is_ok()); + assert!(result.unwrap().is_empty()); +} + +#[test] +fn garbage_bytes_returns_empty_data() { + let extractor = NomExifExtractor; + let result = extractor.extract(&Bytes::from_static(b"not a real image file")); + assert!(result.is_ok()); + assert!(result.unwrap().is_empty()); +} diff --git a/crates/domain/src/catalog/ports.rs b/crates/domain/src/catalog/ports.rs index ba870d2..dd37720 100644 --- a/crates/domain/src/catalog/ports.rs +++ b/crates/domain/src/catalog/ports.rs @@ -3,8 +3,9 @@ use super::entities::{ MetadataSource, }; use crate::common::errors::DomainError; -use crate::common::value_objects::{Checksum, SystemId}; +use crate::common::value_objects::{Checksum, StructuredData, SystemId}; use async_trait::async_trait; +use bytes::Bytes; // --- AssetRepository --- @@ -74,3 +75,9 @@ pub trait DuplicateRepository: Send + Sync { async fn find_by_asset(&self, asset_id: &SystemId) -> Result, DomainError>; async fn save(&self, group: &DuplicateGroup) -> Result<(), DomainError>; } + +// --- MetadataExtractorPort --- + +pub trait MetadataExtractorPort: Send + Sync { + fn extract(&self, bytes: &Bytes) -> Result; +} diff --git a/crates/worker/Cargo.toml b/crates/worker/Cargo.toml index ad85ae4..5d49b1e 100644 --- a/crates/worker/Cargo.toml +++ b/crates/worker/Cargo.toml @@ -15,6 +15,7 @@ adapters-postgres = { path = "../adapters/postgres" } adapters-storage = { workspace = true } adapters-nats = { workspace = true } event-transport = { workspace = true } +adapters-exif = { workspace = true } async-nats = { workspace = true } futures = { workspace = true } diff --git a/crates/worker/src/factories/plugins.rs b/crates/worker/src/factories/plugins.rs index 88c46b6..1e50772 100644 --- a/crates/worker/src/factories/plugins.rs +++ b/crates/worker/src/factories/plugins.rs @@ -1,6 +1,6 @@ use crate::plugin_registry::InMemoryPluginRegistry; use crate::plugins::{MetadataExtractorPlugin, NoOpPlugin, SidecarSyncPlugin}; -use domain::ports::SidecarWriterPort; +use domain::ports::{MetadataExtractorPort, SidecarWriterPort}; use std::sync::Arc; use super::Repos; @@ -9,6 +9,7 @@ pub fn build_plugin_registry( repos: &Repos, file_storage: Arc, sidecar_writer: Arc, + extractor: Arc, ) -> InMemoryPluginRegistry { let mut registry = InMemoryPluginRegistry::new(); @@ -17,6 +18,7 @@ pub fn build_plugin_registry( repos.asset.clone(), file_storage, repos.metadata.clone(), + extractor, ))); let export_handler = Arc::new(application::sidecar::ExportSidecarHandler::new( diff --git a/crates/worker/src/main.rs b/crates/worker/src/main.rs index 3060303..81efd29 100644 --- a/crates/worker/src/main.rs +++ b/crates/worker/src/main.rs @@ -51,7 +51,14 @@ async fn main() -> anyhow::Result<()> { event_transport::CompositeEventPublisher::new(nats_publisher, event_store), ); - let registry = Arc::new(build_plugin_registry(&repos, file_storage, sidecar_writer)); + let extractor: Arc = + Arc::new(adapters_exif::NomExifExtractor); + let registry = Arc::new(build_plugin_registry( + &repos, + file_storage, + sidecar_writer, + extractor, + )); let process_next = Arc::new(build_process_next_handler( &repos, registry, diff --git a/crates/worker/src/plugins/metadata_extractor.rs b/crates/worker/src/plugins/metadata_extractor.rs index 0b3529e..aeb410c 100644 --- a/crates/worker/src/plugins/metadata_extractor.rs +++ b/crates/worker/src/plugins/metadata_extractor.rs @@ -2,7 +2,10 @@ use async_trait::async_trait; use domain::{ entities::{AssetMetadata, MetadataSource}, errors::DomainError, - ports::{AssetMetadataRepository, AssetRepository, FileStoragePort, PluginExecutor}, + ports::{ + AssetMetadataRepository, AssetRepository, FileStoragePort, MetadataExtractorPort, + PluginExecutor, + }, value_objects::{MetadataValue, StructuredData, SystemId}, }; use std::sync::Arc; @@ -12,6 +15,7 @@ pub struct MetadataExtractorPlugin { asset_repo: Arc, file_storage: Arc, metadata_repo: Arc, + extractor: Arc, } impl MetadataExtractorPlugin { @@ -19,11 +23,13 @@ impl MetadataExtractorPlugin { asset_repo: Arc, file_storage: Arc, metadata_repo: Arc, + extractor: Arc, ) -> Self { Self { asset_repo, file_storage, metadata_repo, + extractor, } } } @@ -52,17 +58,16 @@ impl PluginExecutor for MetadataExtractorPlugin { let path = &asset.source_reference.relative_path; let data = self.file_storage.read_file(path).await?; - let file_size = data.len() as i64; - let mut extracted = StructuredData::new(); - extracted.insert("file_size_bytes", MetadataValue::Integer(file_size)); + let mut extracted = self.extractor.extract(&data)?; + extracted.insert("file_size_bytes", MetadataValue::Integer(data.len() as i64)); extracted.insert("mime_type", MetadataValue::String(asset.mime_type.clone())); let metadata = AssetMetadata::new(asset_id, MetadataSource::ExifExtracted, extracted.clone()); self.metadata_repo.save(&metadata).await?; - info!(asset_id = %asset_id, file_size, "extracted basic metadata"); + info!(asset_id = %asset_id, tags = extracted.len(), "extracted metadata"); Ok(extracted) } }