feat: real EXIF extraction via adapters-exif crate
- MetadataExtractorPort in domain (bytes → StructuredData)
- adapters-exif: NomExifExtractor using nom-exif, handles EXIF + TrackInfo
- Worker's MetadataExtractorPlugin delegates to the port and no longer knows about nom-exif
- Filters noisy binary values (U8Array, U16Array, U32Array, URationalArray, Undefined) and tags named Unknown(...)
This commit is contained in:
72
Cargo.lock
generated
72
Cargo.lock
generated
@@ -17,6 +17,15 @@ dependencies = [
|
|||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "adapters-exif"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"bytes",
|
||||||
|
"domain",
|
||||||
|
"nom-exif",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "adapters-nats"
|
name = "adapters-nats"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
@@ -119,6 +128,15 @@ dependencies = [
|
|||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "approx"
|
||||||
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6"
|
||||||
|
dependencies = [
|
||||||
|
"num-traits",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-nats"
|
name = "async-nats"
|
||||||
version = "0.48.0"
|
version = "0.48.0"
|
||||||
@@ -836,6 +854,17 @@ dependencies = [
|
|||||||
"version_check",
|
"version_check",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "geo-types"
|
||||||
|
version = "0.7.19"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "94776032c45f950d30a13af6113c2ad5625316c9abfbccee4dd5a6695f8fe0f5"
|
||||||
|
dependencies = [
|
||||||
|
"approx",
|
||||||
|
"num-traits",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "getrandom"
|
name = "getrandom"
|
||||||
version = "0.2.17"
|
version = "0.2.17"
|
||||||
@@ -1232,6 +1261,16 @@ version = "2.12.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
|
checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "iso6709parse"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b5090db9c6a716d1f4eeb729957e889e9c28156061c825cbccd44950cf0f3c66"
|
||||||
|
dependencies = [
|
||||||
|
"geo-types",
|
||||||
|
"nom",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itertools"
|
name = "itertools"
|
||||||
version = "0.13.0"
|
version = "0.13.0"
|
||||||
@@ -1385,6 +1424,12 @@ version = "0.3.17"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
|
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "minimal-lexical"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mio"
|
name = "mio"
|
||||||
version = "1.1.1"
|
version = "1.1.1"
|
||||||
@@ -1428,6 +1473,32 @@ dependencies = [
|
|||||||
"signatory",
|
"signatory",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nom"
|
||||||
|
version = "7.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
"minimal-lexical",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nom-exif"
|
||||||
|
version = "2.8.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d759833b65510dc55d774e34b7ef6665ffd293eae44844e189a9da2bea53d47a"
|
||||||
|
dependencies = [
|
||||||
|
"bytes",
|
||||||
|
"chrono",
|
||||||
|
"iso6709parse",
|
||||||
|
"nom",
|
||||||
|
"regex",
|
||||||
|
"serde",
|
||||||
|
"thiserror",
|
||||||
|
"tracing",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nu-ansi-term"
|
name = "nu-ansi-term"
|
||||||
version = "0.50.3"
|
version = "0.50.3"
|
||||||
@@ -3563,6 +3634,7 @@ dependencies = [
|
|||||||
name = "worker"
|
name = "worker"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"adapters-exif",
|
||||||
"adapters-nats",
|
"adapters-nats",
|
||||||
"adapters-postgres",
|
"adapters-postgres",
|
||||||
"adapters-storage",
|
"adapters-storage",
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ members = [
|
|||||||
"crates/adapters/event-payload",
|
"crates/adapters/event-payload",
|
||||||
"crates/adapters/event-transport",
|
"crates/adapters/event-transport",
|
||||||
"crates/adapters/nats",
|
"crates/adapters/nats",
|
||||||
|
"crates/adapters/exif",
|
||||||
"crates/presentation",
|
"crates/presentation",
|
||||||
"crates/bootstrap",
|
"crates/bootstrap",
|
||||||
"crates/worker",
|
"crates/worker",
|
||||||
@@ -46,6 +47,7 @@ adapters-storage = { path = "crates/adapters/storage" }
|
|||||||
event-payload = { path = "crates/adapters/event-payload" }
|
event-payload = { path = "crates/adapters/event-payload" }
|
||||||
event-transport = { path = "crates/adapters/event-transport" }
|
event-transport = { path = "crates/adapters/event-transport" }
|
||||||
adapters-nats = { path = "crates/adapters/nats" }
|
adapters-nats = { path = "crates/adapters/nats" }
|
||||||
|
adapters-exif = { path = "crates/adapters/exif" }
|
||||||
async-nats = "0.48"
|
async-nats = "0.48"
|
||||||
async-stream = "0.3"
|
async-stream = "0.3"
|
||||||
presentation = { path = "crates/presentation" }
|
presentation = { path = "crates/presentation" }
|
||||||
|
|||||||
9
crates/adapters/exif/Cargo.toml
Normal file
9
crates/adapters/exif/Cargo.toml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
[package]
|
||||||
|
name = "adapters-exif"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
domain = { workspace = true }
|
||||||
|
bytes = { workspace = true }
|
||||||
|
nom-exif = { version = "2.5", features = ["serde"] }
|
||||||
80
crates/adapters/exif/src/lib.rs
Normal file
80
crates/adapters/exif/src/lib.rs
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
use bytes::Bytes;
|
||||||
|
use domain::{
|
||||||
|
errors::DomainError,
|
||||||
|
ports::MetadataExtractorPort,
|
||||||
|
value_objects::{MetadataValue, StructuredData},
|
||||||
|
};
|
||||||
|
use nom_exif::{ExifIter, MediaParser, MediaSource, TrackInfo};
|
||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
|
/// Stateless metadata extractor that implements `MetadataExtractorPort`
/// on top of the `nom-exif` crate.
pub struct NomExifExtractor;
|
||||||
|
|
||||||
|
impl MetadataExtractorPort for NomExifExtractor {
|
||||||
|
fn extract(&self, bytes: &Bytes) -> Result<StructuredData, DomainError> {
|
||||||
|
if bytes.is_empty() {
|
||||||
|
return Ok(StructuredData::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let ms = match MediaSource::seekable(Cursor::new(bytes.as_ref())) {
|
||||||
|
Ok(ms) => ms,
|
||||||
|
Err(_) => return Ok(StructuredData::new()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut parser = MediaParser::new();
|
||||||
|
let mut data = StructuredData::new();
|
||||||
|
|
||||||
|
if ms.has_exif() {
|
||||||
|
let iter: ExifIter = match parser.parse(ms) {
|
||||||
|
Ok(iter) => iter,
|
||||||
|
Err(_) => return Ok(data),
|
||||||
|
};
|
||||||
|
|
||||||
|
for mut entry in iter {
|
||||||
|
let tag_name = match entry.tag() {
|
||||||
|
Some(t) => t.to_string(),
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
if tag_name.starts_with("Unknown(") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let value = match entry.take_result() {
|
||||||
|
Ok(v) => v.to_string(),
|
||||||
|
Err(_) => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
if is_noisy_value(&value) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
data.insert(tag_name, MetadataValue::String(value));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let track_info = match parser.parse::<_, _, TrackInfo>(ms) {
|
||||||
|
Ok(info) => info,
|
||||||
|
Err(_) => return Ok(data),
|
||||||
|
};
|
||||||
|
|
||||||
|
for (key, val) in track_info {
|
||||||
|
data.insert(
|
||||||
|
format!("track:{}", key),
|
||||||
|
MetadataValue::String(val.to_string()),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(data)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when the rendered value `v` starts with one of the
/// prefixes treated as noise: `U16Array`, `U32Array`, `U8Array`,
/// `URationalArray` or `Undefined`.
///
/// NOTE(review): these are presumably the prefixes nom-exif uses when
/// displaying array/undefined payloads — confirm against the crate's
/// `Display` output. The check is purely textual, so an ordinary text value
/// beginning with one of these words is rejected as well.
fn is_noisy_value(v: &str) -> bool {
    const NOISY_PREFIXES: [&str; 5] = [
        "U16Array",
        "U32Array",
        "U8Array",
        "URationalArray",
        "Undefined",
    ];

    NOISY_PREFIXES.iter().any(|&prefix| v.starts_with(prefix))
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests;
|
||||||
19
crates/adapters/exif/src/tests.rs
Normal file
19
crates/adapters/exif/src/tests.rs
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
use crate::NomExifExtractor;
|
||||||
|
use bytes::Bytes;
|
||||||
|
use domain::ports::MetadataExtractorPort;
|
||||||
|
|
||||||
|
// An empty buffer must be accepted and produce no metadata entries.
#[test]
fn empty_bytes_returns_empty_data() {
    let extracted = NomExifExtractor
        .extract(&Bytes::new())
        .expect("extracting from an empty buffer should succeed");

    assert!(extracted.is_empty());
}
|
||||||
|
|
||||||
|
// Bytes that are not a recognisable media file must be tolerated: the
// extractor succeeds and reports no metadata entries.
#[test]
fn garbage_bytes_returns_empty_data() {
    let payload = Bytes::from_static(b"not a real image file");

    let extracted = NomExifExtractor
        .extract(&payload)
        .expect("extracting from unrecognised bytes should succeed");

    assert!(extracted.is_empty());
}
|
||||||
@@ -3,8 +3,9 @@ use super::entities::{
|
|||||||
MetadataSource,
|
MetadataSource,
|
||||||
};
|
};
|
||||||
use crate::common::errors::DomainError;
|
use crate::common::errors::DomainError;
|
||||||
use crate::common::value_objects::{Checksum, SystemId};
|
use crate::common::value_objects::{Checksum, StructuredData, SystemId};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use bytes::Bytes;
|
||||||
|
|
||||||
// --- AssetRepository ---
|
// --- AssetRepository ---
|
||||||
|
|
||||||
@@ -74,3 +75,9 @@ pub trait DuplicateRepository: Send + Sync {
|
|||||||
async fn find_by_asset(&self, asset_id: &SystemId) -> Result<Vec<DuplicateGroup>, DomainError>;
|
async fn find_by_asset(&self, asset_id: &SystemId) -> Result<Vec<DuplicateGroup>, DomainError>;
|
||||||
async fn save(&self, group: &DuplicateGroup) -> Result<(), DomainError>;
|
async fn save(&self, group: &DuplicateGroup) -> Result<(), DomainError>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- MetadataExtractorPort ---
|
||||||
|
|
||||||
|
/// Port for turning the raw bytes of a file into structured metadata.
///
/// The `Send + Sync` bound lets implementations be shared behind
/// `Arc<dyn MetadataExtractorPort>`.
pub trait MetadataExtractorPort: Send + Sync {
|
||||||
|
/// Extracts metadata from `bytes`, returning the collected
/// `StructuredData` or a `DomainError`.
fn extract(&self, bytes: &Bytes) -> Result<StructuredData, DomainError>;
|
||||||
|
}
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ adapters-postgres = { path = "../adapters/postgres" }
|
|||||||
adapters-storage = { workspace = true }
|
adapters-storage = { workspace = true }
|
||||||
adapters-nats = { workspace = true }
|
adapters-nats = { workspace = true }
|
||||||
event-transport = { workspace = true }
|
event-transport = { workspace = true }
|
||||||
|
adapters-exif = { workspace = true }
|
||||||
async-nats = { workspace = true }
|
async-nats = { workspace = true }
|
||||||
|
|
||||||
futures = { workspace = true }
|
futures = { workspace = true }
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use crate::plugin_registry::InMemoryPluginRegistry;
|
use crate::plugin_registry::InMemoryPluginRegistry;
|
||||||
use crate::plugins::{MetadataExtractorPlugin, NoOpPlugin, SidecarSyncPlugin};
|
use crate::plugins::{MetadataExtractorPlugin, NoOpPlugin, SidecarSyncPlugin};
|
||||||
use domain::ports::SidecarWriterPort;
|
use domain::ports::{MetadataExtractorPort, SidecarWriterPort};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use super::Repos;
|
use super::Repos;
|
||||||
@@ -9,6 +9,7 @@ pub fn build_plugin_registry(
|
|||||||
repos: &Repos,
|
repos: &Repos,
|
||||||
file_storage: Arc<dyn domain::ports::FileStoragePort>,
|
file_storage: Arc<dyn domain::ports::FileStoragePort>,
|
||||||
sidecar_writer: Arc<dyn SidecarWriterPort>,
|
sidecar_writer: Arc<dyn SidecarWriterPort>,
|
||||||
|
extractor: Arc<dyn MetadataExtractorPort>,
|
||||||
) -> InMemoryPluginRegistry {
|
) -> InMemoryPluginRegistry {
|
||||||
let mut registry = InMemoryPluginRegistry::new();
|
let mut registry = InMemoryPluginRegistry::new();
|
||||||
|
|
||||||
@@ -17,6 +18,7 @@ pub fn build_plugin_registry(
|
|||||||
repos.asset.clone(),
|
repos.asset.clone(),
|
||||||
file_storage,
|
file_storage,
|
||||||
repos.metadata.clone(),
|
repos.metadata.clone(),
|
||||||
|
extractor,
|
||||||
)));
|
)));
|
||||||
|
|
||||||
let export_handler = Arc::new(application::sidecar::ExportSidecarHandler::new(
|
let export_handler = Arc::new(application::sidecar::ExportSidecarHandler::new(
|
||||||
|
|||||||
@@ -51,7 +51,14 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
event_transport::CompositeEventPublisher::new(nats_publisher, event_store),
|
event_transport::CompositeEventPublisher::new(nats_publisher, event_store),
|
||||||
);
|
);
|
||||||
|
|
||||||
let registry = Arc::new(build_plugin_registry(&repos, file_storage, sidecar_writer));
|
let extractor: Arc<dyn domain::ports::MetadataExtractorPort> =
|
||||||
|
Arc::new(adapters_exif::NomExifExtractor);
|
||||||
|
let registry = Arc::new(build_plugin_registry(
|
||||||
|
&repos,
|
||||||
|
file_storage,
|
||||||
|
sidecar_writer,
|
||||||
|
extractor,
|
||||||
|
));
|
||||||
let process_next = Arc::new(build_process_next_handler(
|
let process_next = Arc::new(build_process_next_handler(
|
||||||
&repos,
|
&repos,
|
||||||
registry,
|
registry,
|
||||||
|
|||||||
@@ -2,7 +2,10 @@ use async_trait::async_trait;
|
|||||||
use domain::{
|
use domain::{
|
||||||
entities::{AssetMetadata, MetadataSource},
|
entities::{AssetMetadata, MetadataSource},
|
||||||
errors::DomainError,
|
errors::DomainError,
|
||||||
ports::{AssetMetadataRepository, AssetRepository, FileStoragePort, PluginExecutor},
|
ports::{
|
||||||
|
AssetMetadataRepository, AssetRepository, FileStoragePort, MetadataExtractorPort,
|
||||||
|
PluginExecutor,
|
||||||
|
},
|
||||||
value_objects::{MetadataValue, StructuredData, SystemId},
|
value_objects::{MetadataValue, StructuredData, SystemId},
|
||||||
};
|
};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
@@ -12,6 +15,7 @@ pub struct MetadataExtractorPlugin {
|
|||||||
asset_repo: Arc<dyn AssetRepository>,
|
asset_repo: Arc<dyn AssetRepository>,
|
||||||
file_storage: Arc<dyn FileStoragePort>,
|
file_storage: Arc<dyn FileStoragePort>,
|
||||||
metadata_repo: Arc<dyn AssetMetadataRepository>,
|
metadata_repo: Arc<dyn AssetMetadataRepository>,
|
||||||
|
extractor: Arc<dyn MetadataExtractorPort>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MetadataExtractorPlugin {
|
impl MetadataExtractorPlugin {
|
||||||
@@ -19,11 +23,13 @@ impl MetadataExtractorPlugin {
|
|||||||
asset_repo: Arc<dyn AssetRepository>,
|
asset_repo: Arc<dyn AssetRepository>,
|
||||||
file_storage: Arc<dyn FileStoragePort>,
|
file_storage: Arc<dyn FileStoragePort>,
|
||||||
metadata_repo: Arc<dyn AssetMetadataRepository>,
|
metadata_repo: Arc<dyn AssetMetadataRepository>,
|
||||||
|
extractor: Arc<dyn MetadataExtractorPort>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
asset_repo,
|
asset_repo,
|
||||||
file_storage,
|
file_storage,
|
||||||
metadata_repo,
|
metadata_repo,
|
||||||
|
extractor,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -52,17 +58,16 @@ impl PluginExecutor for MetadataExtractorPlugin {
|
|||||||
|
|
||||||
let path = &asset.source_reference.relative_path;
|
let path = &asset.source_reference.relative_path;
|
||||||
let data = self.file_storage.read_file(path).await?;
|
let data = self.file_storage.read_file(path).await?;
|
||||||
let file_size = data.len() as i64;
|
|
||||||
|
|
||||||
let mut extracted = StructuredData::new();
|
let mut extracted = self.extractor.extract(&data)?;
|
||||||
extracted.insert("file_size_bytes", MetadataValue::Integer(file_size));
|
extracted.insert("file_size_bytes", MetadataValue::Integer(data.len() as i64));
|
||||||
extracted.insert("mime_type", MetadataValue::String(asset.mime_type.clone()));
|
extracted.insert("mime_type", MetadataValue::String(asset.mime_type.clone()));
|
||||||
|
|
||||||
let metadata =
|
let metadata =
|
||||||
AssetMetadata::new(asset_id, MetadataSource::ExifExtracted, extracted.clone());
|
AssetMetadata::new(asset_id, MetadataSource::ExifExtracted, extracted.clone());
|
||||||
self.metadata_repo.save(&metadata).await?;
|
self.metadata_repo.save(&metadata).await?;
|
||||||
|
|
||||||
info!(asset_id = %asset_id, file_size, "extracted basic metadata");
|
info!(asset_id = %asset_id, tags = extracted.len(), "extracted metadata");
|
||||||
Ok(extracted)
|
Ok(extracted)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user