From 1a0458653ac412cb4d11d1f0a35734ae01787600 Mon Sep 17 00:00:00 2001 From: Gabriel Kaszewski Date: Wed, 8 Apr 2026 01:44:06 +0200 Subject: [PATCH] feat(ug-parser): implement UgHtmlParser metadata extraction --- crates/infrastructure/ug-parser/src/parser.rs | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/crates/infrastructure/ug-parser/src/parser.rs b/crates/infrastructure/ug-parser/src/parser.rs index 878a83b..749eda5 100644 --- a/crates/infrastructure/ug-parser/src/parser.rs +++ b/crates/infrastructure/ug-parser/src/parser.rs @@ -1 +1,127 @@ +#[allow(unused_imports)] +use domain::{ + Chord, ChordPosition, LyricLine, ParseError, Section, SectionKind, Song, SongMeta, + TabParserPort, +}; +use scraper::{Html, Selector}; + pub struct UgHtmlParser; + +impl UgHtmlParser { + fn parse_meta(document: &Html) -> Result { + let title_sel = Selector::parse("title").unwrap(); + let raw_title = document + .select(&title_sel) + .next() + .map(|el| el.text().collect::()) + .unwrap_or_default(); + let raw_title = raw_title.trim(); + + let (title, artist) = if let Some(by_pos) = raw_title.rfind(" by ") { + let after_by = &raw_title[by_pos + 4..]; + let artist = after_by + .split(" @ ") + .next() + .unwrap_or(after_by) + .trim() + .to_string(); + let title_part = raw_title[..by_pos].trim(); + // Strip leading "(N) " + let title_part = title_part + .trim_start_matches(|c: char| c == '(' || c.is_numeric() || c == ')' || c == ' '); + // Strip " CHORDS (ver N)" suffix + let title_part = title_part + .split(" CHORDS") + .next() + .unwrap_or(title_part) + .trim(); + let title_cased = title_part + .split_whitespace() + .map(|w| { + let mut c = w.chars(); + match c.next() { + None => String::new(), + Some(f) => f.to_uppercase().to_string() + &c.as_str().to_lowercase(), + } + }) + .collect::>() + .join(" "); + (title_cased, artist) + } else { + (raw_title.to_string(), String::new()) + }; + + let span_sel = Selector::parse("span").unwrap(); + let mut capo: Option = None; + let mut found_capo_label = false; + for span in document.select(&span_sel) { + let text = span.text().collect::(); + let text = text.trim().to_string(); + if text == "Capo: " || text == "Capo:" { + found_capo_label = true; + } else if found_capo_label { + if text != "No capo" && !text.is_empty() { + capo = text.parse::().ok(); + } + found_capo_label = false; + } + } + + Ok(SongMeta { + title, + artist, + capo, + original_key: None, + tuning: None, + tempo: None, + }) + } + + fn parse_sections(_document: &Html) -> Result, ParseError> { + Ok(vec![]) + } +} + +impl TabParserPort for UgHtmlParser { + fn parse(&self, html: &str) -> Result { + let document = Html::parse_document(html); + let meta = Self::parse_meta(&document)?; + let sections = Self::parse_sections(&document)?; + Ok(Song { meta, sections }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use domain::TabParserPort; + + fn sample_html(name: &str) -> String { + let path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .parent() + .unwrap() + .parent() + .unwrap() + .join(format!("samples/{}", name)); + std::fs::read_to_string(path).unwrap() + } + + #[test] + fn parses_artist_and_title() { + let parser = UgHtmlParser; + let html = sample_html("A DROP IN THE OCEAN.html"); + let song = parser.parse(&html).unwrap(); + assert_eq!(song.meta.artist, "Ron Pope"); + assert_eq!(song.meta.title, "A Drop In The Ocean"); + } + + #[test] + fn capo_is_none_when_no_capo() { + let parser = UgHtmlParser; + let html = sample_html("A DROP IN THE OCEAN.html"); + let song = parser.parse(&html).unwrap(); + assert_eq!(song.meta.capo, None); + } +}