From 776389af49adf195042d455b9d76213942fecf20 Mon Sep 17 00:00:00 2001 From: Gabriel Kaszewski Date: Wed, 8 Apr 2026 01:47:17 +0200 Subject: [PATCH] feat(ug-parser): parse sections, lyrics, and chord positions from UG HTML --- crates/infrastructure/ug-parser/src/parser.rs | 143 +++++++++++++++++- 1 file changed, 140 insertions(+), 3 deletions(-) diff --git a/crates/infrastructure/ug-parser/src/parser.rs b/crates/infrastructure/ug-parser/src/parser.rs index 749eda5..8f0c2d9 100644 --- a/crates/infrastructure/ug-parser/src/parser.rs +++ b/crates/infrastructure/ug-parser/src/parser.rs @@ -1,4 +1,3 @@ -#[allow(unused_imports)] use domain::{ Chord, ChordPosition, LyricLine, ParseError, Section, SectionKind, Song, SongMeta, TabParserPort, @@ -77,8 +76,117 @@ impl UgHtmlParser { }) } - fn parse_sections(_document: &Html) -> Result, ParseError> { - Ok(vec![]) + fn parse_sections(document: &Html) -> Result, ParseError> { + let pre_sel = Selector::parse("pre").unwrap(); + let pre = document + .select(&pre_sel) + .next() + .ok_or(ParseError::MissingContent)?; + + let inner_html = pre.inner_html(); + let raw_lines: Vec<&str> = inner_html.split('\n').collect(); + + let mut sections: Vec
= Vec::new(); + let mut current_section: Option
= None; + let mut pending_chords: Vec = Vec::new(); + + for raw_line in &raw_lines { + let text_only = Self::strip_html(raw_line); + let trimmed = text_only.trim(); + + // Section header: "[Chorus]", "[Verse 1]", etc. + if let Some(label) = Self::extract_section_label(trimmed) { + if let Some(sec) = current_section.take() { + sections.push(sec); + } + current_section = Some(Section { + kind: SectionKind::from_label(&label), + label: Some(label), + lines: Vec::new(), + }); + pending_chords.clear(); + continue; + } + + // Blank line — flush pending chords if any (chord line with no following lyric) + if trimmed.is_empty() { + if !pending_chords.is_empty() { + if let Some(sec) = current_section.as_mut() { + sec.lines.push(LyricLine { + text: String::new(), + chords: pending_chords.drain(..).collect(), + }); + } + } + continue; + } + + // Chord line: contains span elements + if raw_line.contains(" String { + let frag = Html::parse_fragment(s); + frag.root_element().text().collect() + } + + /// If `s` matches `[Label]`, return `Label`. Else None. + fn extract_section_label(s: &str) -> Option { + let s = s.trim(); + if s.starts_with('[') && s.ends_with(']') && s.len() > 2 { + Some(s[1..s.len() - 1].to_string()) + } else { + None + } + } + + /// Parse a chord line (raw HTML) into chord positions. + /// Walks text nodes and span[data-name] elements in order to compute offsets. 
+    fn parse_chord_line(line_html: &str) -> Vec<ChordPosition> {
+        use scraper::{node::Node, ElementRef};
+        let frag = Html::parse_fragment(line_html);
+        let root = frag.root_element();
+        let mut chords = Vec::new();
+        // Column, in characters, where the next node starts in the rendered line.
+        let mut offset = 0usize;
+
+        for child in root.children() {
+            match child.value() {
+                Node::Text(text) => offset += text.chars().count(),
+                Node::Element(el) => {
+                    if el.name() == "span" {
+                        if let Some(chord) = el.attr("data-name").and_then(|n| Chord::parse(n)) {
+                            chords.push(ChordPosition { offset, chord });
+                        }
+                    }
+                    // Advance by the element's rendered width even when it is not a
+                    // parseable chord; otherwise every later chord would shift left.
+                    offset += ElementRef::wrap(child)
+                        .map_or(0, |e| e.text().map(|t| t.chars().count()).sum::<usize>());
+                }
+                _ => {}
+            }
+        }
+        chords
     }
 }
@@ -124,4 +232,33 @@ mod tests {
         let song = parser.parse(&html).unwrap();
         assert_eq!(song.meta.capo, None);
     }
+
+    #[test]
+    fn parses_sections() {
+        let parser = UgHtmlParser;
+        let html = sample_html("A DROP IN THE OCEAN.html");
+        let song = parser.parse(&html).unwrap();
+        assert!(
+            song.sections.len() >= 3,
+            "expected >=3 sections, got {}",
+            song.sections.len()
+        );
+        assert_eq!(song.sections[0].kind, domain::SectionKind::Chorus);
+    }
+
+    #[test]
+    fn parses_chord_positions() {
+        let parser = UgHtmlParser;
+        let html = sample_html("A DROP IN THE OCEAN.html");
+        let song = parser.parse(&html).unwrap();
+        // First section, first line: "A drop in the ocean,"
+        // Chord "Em" should be at offset 0 (or small offset from leading whitespace)
+        let first_line = &song.sections[0].lines[0];
+        assert_eq!(first_line.text, "A drop in the ocean,");
+        assert!(
+            first_line.chords[0].chord.name(true) == "Em",
+            "expected Em chord, got {}",
+            first_line.chords[0].chord.name(true)
+        );
+    }
 }