feat(ug-parser): parse sections, lyrics, and chord positions from UG HTML

This commit is contained in:
2026-04-08 01:47:17 +02:00
parent 1a0458653a
commit 776389af49

View File

@@ -1,4 +1,3 @@
#[allow(unused_imports)]
use domain::{ use domain::{
Chord, ChordPosition, LyricLine, ParseError, Section, SectionKind, Song, SongMeta, Chord, ChordPosition, LyricLine, ParseError, Section, SectionKind, Song, SongMeta,
TabParserPort, TabParserPort,
@@ -77,8 +76,117 @@ impl UgHtmlParser {
}) })
} }
fn parse_sections(_document: &Html) -> Result<Vec<Section>, ParseError> { fn parse_sections(document: &Html) -> Result<Vec<Section>, ParseError> {
Ok(vec![]) let pre_sel = Selector::parse("pre").unwrap();
let pre = document
.select(&pre_sel)
.next()
.ok_or(ParseError::MissingContent)?;
let inner_html = pre.inner_html();
let raw_lines: Vec<&str> = inner_html.split('\n').collect();
let mut sections: Vec<Section> = Vec::new();
let mut current_section: Option<Section> = None;
let mut pending_chords: Vec<ChordPosition> = Vec::new();
for raw_line in &raw_lines {
let text_only = Self::strip_html(raw_line);
let trimmed = text_only.trim();
// Section header: "[Chorus]", "[Verse 1]", etc.
if let Some(label) = Self::extract_section_label(trimmed) {
if let Some(sec) = current_section.take() {
sections.push(sec);
}
current_section = Some(Section {
kind: SectionKind::from_label(&label),
label: Some(label),
lines: Vec::new(),
});
pending_chords.clear();
continue;
}
// Blank line — flush pending chords if any (chord line with no following lyric)
if trimmed.is_empty() {
if !pending_chords.is_empty() {
if let Some(sec) = current_section.as_mut() {
sec.lines.push(LyricLine {
text: String::new(),
chords: pending_chords.drain(..).collect(),
});
}
}
continue;
}
// Chord line: contains span elements
if raw_line.contains("<span") {
pending_chords = Self::parse_chord_line(raw_line);
continue;
}
// Lyric line
if let Some(sec) = current_section.as_mut() {
sec.lines.push(LyricLine {
text: trimmed.to_string(),
chords: pending_chords.drain(..).collect(),
});
}
}
if let Some(sec) = current_section {
sections.push(sec);
}
Ok(sections)
}
/// Return the plain text of `s` with every HTML tag removed.
///
/// `s` is parsed as an HTML fragment and the text pieces yielded by the
/// fragment's root element are concatenated in order.
fn strip_html(s: &str) -> String {
    let parsed = Html::parse_fragment(s);
    let mut plain = String::new();
    for piece in parsed.root_element().text() {
        plain.push_str(piece);
    }
    plain
}
/// Extract the label from a section header of the form `[Label]`.
///
/// Surrounding whitespace is ignored. Returns `None` when the trimmed input is
/// not wrapped in square brackets or when nothing sits between them (`[]`).
fn extract_section_label(s: &str) -> Option<String> {
    s.trim()
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
        .filter(|label| !label.is_empty())
        .map(str::to_owned)
}
/// Parse a chord line (raw HTML) into chord positions.
/// Walks text nodes and span[data-name] elements in order to compute offsets.
fn parse_chord_line(line_html: &str) -> Vec<ChordPosition> {
let frag = Html::parse_fragment(line_html);
let root = frag.root_element();
let mut chords = Vec::new();
let mut offset = 0usize;
for child in root.children() {
use scraper::node::Node;
match child.value() {
Node::Text(text) => {
offset += text.chars().count();
}
Node::Element(el) => {
if el.name() == "span" {
if let Some(chord_name) = el.attr("data-name") {
if let Some(chord) = Chord::parse(chord_name) {
chords.push(ChordPosition { offset, chord });
offset += chord_name.chars().count();
}
}
}
}
_ => {}
}
}
chords
} }
} }
@@ -124,4 +232,33 @@ mod tests {
let song = parser.parse(&html).unwrap(); let song = parser.parse(&html).unwrap();
assert_eq!(song.meta.capo, None); assert_eq!(song.meta.capo, None);
} }
// The sample tab must yield at least three sections, the first being a chorus.
#[test]
fn parses_sections() {
    let html = sample_html("A DROP IN THE OCEAN.html");
    let parser = UgHtmlParser;
    let song = parser.parse(&html).unwrap();

    let section_count = song.sections.len();
    assert!(
        section_count >= 3,
        "expected >=3 sections, got {}",
        section_count
    );

    let first_kind = &song.sections[0].kind;
    assert_eq!(*first_kind, domain::SectionKind::Chorus);
}
// The first line of the first section carries the lyric "A drop in the ocean,"
// and its first chord is Em. Only the chord name is asserted here; the offset
// itself is not checked.
#[test]
fn parses_chord_positions() {
    let html = sample_html("A DROP IN THE OCEAN.html");
    let parser = UgHtmlParser;
    let song = parser.parse(&html).unwrap();

    let opening_line = &song.sections[0].lines[0];
    assert_eq!(opening_line.text, "A drop in the ocean,");

    let leading_chord = &opening_line.chords[0].chord;
    assert!(
        leading_chord.name(true) == "Em",
        "expected Em chord, got {}",
        leading_chord.name(true)
    );
}
} }