feat(ug-parser): parse sections, lyrics, and chord positions from UG HTML
This commit is contained in:
@@ -1,4 +1,3 @@
|
||||
#[allow(unused_imports)]
|
||||
use domain::{
|
||||
Chord, ChordPosition, LyricLine, ParseError, Section, SectionKind, Song, SongMeta,
|
||||
TabParserPort,
|
||||
@@ -77,8 +76,117 @@ impl UgHtmlParser {
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_sections(_document: &Html) -> Result<Vec<Section>, ParseError> {
|
||||
Ok(vec![])
|
||||
fn parse_sections(document: &Html) -> Result<Vec<Section>, ParseError> {
|
||||
let pre_sel = Selector::parse("pre").unwrap();
|
||||
let pre = document
|
||||
.select(&pre_sel)
|
||||
.next()
|
||||
.ok_or(ParseError::MissingContent)?;
|
||||
|
||||
let inner_html = pre.inner_html();
|
||||
let raw_lines: Vec<&str> = inner_html.split('\n').collect();
|
||||
|
||||
let mut sections: Vec<Section> = Vec::new();
|
||||
let mut current_section: Option<Section> = None;
|
||||
let mut pending_chords: Vec<ChordPosition> = Vec::new();
|
||||
|
||||
for raw_line in &raw_lines {
|
||||
let text_only = Self::strip_html(raw_line);
|
||||
let trimmed = text_only.trim();
|
||||
|
||||
// Section header: "[Chorus]", "[Verse 1]", etc.
|
||||
if let Some(label) = Self::extract_section_label(trimmed) {
|
||||
if let Some(sec) = current_section.take() {
|
||||
sections.push(sec);
|
||||
}
|
||||
current_section = Some(Section {
|
||||
kind: SectionKind::from_label(&label),
|
||||
label: Some(label),
|
||||
lines: Vec::new(),
|
||||
});
|
||||
pending_chords.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Blank line — flush pending chords if any (chord line with no following lyric)
|
||||
if trimmed.is_empty() {
|
||||
if !pending_chords.is_empty() {
|
||||
if let Some(sec) = current_section.as_mut() {
|
||||
sec.lines.push(LyricLine {
|
||||
text: String::new(),
|
||||
chords: pending_chords.drain(..).collect(),
|
||||
});
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Chord line: contains span elements
|
||||
if raw_line.contains("<span") {
|
||||
pending_chords = Self::parse_chord_line(raw_line);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Lyric line
|
||||
if let Some(sec) = current_section.as_mut() {
|
||||
sec.lines.push(LyricLine {
|
||||
text: trimmed.to_string(),
|
||||
chords: pending_chords.drain(..).collect(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(sec) = current_section {
|
||||
sections.push(sec);
|
||||
}
|
||||
|
||||
Ok(sections)
|
||||
}
|
||||
|
||||
/// Strip all HTML tags, returning plain text.
|
||||
fn strip_html(s: &str) -> String {
|
||||
let frag = Html::parse_fragment(s);
|
||||
frag.root_element().text().collect()
|
||||
}
|
||||
|
||||
/// Extracts the label from a bracketed section header.
///
/// Surrounding whitespace is ignored. Returns the text between the outer
/// brackets when `s` has the form `[Label]` with a non-empty label, and
/// `None` otherwise.
fn extract_section_label(s: &str) -> Option<String> {
    s.trim()
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
        .filter(|label| !label.is_empty())
        .map(str::to_owned)
}
|
||||
|
||||
/// Parse a chord line (raw HTML) into chord positions.
|
||||
/// Walks text nodes and span[data-name] elements in order to compute offsets.
|
||||
fn parse_chord_line(line_html: &str) -> Vec<ChordPosition> {
|
||||
let frag = Html::parse_fragment(line_html);
|
||||
let root = frag.root_element();
|
||||
let mut chords = Vec::new();
|
||||
let mut offset = 0usize;
|
||||
|
||||
for child in root.children() {
|
||||
use scraper::node::Node;
|
||||
match child.value() {
|
||||
Node::Text(text) => {
|
||||
offset += text.chars().count();
|
||||
}
|
||||
Node::Element(el) => {
|
||||
if el.name() == "span" {
|
||||
if let Some(chord_name) = el.attr("data-name") {
|
||||
if let Some(chord) = Chord::parse(chord_name) {
|
||||
chords.push(ChordPosition { offset, chord });
|
||||
offset += chord_name.chars().count();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
chords
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,4 +232,33 @@ mod tests {
|
||||
let song = parser.parse(&html).unwrap();
|
||||
assert_eq!(song.meta.capo, None);
|
||||
}
|
||||
|
||||
/// The sample tab yields at least three sections and the first one is a chorus.
#[test]
fn parses_sections() {
    let html = sample_html("A DROP IN THE OCEAN.html");
    let parser = UgHtmlParser;
    let song = parser.parse(&html).unwrap();

    let section_count = song.sections.len();
    assert!(
        section_count >= 3,
        "expected >=3 sections, got {}",
        section_count
    );

    let opening = &song.sections[0];
    assert_eq!(opening.kind, domain::SectionKind::Chorus);
}
|
||||
|
||||
/// The first lyric line of the sample tab carries an Em chord as its first chord.
#[test]
fn parses_chord_positions() {
    let html = sample_html("A DROP IN THE OCEAN.html");
    let parser = UgHtmlParser;
    let song = parser.parse(&html).unwrap();

    // First section, first line: "A drop in the ocean,".
    // Only the chord name is checked here; its offset is not asserted.
    let opening_line = &song.sections[0].lines[0];
    assert_eq!(opening_line.text, "A drop in the ocean,");

    let first_chord_name = opening_line.chords[0].chord.name(true);
    assert!(
        first_chord_name == "Em",
        "expected Em chord, got {}",
        first_chord_name
    );
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user