feat(ug-parser): implement UgHtmlParser metadata extraction

This commit is contained in:
2026-04-08 01:44:06 +02:00
parent 36d38b9a78
commit 1a0458653a

View File

@@ -1 +1,127 @@
#[allow(unused_imports)]
use domain::{
Chord, ChordPosition, LyricLine, ParseError, Section, SectionKind, Song, SongMeta,
TabParserPort,
};
use scraper::{Html, Selector};
/// Stateless HTML tab parser; implements `TabParserPort`.
///
/// The type carries no data, so it is cheap to copy and can be created either
/// with the unit literal `UgHtmlParser` or via `Default`.
// NOTE(review): "Ug" presumably stands for Ultimate Guitar — confirm.
#[derive(Debug, Clone, Copy, Default)]
pub struct UgHtmlParser;
impl UgHtmlParser {
    /// Extracts the song metadata (title, artist, capo) from a parsed page.
    ///
    /// Title and artist are derived from the text of the first `<title>`
    /// element (see [`Self::split_title_and_artist`]). The capo is taken from
    /// the `<span>` that immediately follows a span whose trimmed text is
    /// `Capo:`. `original_key`, `tuning` and `tempo` are not extracted and are
    /// always `None`.
    ///
    /// # Errors
    ///
    /// This implementation never returns `Err`: missing data degrades to empty
    /// strings or `None`.
    fn parse_meta(document: &Html) -> Result<SongMeta, ParseError> {
        let title_sel = Selector::parse("title").expect("`title` is a valid CSS selector");
        let raw_title = document
            .select(&title_sel)
            .next()
            .map(|el| el.text().collect::<String>())
            .unwrap_or_default();
        let (title, artist) = Self::split_title_and_artist(&raw_title);

        let span_sel = Selector::parse("span").expect("`span` is a valid CSS selector");
        let mut capo: Option<u8> = None;
        // Set when the previous span was the `Capo:` label, so the current
        // span is interpreted as its value.
        let mut found_capo_label = false;
        for span in document.select(&span_sel) {
            let text = span.text().collect::<String>();
            let text = text.trim();
            if text == "Capo:" {
                found_capo_label = true;
            } else if found_capo_label {
                if text != "No capo" && !text.is_empty() {
                    capo = Self::parse_capo_fret(text);
                }
                found_capo_label = false;
            }
        }

        Ok(SongMeta {
            title,
            artist,
            capo,
            original_key: None,
            tuning: None,
            tempo: None,
        })
    }

    /// Splits a page title into `(title, artist)`.
    ///
    /// The expected shape is `"(N) SONG NAME CHORDS (ver N) by Artist @ Site"`.
    /// The text is split at the last `" by "`; the artist is whatever follows,
    /// up to the first `" @ "`. The song name has an optional `"(N) "` prefix
    /// and the trailing `" CHORDS…"` suffix removed and is then title-cased.
    /// When the text contains no `" by "`, the trimmed input is returned
    /// unchanged as the title together with an empty artist.
    fn split_title_and_artist(raw_title: &str) -> (String, String) {
        const BY: &str = " by ";
        let raw_title = raw_title.trim();
        match raw_title.rfind(BY) {
            None => (raw_title.to_string(), String::new()),
            Some(by_pos) => {
                let after_by = &raw_title[by_pos + BY.len()..];
                let artist = after_by
                    .split(" @ ")
                    .next()
                    .unwrap_or(after_by)
                    .trim()
                    .to_string();
                let name = Self::strip_count_prefix(raw_title[..by_pos].trim());
                // Cut at the LAST " CHORDS" so a song whose own name contains
                // the word keeps everything before the suffix.
                let name = name
                    .rsplit_once(" CHORDS")
                    .map_or(name, |(before, _)| before)
                    .trim();
                (Self::title_case(name), artist)
            }
        }
    }

    /// Removes one leading `"(N) "` prefix, where `N` is a run of ASCII digits.
    ///
    /// Anything else is returned untouched, so titles that legitimately start
    /// with digits (`"99 PROBLEMS"`) or with a parenthesised phrase
    /// (`"(I CAN'T GET NO) SATISFACTION"`) are preserved.
    fn strip_count_prefix(title: &str) -> &str {
        title
            .strip_prefix('(')
            .and_then(|rest| rest.split_once(')'))
            .filter(|(count, _)| !count.is_empty() && count.bytes().all(|b| b.is_ascii_digit()))
            .map_or(title, |(_, rest)| rest.trim_start())
    }

    /// Upper-cases the first character of every whitespace-separated word and
    /// lower-cases the rest; words are re-joined with single spaces.
    fn title_case(text: &str) -> String {
        text.split_whitespace()
            .map(|word| {
                let mut chars = word.chars();
                match chars.next() {
                    None => String::new(),
                    Some(first) => {
                        let mut cased: String = first.to_uppercase().collect();
                        cased.push_str(&chars.as_str().to_lowercase());
                        cased
                    }
                }
            })
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Parses a capo value into a fret number.
    ///
    /// Accepts anything `u8`'s `FromStr` accepts (e.g. `"3"`), and otherwise
    /// falls back to the leading run of ASCII digits so that values carrying a
    /// textual suffix (e.g. `"3rd fret"`) are still understood. Returns `None`
    /// when no number can be read or it does not fit in a `u8`.
    fn parse_capo_fret(text: &str) -> Option<u8> {
        let digits_end = text
            .find(|c: char| !c.is_ascii_digit())
            .unwrap_or(text.len());
        text.parse()
            .ok()
            .or_else(|| text[..digits_end].parse().ok())
    }

    /// Placeholder for section extraction: ignores the document and always
    /// returns an empty list.
    fn parse_sections(_document: &Html) -> Result<Vec<Section>, ParseError> {
        Ok(vec![])
    }
}
impl TabParserPort for UgHtmlParser {
    /// Parses `html` as a full document and assembles a `Song` from its
    /// metadata and sections.
    ///
    /// Metadata is extracted before sections; the first error from either
    /// step is returned to the caller.
    fn parse(&self, html: &str) -> Result<Song, ParseError> {
        let dom = Html::parse_document(html);
        Ok(Song {
            meta: Self::parse_meta(&dom)?,
            sections: Self::parse_sections(&dom)?,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use domain::TabParserPort;

    /// Fixture shared by the tests below.
    const SAMPLE: &str = "A DROP IN THE OCEAN.html";

    /// Reads a fixture from the `samples` directory that sits three levels
    /// above this crate's manifest directory.
    ///
    /// Panics if that ancestor does not exist or the file cannot be read.
    fn sample_html(name: &str) -> String {
        let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
        // `ancestors()` yields the path itself first, so index 3 is the
        // third parent.
        let root = manifest_dir.ancestors().nth(3).unwrap();
        let fixture = root.join(format!("samples/{}", name));
        std::fs::read_to_string(fixture).unwrap()
    }

    #[test]
    fn parses_artist_and_title() {
        let song = UgHtmlParser.parse(&sample_html(SAMPLE)).unwrap();
        assert_eq!(song.meta.artist, "Ron Pope");
        assert_eq!(song.meta.title, "A Drop In The Ocean");
    }

    #[test]
    fn capo_is_none_when_no_capo() {
        let song = UgHtmlParser.parse(&sample_html(SAMPLE)).unwrap();
        assert_eq!(song.meta.capo, None);
    }
}