Worked on table parsing

This commit is contained in:
Lukas Wölfer
2025-10-04 00:19:37 +02:00
parent 28c4f4595e
commit 965d1560f7
3 changed files with 107 additions and 73 deletions

View File

@@ -36,8 +36,8 @@ impl TryFrom<&str> for DanceRole {
fn try_from(value: &str) -> Result<Self, Self::Error> { fn try_from(value: &str) -> Result<Self, Self::Error> {
match value.to_lowercase().as_str() { match value.to_lowercase().as_str() {
"leader" => Ok(DanceRole::Leader), "leader" => Ok(Self::Leader),
"follower" => Ok(DanceRole::Follower), "follower" => Ok(Self::Follower),
_ => Err(ParseDanceRoleError), _ => Err(ParseDanceRoleError),
} }
} }

View File

@@ -1,11 +1,11 @@
use std::collections::HashMap; use std::collections::HashMap;
use reqwest::ClientBuilder;
use crate::{ use crate::{
app_signature, app_signature,
dance_info::{CompState, DanceInfo, DanceRank, DanceRole}, dance_info::{CompState, DanceInfo, DanceRank, DanceRole},
}; };
use reqwest::ClientBuilder;
mod scoringdance;
pub async fn fetch_wsdc_info(id: u32) -> Result<DanceInfo, DanceInfoError> { pub async fn fetch_wsdc_info(id: u32) -> Result<DanceInfo, DanceInfoError> {
let client = ClientBuilder::new() let client = ClientBuilder::new()
@@ -36,75 +36,6 @@ pub async fn fetch_wsdc_info(id: u32) -> Result<DanceInfo, DanceInfoError> {
Ok(x.into()) Ok(x.into())
} }
use scraper::{Html, Selector};
/// Extracts every "card" from `html` as a `(title, key/value pairs)` entry.
///
/// A card is any `div` with a direct `div.card-header` child. The title is the
/// trimmed text of the first `div.card-header` inside the card; text nodes are
/// concatenated directly, without a separator. The pairs come from the first
/// `div.card-body > table` inside the card: for each `tr`, the trimmed text of
/// its first `th` is the key and the trimmed text of its first `td` the value.
///
/// Cards without a matching table are skipped, as are rows that lack either a
/// `th` or a `td`.
///
/// # Panics
/// Only if one of the hard-coded selectors fails to parse.
fn extract_tables(html: &str) -> Vec<(String, Vec<(String, String)>)> {
    let document = Html::parse_document(html);
    let card_selector = Selector::parse("div:has( > div.card-header)").unwrap();
    let title_selector = Selector::parse("div.card-header").unwrap();
    let table_selector = Selector::parse("div.card-body > table").unwrap();
    let row_selector = Selector::parse("tr").unwrap();
    let header_selector = Selector::parse("th").unwrap();
    let cell_selector = Selector::parse("td").unwrap();

    document
        .select(&card_selector)
        .filter_map(|card| {
            // A card without a table carries no key/value data; skip it
            // instead of panicking on arbitrary markup.
            let table = card.select(&table_selector).next()?;
            let title = card
                .select(&title_selector)
                .next()?
                .text()
                .collect::<String>()
                .trim()
                .to_owned();

            // Pairs are collected per card so they stay attached to its title.
            let pairs = table
                .select(&row_selector)
                .filter_map(|row| {
                    let header = row.select(&header_selector).next()?;
                    let cell = row.select(&cell_selector).next()?;
                    let key = header.text().collect::<String>().trim().to_string();
                    let value = cell.text().collect::<String>().trim().to_string();
                    Some((key, value))
                })
                .collect::<Vec<_>>();

            Some((title, pairs))
        })
        .collect()
}
/// Runs [`extract_tables`] on `html`.
///
/// Nothing is done with the extracted tables yet; the binding is underscored
/// to make that explicit.
fn parse_table(html: &str) {
    let _tables = extract_tables(html);
}
/// Feeds the HTML fixture embedded from `../../polina.html` through
/// [`parse_table`]; the test passes as long as the call does not panic.
#[test]
fn test_parse_table() {
    let fixture: &str = include_str!("../../polina.html");
    parse_table(fixture);
}
/// Requests the scoring.dance registry page for `id` and converts the response
/// into a [`DanceInfo`].
///
/// # Errors
/// - [`DanceInfoError::ClientBuild`] if the HTTP client cannot be built.
/// - [`DanceInfoError::RequestBuild`] if the GET request cannot be built.
/// - [`DanceInfoError::Request`] if executing the request fails.
/// - [`DanceInfoError::JsonParse`] if the body cannot be deserialized.
pub async fn fetch_wsdc_info_scoring_dance(id: u32) -> Result<DanceInfo, DanceInfoError> {
    let http = ClientBuilder::new()
        .user_agent(app_signature())
        .build()
        .map_err(DanceInfoError::ClientBuild)?;

    let page_url = format!("https://scoring.dance/enUS/wsdc/registry/{id}.html");
    let get_request = http
        .get(page_url)
        .build()
        .map_err(DanceInfoError::RequestBuild)?;

    // NOTE(review): the URL ends in `.html`, yet the body is decoded as JSON
    // into `DanceInfoParser` — confirm this is intended rather than a
    // placeholder until the HTML table parsing is wired in.
    let parsed: DanceInfoParser = http
        .execute(get_request)
        .await
        .map_err(DanceInfoError::Request)?
        .json()
        .await
        .map_err(DanceInfoError::JsonParse)?;

    Ok(parsed.into())
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
#![allow(clippy::unwrap_used, reason = "Allow unwrap in tests")] #![allow(clippy::unwrap_used, reason = "Allow unwrap in tests")]

View File

@@ -0,0 +1,103 @@
use reqwest::ClientBuilder;
use scraper::{ElementRef, Html, Selector};
use crate::{
app_signature,
dance_info::DanceInfo,
worldsdc::{DanceInfoError, DanceInfoParser},
};
/// Splits one card element into its title and the cell texts of its table.
///
/// * The title is the trimmed, concatenated text of the first
///   `div.card-header` found inside `t`.
/// * The rows come from the first `div.card-body > table` inside `t`: one
///   inner `Vec` per `tr`, holding the trimmed text of every `th`/`td` in that
///   row, in document order.
///
/// A card without a header yields an empty title and a card without a table
/// yields an empty row list, so unexpected markup does not abort parsing.
///
/// # Panics
/// Only if one of the hard-coded selectors fails to parse.
fn parse_card(t: ElementRef) -> (String, Vec<Vec<String>>) {
    let title_selector =
        Selector::parse("div.card-header").expect("hard-coded title selector must parse");
    let table_selector =
        Selector::parse("div.card-body > table").expect("hard-coded table selector must parse");
    let row_selector = Selector::parse("tr").expect("hard-coded row selector must parse");
    let cell_selector = Selector::parse("th,td").expect("hard-coded cell selector must parse");

    let title = t
        .select(&title_selector)
        .next()
        .map(|header| header.text().collect::<String>().trim().to_owned())
        .unwrap_or_default();

    let parsed_table = t
        .select(&table_selector)
        .next()
        .map(|table| {
            table
                .select(&row_selector)
                .map(|row| {
                    row.select(&cell_selector)
                        .map(|cell| cell.text().collect::<String>().trim().to_string())
                        .collect::<Vec<_>>()
                })
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    (title, parsed_table)
}
/// Looks up the first and last name in the rows of the details table and
/// prints both with `dbg!`.
///
/// For each name, the first row whose first cell contains `"first name"` /
/// `"last name"` (compared in lowercase) is picked, and the value is that
/// row's last cell.
///
/// # Panics
/// If no row matches either label.
fn parse_details(d: &[Vec<String>]) {
    /// Returns the last cell of the first row whose first cell, lowercased,
    /// contains `label`.
    fn value_for<'a>(rows: &'a [Vec<String>], label: &str) -> &'a String {
        rows.iter()
            .find(|row| {
                row.first()
                    .is_some_and(|head| head.to_lowercase().contains(label))
            })
            .unwrap()
            .last()
            .unwrap()
    }

    let first_name = value_for(d, "first name");
    let last_name = value_for(d, "last name");
    dbg!(first_name, last_name);
}
/// Groups the rows of the stats table into chapters and prints them with
/// `dbg!`.
///
/// A row with exactly one cell opens a new chapter (presumably a section
/// heading — confirm against the fixture); any other row stays in the chapter
/// of the row before it.
fn parse_stats(d: &[Vec<String>]) {
    // `chunk_by` keeps two neighbouring rows in the same chunk while the
    // predicate returns true; only the second row of each pair is inspected.
    let continues_chapter = |_prev: &Vec<String>, next: &Vec<String>| next.len() != 1;
    let chapters = d.chunk_by(continues_chapter);
    dbg!(chapters.collect::<Vec<_>>());
}
fn extract_tables(html: &str) -> Vec<(String, Vec<Vec<String>>)> {
let document = Html::parse_document(html);
let card_selector = Selector::parse("div:has( > div.card-header)").unwrap();
document.select(&card_selector).map(parse_card).collect()
}
/// Extracts the card tables from `html`, picks the "detail" and "stats" cards
/// by title, and passes them to [`parse_details`] and [`parse_stats`].
///
/// A card matches when its lowercased title contains the keyword; the first
/// match wins. The results of both calls are echoed with `dbg!`.
///
/// # Panics
/// If no card title contains `"detail"` or none contains `"stats"`.
fn parse_info(html: &str) {
    /// Returns the rows of the first table whose lowercased title contains
    /// `needle`.
    fn rows_of<'a>(
        tables: &'a [(String, Vec<Vec<String>>)],
        needle: &str,
    ) -> &'a Vec<Vec<String>> {
        let (_, rows) = tables
            .iter()
            .find(|(title, _)| title.to_lowercase().contains(needle))
            .unwrap();
        rows
    }

    let tables = extract_tables(html);
    // The details lookup runs first, so a page missing both cards fails on
    // the details card.
    let details = rows_of(&tables, "detail");
    let stats = rows_of(&tables, "stats");
    dbg!(parse_stats(&stats));
    dbg!(parse_details(&details));
}
/// Feeds the HTML fixture embedded from `../../polina.html` through
/// [`parse_info`]; the test passes as long as the call does not panic.
#[test]
fn test_parse_table() {
    let fixture: &str = include_str!("../../polina.html");
    parse_info(fixture);
}
/// Requests the scoring.dance registry page for `id` and converts the response
/// into a [`DanceInfo`].
///
/// # Errors
/// - [`DanceInfoError::ClientBuild`] if the HTTP client cannot be built.
/// - [`DanceInfoError::RequestBuild`] if the GET request cannot be built.
/// - [`DanceInfoError::Request`] if executing the request fails.
/// - [`DanceInfoError::JsonParse`] if the body cannot be deserialized.
pub async fn fetch_wsdc_info_scoring_dance(id: u32) -> Result<DanceInfo, DanceInfoError> {
    let http = ClientBuilder::new()
        .user_agent(app_signature())
        .build()
        .map_err(DanceInfoError::ClientBuild)?;

    let page_url = format!("https://scoring.dance/enUS/wsdc/registry/{id}.html");
    let get_request = http
        .get(page_url)
        .build()
        .map_err(DanceInfoError::RequestBuild)?;

    // NOTE(review): the URL ends in `.html`, yet the body is decoded as JSON
    // into `DanceInfoParser` — confirm this is intended rather than a
    // placeholder until the HTML parsing in this module is wired in.
    let parsed: DanceInfoParser = http
        .execute(get_request)
        .await
        .map_err(DanceInfoError::Request)?
        .json()
        .await
        .map_err(DanceInfoError::JsonParse)?;

    Ok(parsed.into())
}