Worked on HTML scraping

This commit is contained in:
Lukas Wölfer
2025-10-02 21:43:29 +02:00
parent 95d13370c2
commit 28c4f4595e
4 changed files with 693 additions and 1 deletions

View File

@@ -7,7 +7,6 @@ use crate::{
dance_info::{CompState, DanceInfo, DanceRank, DanceRole},
};
// mod caching;
pub async fn fetch_wsdc_info(id: u32) -> Result<DanceInfo, DanceInfoError> {
let client = ClientBuilder::new()
.user_agent(app_signature())
@@ -37,6 +36,75 @@ pub async fn fetch_wsdc_info(id: u32) -> Result<DanceInfo, DanceInfoError> {
Ok(x.into())
}
use scraper::{Html, Selector};
/// Extracts every "card" table from an HTML document.
///
/// A card is any `div` that has a direct `div.card-header` child. For each
/// card that also contains a `div.card-body > table`, one entry is returned:
///
/// * the card title: the text of the first `div.card-header`, with runs of
///   whitespace collapsed to single spaces, and
/// * the table rows as `(header, value)` pairs, taken from the first `th`
///   and the first `td` of every `tr` that has both (text is trimmed).
///
/// Cards without a matching table are skipped, and rows missing either a
/// `th` or a `td` are ignored, so unexpected markup never causes a panic.
/// Entries are returned in document order.
///
/// # Panics
///
/// Only if one of the hard-coded CSS selectors fails to parse, which would
/// be a programming error rather than a property of the input.
fn extract_tables(html: &str) -> Vec<(String, Vec<(String, String)>)> {
    let document = Html::parse_document(html);

    // The selectors are constant literals, so a parse failure is a bug in
    // this function and not something the caller can recover from.
    let card_selector =
        Selector::parse("div:has( > div.card-header)").expect("card selector is valid CSS");
    let title_selector = Selector::parse("div.card-header").expect("title selector is valid CSS");
    let table_selector =
        Selector::parse("div.card-body > table").expect("table selector is valid CSS");
    let row_selector = Selector::parse("tr").expect("row selector is valid CSS");
    let header_selector = Selector::parse("th").expect("header selector is valid CSS");
    let cell_selector = Selector::parse("td").expect("cell selector is valid CSS");

    document
        .select(&card_selector)
        .filter_map(|card| {
            // Cards that carry no table hold nothing to extract; skip them
            // instead of panicking.
            let table = card.select(&table_selector).next()?;
            let header = card.select(&title_selector).next()?;

            // Header text is usually split over several text nodes padded
            // with indentation; collapse it into one clean line.
            let raw_title: String = header.text().collect();
            let title = raw_title.split_whitespace().collect::<Vec<_>>().join(" ");

            let pairs: Vec<(String, String)> = table
                .select(&row_selector)
                .filter_map(|row| {
                    let key_cell = row.select(&header_selector).next()?;
                    let value_cell = row.select(&cell_selector).next()?;
                    let key = key_cell.text().collect::<String>().trim().to_owned();
                    let value = value_cell.text().collect::<String>().trim().to_owned();
                    Some((key, value))
                })
                .collect();

            Some((title, pairs))
        })
        .collect()
}
/// Runs [`extract_tables`] over `html` and discards the result.
///
/// This is currently a stub: the extracted tables are not interpreted or
/// returned yet.
fn parse_table(html: &str) {
    // Underscore-prefixed on purpose: the value is not consumed yet.
    let _tables = extract_tables(html);
}
/// Smoke test: feeds the `polina.html` fixture (embedded at compile time,
/// path relative to this source file) through [`parse_table`].
///
/// There are no assertions; the test only checks that parsing completes
/// without panicking.
#[test]
fn test_parse_table() {
    let fixture = include_str!("../../polina.html");
    parse_table(fixture);
}
/// Fetches the scoring.dance registry page for the dancer with the given
/// WSDC `id` and converts the response into a [`DanceInfo`].
///
/// The request is a plain GET sent with the application's user agent
/// (`app_signature()`).
///
/// # Errors
///
/// * [`DanceInfoError::ClientBuild`] if the HTTP client cannot be built.
/// * [`DanceInfoError::RequestBuild`] if the request cannot be built.
/// * [`DanceInfoError::Request`] if executing the request fails.
/// * [`DanceInfoError::JsonParse`] if the body cannot be decoded as JSON.
pub async fn fetch_wsdc_info_scoring_dance(id: u32) -> Result<DanceInfo, DanceInfoError> {
    let http = ClientBuilder::new()
        .user_agent(app_signature())
        .build()
        .map_err(DanceInfoError::ClientBuild)?;

    let registry_url = format!("https://scoring.dance/enUS/wsdc/registry/{id}.html");
    let get_request = http
        .request(reqwest::Method::GET, registry_url)
        .build()
        .map_err(DanceInfoError::RequestBuild)?;

    // NOTE(review): the URL ends in `.html`, yet the body is decoded as JSON.
    // Presumably this should go through the HTML table extraction instead --
    // confirm before relying on this function.
    let parsed = http
        .execute(get_request)
        .await
        .map_err(DanceInfoError::Request)?
        .json::<DanceInfoParser>()
        .await
        .map_err(DanceInfoError::JsonParse)?;

    Ok(parsed.into())
}
#[cfg(test)]
mod tests {
#![allow(clippy::unwrap_used, reason = "Allow unwrap in tests")]