Made parsoid work

This commit is contained in:
Lukas Wölfer
2025-07-21 01:53:23 +02:00
parent 7c3fc0a2ca
commit 7ce728ef8e
4 changed files with 740 additions and 196 deletions

View File

@@ -1,13 +1,16 @@
use mwbot::{
Bot,
generators::{CategoryMemberSort, Generator, SortDirection},
parsoid::{self, WikinodeIterator},
generators::{Generator, SortDirection, categories::CategoryMemberSort},
};
use std::{error::Error, path::Path};
use crate::old_style::get_description;
mod old_style;
fn list_teacher_pages(bot: &Bot) -> tokio::sync::mpsc::Receiver<Result<mwbot::Page, mwbot::Error>> {
let category_title = "Category:Teachers";
let pages = mwbot::generators::CategoryMembers::new(category_title)
let pages = mwbot::generators::categories::CategoryMembers::new(category_title)
.dir(SortDirection::Descending)
.sort(CategoryMemberSort::Timestamp);
@@ -17,18 +20,18 @@ fn list_teacher_pages(bot: &Bot) -> tokio::sync::mpsc::Receiver<Result<mwbot::Pa
}
async fn print_teachers(bot: &Bot) {
while let Some(page) = list_teacher_pages(bot).recv().await {
let mut v = list_teacher_pages(bot);
while let Some(page) = v.recv().await {
let p = page.unwrap();
println!("- {}", p.as_title().dbkey());
print_wsdc_id(bot, &p).await;
}
}
async fn print_wsdc_id(bot: &Bot, page: &mwbot::Page) {
println!("{}", page.wikitext().await.unwrap());
let x = page.html().await.unwrap().into_mutable();
for w in &x.filter_external_links() {
println!("{}", w.text_contents());
println!(
"- {} [{}]",
p.as_title().dbkey(),
old_style::get_wsdc_id(bot, &p)
.await
.map(|x| x.to_string())
.unwrap_or("Unknown".to_string())
);
get_description(bot, &p).await;
}
}

47
src/old_style.rs Normal file
View File

@@ -0,0 +1,47 @@
use mwbot::{Bot, parsoid::WikinodeIterator as _};
/// Extracts a numeric ID from a URL containing a `<number>.html` path segment.
///
/// The URL is split on `/`. Every segment that ends in `.html` has that suffix
/// removed (exactly once) and the remainder is parsed as a `u32`. The first
/// segment that parses successfully wins.
///
/// Segments ending in `.html` whose stem is not a valid `u32` (for example
/// `index.html`, or a number that overflows `u32`) are skipped instead of
/// aborting the search, so a later `<number>.html` segment is still found.
///
/// Returns `None` when no segment yields a number.
fn extract_number_from_url(url: &str) -> Option<u32> {
    url.split('/')
        // `strip_suffix` removes a single trailing ".html" and filters out
        // every segment that does not carry the suffix at all.
        .filter_map(|segment| segment.strip_suffix(".html"))
        // Keep scanning past non-numeric stems until one parses.
        .find_map(|stem| stem.parse::<u32>().ok())
}
/// Looks up the numeric ID linked from `page`.
///
/// Fetches the page's HTML, walks its external links in the order returned
/// by `filter_external_links()`, and returns the first number that
/// `extract_number_from_url` can pull out of a link target.
///
/// Returns `None` when no external link yields an ID, or when the HTML could
/// not be fetched. A fetch failure is reported on stderr rather than
/// panicking, so one bad page does not abort the caller's whole run.
///
/// `bot` is currently unused; it is kept so the signature stays stable for
/// existing callers.
pub async fn get_wsdc_id(bot: &Bot, page: &mwbot::Page) -> Option<u32> {
    let html = match page.html().await {
        Ok(html) => html.into_mutable(),
        Err(err) => {
            eprintln!(
                "Failed to fetch HTML for {}: {err:?}",
                page.as_title().dbkey()
            );
            return None;
        }
    };

    html.filter_external_links()
        .iter()
        .find_map(|link| extract_number_from_url(&link.target()))
}
/// Work-in-progress helper that inspects the sections of `page`.
///
/// Fetches the page's HTML and iterates over its sections, dumping each
/// section, its heading and the heading text via `dbg!` (stderr). Sections
/// without a heading, or whose heading has no text, are reported on stdout
/// and skipped.
///
/// No description is extracted yet: the function always returns `None`.
/// `bot` is currently unused.
///
/// # Panics
///
/// Panics if fetching the page HTML fails (the result is `unwrap`ped).
pub async fn get_description(bot: &Bot, page: &mwbot::Page) -> Option<String> {
// Fetch the HTML and convert it to the mutable representation.
let x = page.html().await.unwrap().into_mutable();
for w in &x.iter_sections() {
// Debug dump of the whole section.
dbg!(w);
// Skip sections that have no heading.
let Some(h) = w.heading() else {
println!("No heading");
continue;
};
dbg!(&h);
// Skip headings for which `as_text()` yields nothing.
// NOTE(review): presumably this is the case when the heading node is not a plain text node — confirm against the parsoid API.
let Some(t) = h.as_text() else {
println!("No text");
continue;
};
dbg!(t.borrow());
}
// Nothing is collected above, so there is no description to return yet.
None
}