From 893f551b68200365b5dae9fe05937cc12ba74a2c Mon Sep 17 00:00:00 2001 From: Julio Biason Date: Fri, 23 Apr 2021 13:22:21 -0300 Subject: [PATCH] Now with some parsing (and a lot of debug messages) --- html5test/src/main.rs | 118 +++++++++++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 30 deletions(-) diff --git a/html5test/src/main.rs b/html5test/src/main.rs index cf07c42..5b9793c 100644 --- a/html5test/src/main.rs +++ b/html5test/src/main.rs @@ -1,44 +1,102 @@ use html5ever::parse_document; use html5ever::tendril::TendrilSink; +use markup5ever_rcdom::Handle; +use markup5ever_rcdom::NodeData; use markup5ever_rcdom::RcDom; +use std::borrow::Borrow; use std::default::Default; +fn go_children(input: &Handle, result: &mut String) { + for child in input.children.borrow().iter() { + walk(child.borrow(), result); + } +} + +fn walk(input: &Handle, result: &mut String) { + println!(">>> {:?}", input.data); + match input.data { + NodeData::Text { ref contents } => { + let text = contents.borrow().to_string(); + println!("Text: {:?}", text); + result.push_str(&text); + } + NodeData::Element { ref name, .. } => { + let tag = name.local.to_string(); + println!("Tag: {:?}", tag); + match tag.as_ref() { + "html" | "head" | "body" => { + println!("\tIgnored tag"); + go_children(input, result); + } + "p" => { + println!("\tParagraph"); + result.push_str("\n\n"); + go_children(input, result); + } + "span" => { + println!("\tSpan"); + if let NodeData::Element { ref attrs, .. } = input.data { + let attrs = attrs.borrow(); + let classes = attrs + .iter() + .find(|attr| attr.name.local.to_string() == "class"); + if let Some(class) = classes { + if !class.value.to_string().contains("invisible") { + go_children(input, result); + } + } else { + go_children(input, result); + } + } + } + "a" => { + println!("\tAnchor"); + if let NodeData::Element { ref attrs, .. } = input.data { + let attrs = attrs.borrow(); + let rels = attrs + .iter() + .find(|attr| attr.name.local.to_string() == "rel"); + let hrefs = attrs + .iter() + .find(|attr| attr.name.local.to_string() == "href"); + match (rels, hrefs) { + (Some(rel), Some(href)) => { + if !rel.value.to_string().contains("tag") { + result.push_str("[["); + result.push_str(&href.value.to_string()); + result.push_str("]["); + go_children(input, result); + result.push_str("]]"); + } else { + go_children(input, result); + } + } + _ => {} + } + } + } + _ => {} + } + } + _ => { + go_children(input, result); + } + } +} + fn main() { let source = String::from( - r#"

Today I finally moved with my contact and calendar management into - the terminal with #vdirsyncer - #khal and #khard.

Thank you - @hund for your - great post: hund.tty1.se/2020/08/12/how-to

#carddav - #caldav #terminal

"#, + r#"

Today I finally moved with my contact and calendar management into the terminal with #vdirsyncer #khal and #khard.

Thank you @hund for your great post: hund.tty1.se/2020/08/12/how-to

#carddav #caldav #terminal

"#, ); println!("Source: {}", &source); + println!("---------------------------------"); - let _dom = parse_document(RcDom::default(), Default::default()) + let dom = parse_document(RcDom::default(), Default::default()) .from_utf8() .read_from(&mut source.as_bytes()) .unwrap(); + let mut result = String::new(); + walk(&dom.document, &mut result); + println!("---------------------------------"); + println!("{}", result); }