diff --git a/html5test/src/main.rs b/html5test/src/main.rs index f1d9048..d088f63 100644 --- a/html5test/src/main.rs +++ b/html5test/src/main.rs @@ -1,161 +1,21 @@ use html5ever::parse_document; -use html5ever::tendril::StrTendril; +// use html5ever::tendril::StrTendril; use html5ever::tendril::TendrilSink; -use markup5ever::interface::Attribute; +// use markup5ever::interface::Attribute; use markup5ever_rcdom::Handle; use markup5ever_rcdom::NodeData; use markup5ever_rcdom::RcDom; use std::borrow::Borrow; -use std::cell::RefCell; +// use std::cell::RefCell; use std::default::Default; -// This go_children/walk is stupid, but I shot myself in the foot by adding -// things after the children, link on links. -// -// So, I'm rethinking this, and I'll, basically redo the same thing Tendril is -// doing by building a tree of elements. -// -// So, say, if we have -// -//

Text textthe linkother text, for italics,

is for code
-//

-// -// That would build -// root --------------------------\ -// / | \ Code() -// Text(Text text) | ------ Italic() | -// Link(link) | Text(is for code) -// | Text(for italics) -// Text(the link) -// -// Tree things to do, then: -// 1. Walk the DOM tree and build the text tree. -// 2. Tree elements could return if the DOM three should continue processing or -// ignore incoming children (that would cut the "invisible span" processing, -// for example). -// 3. Build a walker for the new tree, to produce the final text. And, on that, -// we could work on the text wrap, 'cause there are elements that can't be -// wrapped (for example, Links) - -/// Nodes in the text tree -#[derive(Debug)] -enum NodeType { - /// The root element; produces nothing, but has the base content. - Root, - /// A text block. Contains the text itself. - Text(String), - /// A line break - LineBreak, - // /// A link to somewhere. Contains the link. - // Link(String), - // /// Italics - // Italic, - // /// Code block - // Code, - // /// A block with an ellipsis at the end - // Ellipsis, -} - -#[derive(Debug)] -struct Node { - r#type: NodeType, - children: Vec, -} - -impl Node { - /// Build the root node - fn root() -> Self { - Self { - r#type: NodeType::Root, - children: Vec::new(), - } - } - - /// Build a text node - fn text(text: &str) -> Self { - Self { - r#type: NodeType::Text(text.into()), - children: Vec::new(), - } - } - - /// Build a linebreak node - fn line_break() -> Self { - Self { - r#type: NodeType::LineBreak, - children: Vec::new(), +/// Continue the walk by recursing into every child of the given node +macro_rules! 
keep_going { + ($source:ident, $target:ident) => { + for child in $source.children.borrow().iter() { + walk(child.borrow(), $target); } - } - - // /// Build a link node - // fn link(href: &str) -> Self { - // Self { - // r#type: NodeType::Link(href.into()), - // children: Vec::new(), - // } - // } - - // /// Build a ellipsis node - // fn ellipsis() -> Self { - // Self { - // r#type: NodeType::Ellipsis, - // children: Vec::new(), - // } - // } - - /// Add a child node to this node - fn add_child(&mut self, node: Node) { - self.children.push(node); - } -} - -// Handle functions can return a three state result: -// 1. Do not process the children of the current Handle -// 2. Process the children and add to the same parent -// 3. Use the new Node as parent for future children. - -/// Result of the handling functions -enum HandleResult { - /// Stop processing, don't continue generating nodes - Stop, - /// Follow the children, but don't add any nodes in the current level - Follow, - // /// Produce a new node, but don't attach any children to it - // AddAndStay(Node), - /// Assume a new parent node - AddAndAdopt(Node), -} - -/// Handle a simple block of text -fn handle_text(node: &mut Node, contents: &RefCell) -> HandleResult { - let text = contents.borrow().to_string(); - node.add_child(Node::text(&text)); - HandleResult::Stop -} - -/// Handle an incoming line break -fn handle_line_break() -> HandleResult { - let line_break = Node::line_break(); - HandleResult::AddAndAdopt(line_break) -} - -/// Process the span content -fn handle_span(attrs: &RefCell>) -> HandleResult { - let attrs = attrs.borrow(); - let classes_attr = attrs - .iter() - .find(|attr| attr.name.local.to_string() == "class"); - match classes_attr { - Some(classes) => { - if classes.value.contains("invisible") { - HandleResult::Stop - } else { - HandleResult::Follow - } - } - None => HandleResult::Follow, - } + }; } // fn handle_anchor(node: &mut Node, attrs: &RefCell>) -> HandleResult { @@ -180,10 +40,13 @@ 
fn handle_span(attrs: &RefCell>) -> HandleResult { // } // } -fn walk(input: &Handle, parent: &mut Node) { - // println!(">>> {:?}", input.data); - let element = match input.data { - NodeData::Text { ref contents } => handle_text(parent, contents), +fn walk(input: &Handle, result: &mut String) { + match input.data { + NodeData::Text { ref contents } => { + let text = contents.borrow().to_string(); + result.push_str(&text); + keep_going!(input, result); + } NodeData::Element { ref name, ref attrs, @@ -191,46 +54,66 @@ fn walk(input: &Handle, parent: &mut Node) { } => { let tag = name.local.to_string(); match tag.as_ref() { - "html" | "head" | "body" => HandleResult::Follow, - "p" => handle_line_break(), - "span" => handle_span(attrs), - // "a" => handle_anchor(parent, attrs), - _ => HandleResult::Stop, + "html" | "head" | "body" => keep_going!(input, result), + "p" => { + keep_going!(input, result); + result.push_str("\n"); + } + "span" => { + let attrs = attrs.borrow(); + let classes_attr = attrs + .iter() + .find(|attr| attr.name.local.to_string() == "class"); + match classes_attr { + Some(classes) => { + if !classes.value.contains("invisible") { + keep_going!(input, result); + } + } + None => keep_going!(input, result), + } + } + "a" => { + let attrs = attrs.borrow(); + let rels = attrs + .iter() + .find(|attr| attr.name.local.to_string() == "rel"); + let hrefs = attrs + .iter() + .find(|attr| attr.name.local.to_string() == "href"); + println!("Rels: {:?}, Hrefs: {:?}", rels, hrefs); + match (rels, hrefs) { + (Some(rel), Some(href)) => { + if !rel.value.to_string().contains("tag") { + result.push_str("[["); + result.push_str(&href.value); + result.push_str("]["); + keep_going!(input, result); + result.push_str("]]"); + } else { + keep_going!(input, result); + } + } + _ => keep_going!(input, result), + } + } + _ => {} } } - _ => HandleResult::Follow, // if we can't deal with it, just keep going - }; - - match element { - HandleResult::Stop => {} - 
HandleResult::Follow => { - for child in input.children.borrow().iter() { - walk(child.borrow(), parent); - } - } - // HandleResult::AddAndStay(new_node) => { - // parent.add_child(new_node); - // for child in input.children.borrow().iter() { - // walk(child.borrow(), parent); - // } - // } - HandleResult::AddAndAdopt(mut new_node) => { - for child in input.children.borrow().iter() { - walk(child.borrow(), &mut new_node); - } - parent.add_child(new_node); + _ => { + keep_going!(input, result); } - } + }; } -fn build_nodes(text: &str) { +fn build_nodes(source: &str) { let dom = parse_document(RcDom::default(), Default::default()) .from_utf8() - .read_from(&mut text.as_bytes()) + .read_from(&mut source.as_bytes()) .unwrap(); - let mut tree = Node::root(); - walk(&dom.document, &mut tree); - println!("Tree: {:?}", tree); + let mut result = String::new(); + walk(&dom.document, &mut result); + println!("Result: {:?}", result); } fn main() { @@ -242,6 +125,11 @@ fn main() { ); build_nodes(&example_2); + let example_3 = String::from( + r#"

@This is a mention and #this is a tag

"#, + ); + build_nodes(&example_3); + // let example_1 = String::from( // r#"

Today I finally moved with my contact and calendar management into the terminal with #vdirsyncer #khal and #khard.

Thank you @hund for your great post: hund.tty1.se/2020/08/12/how-to

#carddav #caldav #terminal

"#, // );