diff --git a/html5test/Cargo.lock b/html5test/Cargo.lock index 1288027..550cacc 100644 --- a/html5test/Cargo.lock +++ b/html5test/Cargo.lock @@ -46,6 +46,7 @@ name = "html5test" version = "0.1.0" dependencies = [ "html5ever", + "markup5ever", "markup5ever_rcdom", "textwrap", ] diff --git a/html5test/Cargo.toml b/html5test/Cargo.toml index 5dc1170..3879a9d 100644 --- a/html5test/Cargo.toml +++ b/html5test/Cargo.toml @@ -9,4 +9,5 @@ edition = "2018" [dependencies] html5ever = "0.25" markup5ever_rcdom = "0.1" +markup5ever = "0.10" textwrap = "0.13" diff --git a/html5test/src/main.rs b/html5test/src/main.rs index 33dc812..f1d9048 100644 --- a/html5test/src/main.rs +++ b/html5test/src/main.rs @@ -8,9 +8,6 @@ use markup5ever_rcdom::RcDom; use std::borrow::Borrow; use std::cell::RefCell; use std::default::Default; -use textwrap::fill; -use textwrap::NoHyphenation; -use textwrap::Options; // This go_children/walk is stupid, but I shot myself in the foot by adding // things after the children, link on links. @@ -42,21 +39,25 @@ use textwrap::Options; // wrapped (for example, Links) /// Nodes in the text tree +#[derive(Debug)] enum NodeType { /// The root element; produces nothing, but has the base content. Root, /// A text block. Contains the text itself. Text(String), - /// A link to somewhere. Contains the link. - Link(String), - /// Italics - Italic, - /// Code block - Code, /// A line break LineBreak, + // /// A link to somewhere. Contains the link. + // Link(String), + // /// Italics + // Italic, + // /// Code block + // Code, + // /// A block with an ellipsis at the end + // Ellipsis, } +#[derive(Debug)] struct Node { r#type: NodeType, children: Vec, @@ -75,14 +76,6 @@ impl Node { fn text(text: &str) -> Self { Self { r#type: NodeType::Text(text.into()), - children: Vec::new(), // XXX text nodes will never have children - } - } - - /// Build a link node - fn link(href: &str) -> Self { - Self { - r#type: NodeType::Link(href.into()), children: Vec::new(), } } @@ -91,53 +84,105 @@ impl Node { fn line_break() -> Self { Self { r#type: NodeType::LineBreak, - children: Vec::new(), // XXX linebreaks will never have children + children: Vec::new(), } } + // /// Build a link node + // fn link(href: &str) -> Self { + // Self { + // r#type: NodeType::Link(href.into()), + // children: Vec::new(), + // } + // } + + // /// Build a ellipsis node + // fn ellipsis() -> Self { + // Self { + // r#type: NodeType::Ellipsis, + // children: Vec::new(), + // } + // } + /// Add a child node to this node fn add_child(&mut self, node: Node) { self.children.push(node); } } -fn handle_text(node: &mut Node, contents: &RefCell) -> bool { +// Handle functions can return a three state result: +// 1. Do not process the children of the current Handle +// 2. Process the children and add to the same parent +// 3. Use the new Node as parent for future children. + +/// Result of the handling functions +enum HandleResult { + /// Stop processing, don't continue generating nodes + Stop, + /// Follow the children, but don't add any nodes in the current level + Follow, + // /// Produce a new node, but don't attach any children to it + // AddAndStay(Node), + /// Assume a new parent node + AddAndAdopt(Node), +} + +/// Handle a simple block of text +fn handle_text(node: &mut Node, contents: &RefCell) -> HandleResult { let text = contents.borrow().to_string(); node.add_child(Node::text(&text)); - true + HandleResult::Stop } -fn handle_line_break(node: &mut Node) -> bool { - node.add_child(Node::line_break()); - true +/// Handle an incoming line break +fn handle_line_break() -> HandleResult { + let line_break = Node::line_break(); + HandleResult::AddAndAdopt(line_break) } -fn handle_span(node: &mut Node, attrs: &RefCell>) -> bool { +/// Process the span content +fn handle_span(attrs: &RefCell>) -> HandleResult { let attrs = attrs.borrow(); - let classes = attrs + let classes_attr = attrs .iter() .find(|attr| attr.name.local.to_string() == "class"); - if let Some(class) = classes { - let classes = class.value.to_string(); - // just keep going if not invisible - !classes.contains("invisible") - - // if !classes.contains("invisible") { - // true - // if classes.contains("ellipsis") { - // result.push_str("..."); - // } - // } - } else { - // with no classes, we consider the element visible and just keep - // processing the list. - true + match classes_attr { + Some(classes) => { + if classes.value.contains("invisible") { + HandleResult::Stop + } else { + HandleResult::Follow + } + } + None => HandleResult::Follow, } } +// fn handle_anchor(node: &mut Node, attrs: &RefCell>) -> HandleResult { +// let attrs = attrs.borrow(); +// let rels = attrs +// .iter() +// .find(|attr| attr.name.local.to_string() == "rel"); +// let hrefs = attrs +// .iter() +// .find(|attr| attr.name.local.to_string() == "href"); +// match (rels, hrefs) { +// (Some(rel), Some(href)) => { +// if !rel.value.to_string().contains("tag") { +// let new_node = Node::link(&href.value); +// node.add_child(new_node); +// HandleResult::NewNode(new_node) +// } else { +// HandleResult::Keep +// } +// } +// _ => HandleResult::Stop, +// } +// } + fn walk(input: &Handle, parent: &mut Node) { - println!(">>> {:?}", input.data); - let process_children = match input.data { + // println!(">>> {:?}", input.data); + let element = match input.data { NodeData::Text { ref contents } => handle_text(parent, contents), NodeData::Element { ref name, @@ -145,67 +190,68 @@ fn walk(input: &Handle, parent: &mut Node) { .. } => { let tag = name.local.to_string(); - println!("Tag: {:?}", tag); match tag.as_ref() { - "html" | "head" | "body" => true, // just keep going - "p" => handle_line_break(parent), - "span" => handle_span(parent, attrs), - "a" => { - println!("\tAnchor"); - if let NodeData::Element { ref attrs, .. } = input.data { - let attrs = attrs.borrow(); - let rels = attrs - .iter() - .find(|attr| attr.name.local.to_string() == "rel"); - let hrefs = attrs - .iter() - .find(|attr| attr.name.local.to_string() == "href"); - match (rels, hrefs) { - (Some(rel), Some(href)) => { - if !rel.value.to_string().contains("tag") { - result.push_str("[["); - result.push_str(&href.value.to_string()); - result.push_str("]["); - go_children(input, result); - result.push_str("]]"); - } else { - go_children(input, result); - } - } - _ => {} - } - } - } - _ => false, + "html" | "head" | "body" => HandleResult::Follow, + "p" => handle_line_break(), + "span" => handle_span(attrs), + // "a" => handle_anchor(parent, attrs), + _ => HandleResult::Stop, } } - _ => true, // if we can't deal with it, just keep going + _ => HandleResult::Follow, // if we can't deal with it, just keep going }; - if process_children { - for child in input.children.borrow().iter() { - walk(child.borrow(), parent); + match element { + HandleResult::Stop => {} + HandleResult::Follow => { + for child in input.children.borrow().iter() { + walk(child.borrow(), parent); + } + } + // HandleResult::AddAndStay(new_node) => { + // parent.add_child(new_node); + // for child in input.children.borrow().iter() { + // walk(child.borrow(), parent); + // } + // } + HandleResult::AddAndAdopt(mut new_node) => { + for child in input.children.borrow().iter() { + walk(child.borrow(), &mut new_node); + } + parent.add_child(new_node); } } } -fn main() { - let source = String::from( - r#"

Today I finally moved with my contact and calendar management into the terminal with #vdirsyncer #khal and #khard.

Thank you @hund for your great post: hund.tty1.se/2020/08/12/how-to

#carddav #caldav #terminal

"#, - ); - println!("Source: {}", &source); - println!("---------------------------------"); - +fn build_nodes(text: &str) { let dom = parse_document(RcDom::default(), Default::default()) .from_utf8() - .read_from(&mut source.as_bytes()) + .read_from(&mut text.as_bytes()) .unwrap(); let mut tree = Node::root(); - walk(&dom.document, &mut result); - println!("---------------------------------"); - let options = Options::new(70) - .initial_indent(" ") - .subsequent_indent(" ") - .splitter(NoHyphenation); - println!("{}", fill(&result.trim(), &options)); + walk(&dom.document, &mut tree); + println!("Tree: {:?}", tree); +} + +fn main() { + let example_1 = String::from(r#"

A simple text component

"#); + build_nodes(&example_1); + + let example_2 = String::from( + r#"

but this is

"#, + ); + build_nodes(&example_2); + + // let example_1 = String::from( + // r#"

Today I finally moved with my contact and calendar management into the terminal with #vdirsyncer #khal and #khard.

Thank you @hund for your great post: hund.tty1.se/2020/08/12/how-to

#carddav #caldav #terminal

"#, + // ); + // println!("Source: {}", &example_1); + // println!("---------------------------------"); + + // let dom = parse_document(RcDom::default(), Default::default()) + // .from_utf8() + // .read_from(&mut example_1.as_bytes()) + // .unwrap(); + // let mut tree = Node::root(); + // walk(&dom.document, &mut tree); }