From f6bb83026ff6dd798f7b277493745745f80df1ba Mon Sep 17 00:00:00 2001
From: Julio Biason
Date: Tue, 27 Apr 2021 13:51:36 -0300
Subject: [PATCH] WIP: Build a simpler element tree
With the simpler tree, it seems it will be easier to generate the org,
without the chance of exploding the call stack.
---
html5test/src/main.rs | 167 ++++++++++++++++++++++++++++++------------
1 file changed, 119 insertions(+), 48 deletions(-)
diff --git a/html5test/src/main.rs b/html5test/src/main.rs
index c2f6ea6..33dc812 100644
--- a/html5test/src/main.rs
+++ b/html5test/src/main.rs
@@ -1,9 +1,12 @@
use html5ever::parse_document;
+use html5ever::tendril::StrTendril;
use html5ever::tendril::TendrilSink;
+use markup5ever::interface::Attribute;
use markup5ever_rcdom::Handle;
use markup5ever_rcdom::NodeData;
use markup5ever_rcdom::RcDom;
use std::borrow::Borrow;
+use std::cell::RefCell;
use std::default::Default;
use textwrap::fill;
use textwrap::NoHyphenation;
@@ -22,10 +25,12 @@ use textwrap::Options;
//
//
// That would build
-// root
-// / | |\Code(is for code)
-// Text(Text text) | Italic(for italics)
-// Link(the link, link)
+// root --------------------------\
+// / | \ Code()
+// Text(Text text) | ------ Italic() |
+// Link(link) | Text(is for code)
+// | Text(for italics)
+// Text(the link)
//
// Tree things to do, then:
// 1. Walk the DOM tree and build the text tree.
@@ -36,53 +41,115 @@ use textwrap::Options;
// we could work on the text wrap, 'cause there are elements that can't be
// wrapped (for example, Links)
-fn go_children(input: &Handle, result: &mut String) {
- for child in input.children.borrow().iter() {
- walk(child.borrow(), result);
+/// Nodes in the text tree
+enum NodeType {
+ /// The root element; produces nothing, but has the base content.
+ Root,
+ /// A text block. Contains the text itself.
+ Text(String),
+ /// A link to somewhere. Contains the link.
+ Link(String),
+ /// Italics
+ Italic,
+ /// Code block
+ Code,
+ /// A line break
+ LineBreak,
+}
+
+struct Node {
+ r#type: NodeType,
+ children: Vec<Node>,
+}
+
+impl Node {
+ /// Build the root node
+ fn root() -> Self {
+ Self {
+ r#type: NodeType::Root,
+ children: Vec::new(),
+ }
+ }
+
+ /// Build a text node
+ fn text(text: &str) -> Self {
+ Self {
+ r#type: NodeType::Text(text.into()),
+ children: Vec::new(), // XXX text nodes will never have children
+ }
+ }
+
+ /// Build a link node
+ fn link(href: &str) -> Self {
+ Self {
+ r#type: NodeType::Link(href.into()),
+ children: Vec::new(),
+ }
+ }
+
+ /// Build a linebreak node
+ fn line_break() -> Self {
+ Self {
+ r#type: NodeType::LineBreak,
+ children: Vec::new(), // XXX linebreaks will never have children
+ }
+ }
+
+ /// Add a child node to this node
+ fn add_child(&mut self, node: Node) {
+ self.children.push(node);
+ }
+}
+
+fn handle_text(node: &mut Node, contents: &RefCell<StrTendril>) -> bool {
+ let text = contents.borrow().to_string();
+ node.add_child(Node::text(&text));
+ true
+}
+
+fn handle_line_break(node: &mut Node) -> bool {
+ node.add_child(Node::line_break());
+ true
+}
+
+fn handle_span(node: &mut Node, attrs: &RefCell<Vec<Attribute>>) -> bool {
+ let attrs = attrs.borrow();
+ let classes = attrs
+ .iter()
+ .find(|attr| attr.name.local.to_string() == "class");
+ if let Some(class) = classes {
+ let classes = class.value.to_string();
+ // just keep going if not invisible
+ !classes.contains("invisible")
+
+ // if !classes.contains("invisible") {
+ // true
+ // if classes.contains("ellipsis") {
+ // result.push_str("...");
+ // }
+ // }
+ } else {
+ // with no classes, we consider the element visible and just keep
+ // processing the list.
+ true
}
}
-fn walk(input: &Handle, result: &mut String) {
+fn walk(input: &Handle, parent: &mut Node) {
println!(">>> {:?}", input.data);
- match input.data {
- NodeData::Text { ref contents } => {
- let text = contents.borrow().to_string();
- println!("Text: {:?}", text);
- result.push_str(&text);
- }
- NodeData::Element { ref name, .. } => {
+ let process_children = match input.data {
+ NodeData::Text { ref contents } => handle_text(parent, contents),
+ NodeData::Element {
+ ref name,
+ ref attrs,
+ ..
+ } => {
let tag = name.local.to_string();
println!("Tag: {:?}", tag);
match tag.as_ref() {
- "html" | "head" | "body" => {
- println!("\tIgnored tag");
- go_children(input, result);
- }
- "p" => {
- println!("\tParagraph");
- result.push_str("\n\n");
- go_children(input, result);
- }
- "span" => {
- println!("\tSpan");
- if let NodeData::Element { ref attrs, .. } = input.data {
- let attrs = attrs.borrow();
- let classes = attrs
- .iter()
- .find(|attr| attr.name.local.to_string() == "class");
- if let Some(class) = classes {
- let classes = class.value.to_string();
- if !classes.contains("invisible") {
- go_children(input, result); // bollocks!
- if classes.contains("ellipsis") {
- result.push_str("...");
- }
- }
- } else {
- go_children(input, result);
- }
- }
- }
+ "html" | "head" | "body" => true, // just keep going
+ "p" => handle_line_break(parent),
+ "span" => handle_span(parent, attrs),
"a" => {
println!("\tAnchor");
if let NodeData::Element { ref attrs, .. } = input.data {
@@ -109,11 +176,15 @@ fn walk(input: &Handle, result: &mut String) {
}
}
}
- _ => {}
+ _ => false,
}
}
- _ => {
- go_children(input, result);
+ _ => true, // if we can't deal with it, just keep going
+ };
+
+ if process_children {
+ for child in input.children.borrow().iter() {
+ walk(child.borrow(), parent);
}
}
}
@@ -129,7 +200,7 @@ fn main() {
.from_utf8()
.read_from(&mut source.as_bytes())
.unwrap();
- let mut result = String::new();
+ let mut tree = Node::root();
walk(&dom.document, &mut result);
println!("---------------------------------");
let options = Options::new(70)