399 lines
14 KiB
Rust
399 lines
14 KiB
Rust
use kuchiki::NodeRef;
|
|
use lazy_static::lazy_static;
|
|
use regex::Regex;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::borrow::Borrow;
|
|
use std::ops::Deref;
|
|
|
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
|
pub struct DenseDocument {
|
|
pub head: DenseHead,
|
|
pub body_content: Vec<DenseTree>,
|
|
pub body_remainder: Vec<DenseTree>,
|
|
}
|
|
|
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
|
pub struct DenseHead {
|
|
pub title: String,
|
|
/// Language of the page. May be empty if not discovered.
|
|
pub language: String,
|
|
/// URL to icon of the page. May be empty if none were discovered.
|
|
pub icon: String,
|
|
}
|
|
|
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
|
pub enum DenseTree {
|
|
Heading1(Vec<DenseTree>),
|
|
Heading2(Vec<DenseTree>),
|
|
Heading3(Vec<DenseTree>),
|
|
Heading4(Vec<DenseTree>),
|
|
Heading5(Vec<DenseTree>),
|
|
Heading6(Vec<DenseTree>),
|
|
Link {
|
|
children: Vec<DenseTree>,
|
|
href: String,
|
|
nofollow: bool,
|
|
},
|
|
Image {
|
|
src: String,
|
|
alt: String,
|
|
// title? I don't know if it'd be very useful.
|
|
},
|
|
Text(String),
|
|
}
|
|
|
|
impl DenseTree {
|
|
pub fn from_body(body_node: NodeRef) -> Vec<DenseTree> {
|
|
let mut builder = DenseTreeBuilder::new();
|
|
builder.add_children_of_node(body_node);
|
|
builder.into_tree()
|
|
}
|
|
|
|
pub fn is_text(&self) -> bool {
|
|
match self {
|
|
DenseTree::Text(_) => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
|
|
pub fn generate_textual_format(nodes: &Vec<DenseTree>) -> String {
|
|
let mut buf = String::new();
|
|
for node in nodes {
|
|
node.append_in_textual_format(&mut buf);
|
|
}
|
|
simplify_newlines(&buf)
|
|
}
|
|
|
|
fn append_in_textual_format(&self, string: &mut String) {
|
|
match self {
|
|
DenseTree::Heading1(children) => {
|
|
string.push_str("\n\n# ");
|
|
for child in children {
|
|
child.append_in_textual_format(string);
|
|
}
|
|
string.push_str("\n");
|
|
}
|
|
DenseTree::Heading2(children) => {
|
|
string.push_str("\n\n## ");
|
|
for child in children {
|
|
child.append_in_textual_format(string);
|
|
}
|
|
string.push_str("\n");
|
|
}
|
|
DenseTree::Heading3(children) => {
|
|
string.push_str("\n\n### ");
|
|
for child in children {
|
|
child.append_in_textual_format(string);
|
|
}
|
|
string.push_str("\n");
|
|
}
|
|
DenseTree::Heading4(children) => {
|
|
string.push_str("\n\n#### ");
|
|
for child in children {
|
|
child.append_in_textual_format(string);
|
|
}
|
|
string.push_str("\n");
|
|
}
|
|
DenseTree::Heading5(children) => {
|
|
string.push_str("\n\n##### ");
|
|
for child in children {
|
|
child.append_in_textual_format(string);
|
|
}
|
|
string.push_str("\n");
|
|
}
|
|
DenseTree::Heading6(children) => {
|
|
string.push_str("\n\n###### ");
|
|
for child in children {
|
|
child.append_in_textual_format(string);
|
|
}
|
|
string.push_str("\n");
|
|
}
|
|
DenseTree::Link { children, href, .. } => {
|
|
string.push('[');
|
|
for child in children {
|
|
child.append_in_textual_format(string);
|
|
}
|
|
string.push_str(&format!("]({})", href));
|
|
}
|
|
DenseTree::Image { .. } => {
|
|
string.push_str("[IMG]");
|
|
}
|
|
DenseTree::Text(text) => {
|
|
string.push_str(text);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
struct DenseTreeBuilder {
|
|
/// Siblings in the buffer.
|
|
nodes: Vec<DenseTree>,
|
|
|
|
/// Number of preceding newlines at the end of the buffer.
|
|
/// Used for generating text that preserves some vague structure.
|
|
preceding_newlines: u32,
|
|
}
|
|
|
|
impl DenseTreeBuilder {
|
|
pub fn new() -> Self {
|
|
DenseTreeBuilder {
|
|
nodes: vec![],
|
|
preceding_newlines: 0,
|
|
}
|
|
}
|
|
|
|
pub fn into_tree(mut self) -> Vec<DenseTree> {
|
|
self.simplify();
|
|
self.nodes
|
|
}
|
|
|
|
/// Simplify the DenseTree nodes: coalesce Text nodes and
|
|
pub fn simplify(&mut self) {
|
|
// First coalesce all text nodes
|
|
// TODO(perf): Do it in a better way to reduce the cost.
|
|
let mut idx = 1;
|
|
while idx < self.nodes.len() {
|
|
if self.nodes[idx].is_text() && self.nodes[idx - 1].is_text() {
|
|
// Merge the two text nodes is a text node, consume it and merge it in.
|
|
match self.nodes.remove(idx) {
|
|
DenseTree::Text(append_text) => {
|
|
match &mut self.nodes[idx - 1] {
|
|
DenseTree::Text(string) => {
|
|
string.push_str(&append_text);
|
|
// Continue so we don't advance, as we just moved the list down a
|
|
// bit.
|
|
continue;
|
|
}
|
|
_ => {
|
|
panic!(
|
|
"Should be unreachable: checked to be text first. ({})",
|
|
idx - 1
|
|
);
|
|
}
|
|
}
|
|
}
|
|
_ => {
|
|
panic!("Should be unreachable: checked to be text first. ({})", idx);
|
|
}
|
|
}
|
|
}
|
|
|
|
idx += 1;
|
|
}
|
|
|
|
for node in &mut self.nodes {
|
|
match node {
|
|
DenseTree::Text(text) => {
|
|
// Coalesce newlines so there are never more than 2 in a row.
|
|
*text = simplify_newlines(&simplify_whitespace(&text));
|
|
}
|
|
_ => { /* nop */ }
|
|
}
|
|
}
|
|
|
|
match self.nodes.get_mut(0) {
|
|
Some(DenseTree::Text(text)) => {
|
|
*text = text.trim_start().to_owned();
|
|
}
|
|
_ => (),
|
|
}
|
|
|
|
let num_nodes = self.nodes.len();
|
|
if num_nodes > 1 {
|
|
match self.nodes.get_mut(num_nodes - 1) {
|
|
Some(DenseTree::Text(text)) => {
|
|
*text = text.trim_end().to_owned();
|
|
}
|
|
_ => (),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Convert a HTML node's children into DenseTree nodes.
|
|
pub fn add_children_of_node(&mut self, node: NodeRef) {
|
|
for child in node.children() {
|
|
if let Some(element) = child.as_element() {
|
|
match element.name.local.deref() {
|
|
"h1" => {
|
|
self.nodes
|
|
.push(DenseTree::Heading1(DenseTree::from_body(child)));
|
|
self.preceding_newlines = 2;
|
|
}
|
|
"h2" => {
|
|
self.nodes
|
|
.push(DenseTree::Heading2(DenseTree::from_body(child)));
|
|
self.preceding_newlines = 2;
|
|
}
|
|
"h3" => {
|
|
self.nodes
|
|
.push(DenseTree::Heading3(DenseTree::from_body(child)));
|
|
self.preceding_newlines = 2;
|
|
}
|
|
"h4" => {
|
|
self.nodes
|
|
.push(DenseTree::Heading4(DenseTree::from_body(child)));
|
|
self.preceding_newlines = 2;
|
|
}
|
|
"h5" => {
|
|
self.nodes
|
|
.push(DenseTree::Heading5(DenseTree::from_body(child)));
|
|
self.preceding_newlines = 2;
|
|
}
|
|
"h6" => {
|
|
self.nodes
|
|
.push(DenseTree::Heading6(DenseTree::from_body(child)));
|
|
self.preceding_newlines = 2;
|
|
}
|
|
"a" => {
|
|
let attrs = element.attributes.borrow();
|
|
let href = attrs.get("href").unwrap_or("").to_owned();
|
|
|
|
if href.starts_with("javascript:") || href.starts_with("data:") {
|
|
// Skip this link. Just unwrap it.
|
|
self.add_children_of_node(child.clone());
|
|
continue;
|
|
}
|
|
|
|
let nofollow = attrs
|
|
.get("rel")
|
|
.map(|rel: &str| {
|
|
rel.split_whitespace()
|
|
.any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
|
|
})
|
|
.unwrap_or(false);
|
|
drop(attrs);
|
|
|
|
self.nodes.push(DenseTree::Link {
|
|
children: DenseTree::from_body(child),
|
|
href,
|
|
nofollow,
|
|
});
|
|
|
|
self.preceding_newlines = 0;
|
|
}
|
|
"img" => {
|
|
// TODO Decide if this is worth the space...
|
|
let attrs = element.attributes.borrow();
|
|
let src = attrs.get("src").unwrap_or("").to_owned();
|
|
|
|
if src.starts_with("javascript:") || src.starts_with("data:") {
|
|
// Skip this image.
|
|
continue;
|
|
}
|
|
|
|
let alt = simplify_whitespace(attrs.get("alt").unwrap_or("").trim());
|
|
|
|
self.nodes.push(DenseTree::Image { src, alt });
|
|
}
|
|
"p" | "pre" => {
|
|
// Paragraphs must have 2 preceding newlines.
|
|
if self.preceding_newlines < 2 {
|
|
self.nodes.push(DenseTree::Text(
|
|
match self.preceding_newlines {
|
|
0 => "\n\n",
|
|
1 => "\n",
|
|
_ => unreachable!(),
|
|
}
|
|
.to_owned(),
|
|
));
|
|
self.preceding_newlines = 2;
|
|
}
|
|
|
|
self.add_children_of_node(child);
|
|
|
|
// Paragraphs must have 2 trailing newlines.
|
|
if self.preceding_newlines < 2 {
|
|
self.nodes.push(DenseTree::Text(
|
|
match self.preceding_newlines {
|
|
0 => "\n\n",
|
|
1 => "\n",
|
|
_ => unreachable!(),
|
|
}
|
|
.to_owned(),
|
|
));
|
|
self.preceding_newlines = 2;
|
|
}
|
|
}
|
|
"br" => {
|
|
self.nodes.push(DenseTree::Text("\n".to_owned()));
|
|
self.preceding_newlines += 1;
|
|
}
|
|
"div" | "li" => {
|
|
// Divs must have 1 preceding newline.
|
|
if self.preceding_newlines < 1 {
|
|
self.nodes.push(DenseTree::Text("\n".to_owned()));
|
|
self.preceding_newlines = 1;
|
|
}
|
|
|
|
self.add_children_of_node(child);
|
|
|
|
// Divs must have 1 trailing newline.
|
|
if self.preceding_newlines < 1 {
|
|
self.nodes.push(DenseTree::Text("\n".to_owned()));
|
|
self.preceding_newlines = 1;
|
|
}
|
|
}
|
|
"script" | "style" | "svg" | "noscript" => {
|
|
// We just prune these, as we don't want them.
|
|
// (noscript tends just to be noisy 'enable JS now!!' messages, so prune those too.)
|
|
continue;
|
|
}
|
|
_ => {
|
|
// Simply unwrap the unknown element.
|
|
self.add_children_of_node(child);
|
|
}
|
|
}
|
|
//element.name.local
|
|
} else if let Some(text) = child.as_text() {
|
|
let text_to_add =
|
|
simplify_whitespace(&simplify_newlines(&text.borrow().replace("\n", " ")));
|
|
self.preceding_newlines =
|
|
text_to_add.chars().rev().take_while(|c| *c == '\n').count() as u32;
|
|
self.nodes.push(DenseTree::Text(text_to_add));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
lazy_static! {
    /// A run of spaces and/or tabs.
    static ref MANY_WHITESPACE: Regex =
        Regex::new(r"[ \t]+").expect("MANY_WHITESPACE is a valid regex");

    /// Three or more line breaks, possibly with spaces or tabs in between.
    static ref THREE_OR_MORE_NEWLINES: Regex =
        Regex::new(r"\n+[ \t\n]+\n+").expect("THREE_OR_MORE_NEWLINES is a valid regex");

    /// Whitespace at the start of a line (directly after a line break).
    ///
    /// The class is "any whitespace except `\n`": a plain `\s` would also swallow the
    /// following line breaks and thereby collapse blank lines (paragraph breaks).
    static ref UNNECESSARY_LS_WHITESPACE: Regex =
        Regex::new(r"\n[\s&&[^\n]]+").expect("UNNECESSARY_LS_WHITESPACE is a valid regex");

    /// Whitespace at the end of a line (directly before a line break).
    ///
    /// The class is "any whitespace except `\n`", for the same reason as above.
    static ref UNNECESSARY_LE_WHITESPACE: Regex =
        Regex::new(r"[\s&&[^\n]]+\n").expect("UNNECESSARY_LE_WHITESPACE is a valid regex");
}
|
|
|
|
pub fn simplify_whitespace(input: &str) -> String {
|
|
let s = MANY_WHITESPACE.replace_all(input, " ");
|
|
let s = UNNECESSARY_LS_WHITESPACE.replace_all(s.borrow(), "\n");
|
|
UNNECESSARY_LE_WHITESPACE
|
|
.replace_all(s.borrow(), "\n")
|
|
.into_owned()
|
|
}
|
|
|
|
pub fn simplify_newlines(input: &str) -> String {
|
|
THREE_OR_MORE_NEWLINES
|
|
.replace_all(&input.replace("\r", ""), "\n\n")
|
|
.into_owned()
|
|
}
|
|
|
|
#[cfg(test)]
mod test {
    // `super::` resolves no matter where this file sits in the crate's module tree.
    use super::{simplify_newlines, simplify_whitespace};

    /// Runs of spaces and tabs collapse to one space; whitespace around a line break
    /// is removed.
    #[test]
    fn test_simplify_whitespace() {
        assert_eq!(
            simplify_whitespace("hello cat\tdog \t bat"),
            "hello cat dog bat"
        );
        assert_eq!(simplify_whitespace("a \n\t b"), "a\nb");
    }

    /// Three or more line breaks collapse to two, carriage returns are dropped and a
    /// plain blank line is left alone.
    #[test]
    fn test_simplify_newlines() {
        assert_eq!(
            simplify_newlines("hello\n\n\n\nare\n\n\nyou\n\n\n\n\n\n\t\n\n\nthere?"),
            "hello\n\nare\n\nyou\n\nthere?"
        );
        assert_eq!(simplify_newlines("a\r\nb"), "a\nb");
        assert_eq!(simplify_newlines("a\n\nb"), "a\n\nb");
    }
}
|