quickpeep/quickpeep_densedoc/src/lib.rs

399 lines
14 KiB
Rust

use kuchiki::NodeRef;
use lazy_static::lazy_static;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::borrow::Borrow;
use std::ops::Deref;
/// A condensed representation of an HTML document: head metadata plus the
/// body reduced to lists of [`DenseTree`] nodes.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct DenseDocument {
/// Metadata about the page (title, language, icon).
pub head: DenseHead,
/// Dense nodes for the body's content.
// NOTE(review): presumably the "main" content of the page, as opposed to
// `body_remainder` — the split is decided outside this file; confirm against the producer.
pub body_content: Vec<DenseTree>,
/// Dense nodes for the rest of the body.
// NOTE(review): presumably whatever was not classified as content — confirm against the producer.
pub body_remainder: Vec<DenseTree>,
}
/// Metadata describing a page, stored alongside its dense body in a [`DenseDocument`].
///
/// Absent values are represented by empty strings rather than `Option`.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct DenseHead {
/// Title of the page.
pub title: String,
/// Language of the page. May be empty if not discovered.
pub language: String,
/// URL to icon of the page. May be empty if none were discovered.
pub icon: String,
}
/// A node in the condensed representation of an HTML body.
///
/// Only headings, links, images and text are represented; the builder in this
/// file unwraps other elements into their children or prunes them.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub enum DenseTree {
/// Contents of an `<h1>` element.
Heading1(Vec<DenseTree>),
/// Contents of an `<h2>` element.
Heading2(Vec<DenseTree>),
/// Contents of an `<h3>` element.
Heading3(Vec<DenseTree>),
/// Contents of an `<h4>` element.
Heading4(Vec<DenseTree>),
/// Contents of an `<h5>` element.
Heading5(Vec<DenseTree>),
/// Contents of an `<h6>` element.
Heading6(Vec<DenseTree>),
/// An `<a>` element. Anchors whose `href` starts with `javascript:` or
/// `data:` are not emitted as links by the builder; their children are kept inline instead.
Link {
/// Dense nodes for the anchor's contents.
children: Vec<DenseTree>,
/// Value of the `href` attribute; empty if the attribute is absent.
href: String,
/// `true` if the `rel` attribute contains the word `nofollow` (ASCII case-insensitive).
nofollow: bool,
},
/// An `<img>` element. Images whose `src` starts with `javascript:` or
/// `data:` are skipped by the builder.
Image {
/// Value of the `src` attribute; empty if the attribute is absent.
src: String,
/// Value of the `alt` attribute, trimmed and whitespace-simplified; empty if absent.
alt: String,
// title? I don't know if it'd be very useful.
},
/// A run of text.
Text(String),
}
impl DenseTree {
    /// Converts the children of `body_node` into a simplified list of dense nodes.
    ///
    /// `body_node` itself is not represented in the output, only its descendants.
    pub fn from_body(body_node: NodeRef) -> Vec<DenseTree> {
        let mut tree_builder = DenseTreeBuilder::new();
        tree_builder.add_children_of_node(body_node);
        tree_builder.into_tree()
    }

    /// Returns `true` if this node is a [`DenseTree::Text`] node.
    pub fn is_text(&self) -> bool {
        matches!(self, DenseTree::Text(_))
    }

    /// Renders `nodes` in a Markdown-like textual form.
    ///
    /// Headings become `#`-prefixed lines, links become `[text](href)`, images become the
    /// placeholder `[IMG]` and text is emitted verbatim. The result is passed through
    /// [`simplify_newlines`] before being returned.
    pub fn generate_textual_format(nodes: &Vec<DenseTree>) -> String {
        let mut rendered = String::new();
        nodes
            .iter()
            .for_each(|node| node.append_in_textual_format(&mut rendered));
        simplify_newlines(&rendered)
    }

    /// Appends the textual rendering of this node (and, recursively, its children) to `string`.
    fn append_in_textual_format(&self, string: &mut String) {
        // Non-heading nodes are rendered immediately; headings fall through to shared code
        // below, differing only in the number of `#` characters.
        let (heading_level, heading_children) = match self {
            DenseTree::Heading1(children) => (1, children),
            DenseTree::Heading2(children) => (2, children),
            DenseTree::Heading3(children) => (3, children),
            DenseTree::Heading4(children) => (4, children),
            DenseTree::Heading5(children) => (5, children),
            DenseTree::Heading6(children) => (6, children),
            DenseTree::Link { children, href, .. } => {
                string.push('[');
                for child in children {
                    child.append_in_textual_format(string);
                }
                string.push_str("](");
                string.push_str(href);
                string.push(')');
                return;
            }
            DenseTree::Image { .. } => {
                string.push_str("[IMG]");
                return;
            }
            DenseTree::Text(text) => {
                string.push_str(text);
                return;
            }
        };

        // Headings start on their own line, preceded by a blank line.
        string.push_str("\n\n");
        string.push_str(&"#".repeat(heading_level));
        string.push(' ');
        for child in heading_children {
            child.append_in_textual_format(string);
        }
        string.push('\n');
    }
}
/// Accumulates a flat list of sibling [`DenseTree`] nodes while walking the
/// children of an HTML node, tracking how many newlines currently end the
/// buffer so that block-level elements can be separated appropriately.
struct DenseTreeBuilder {
/// Siblings in the buffer.
nodes: Vec<DenseTree>,
/// Number of preceding newlines at the end of the buffer.
/// Used for generating text that preserves some vague structure.
preceding_newlines: u32,
}
impl DenseTreeBuilder {
    /// Creates an empty builder with no buffered nodes.
    pub fn new() -> Self {
        DenseTreeBuilder {
            nodes: vec![],
            preceding_newlines: 0,
        }
    }

    /// Consumes the builder and returns its nodes after running [`Self::simplify`].
    pub fn into_tree(mut self) -> Vec<DenseTree> {
        self.simplify();
        self.nodes
    }

    /// Simplify the DenseTree nodes: coalesce adjacent Text nodes, normalise the whitespace
    /// and newlines inside every Text node, then trim leading whitespace from the first node
    /// and trailing whitespace from the last node (when those are Text nodes).
    pub fn simplify(&mut self) {
        // Coalesce runs of adjacent text nodes in a single linear pass: each text node is
        // appended to the previous output node if that one is text too.
        let mut merged: Vec<DenseTree> = Vec::with_capacity(self.nodes.len());
        for node in std::mem::take(&mut self.nodes) {
            if let DenseTree::Text(fragment) = &node {
                if let Some(DenseTree::Text(previous)) = merged.last_mut() {
                    previous.push_str(fragment);
                    continue;
                }
            }
            merged.push(node);
        }
        self.nodes = merged;

        for node in &mut self.nodes {
            if let DenseTree::Text(text) = node {
                // Coalesce newlines so there are never more than 2 in a row.
                *text = simplify_newlines(&simplify_whitespace(text.as_str()));
            }
        }

        if let Some(DenseTree::Text(text)) = self.nodes.first_mut() {
            *text = text.trim_start().to_owned();
        }
        // `last_mut` also covers the single-node case, so a lone text node is trimmed on
        // both sides rather than only at its start.
        if let Some(DenseTree::Text(text)) = self.nodes.last_mut() {
            *text = text.trim_end().to_owned();
        }
    }

    /// Convert a HTML node's children into DenseTree nodes.
    ///
    /// Headings, links and images become dedicated nodes; `p`, `pre`, `div`, `li` and `br`
    /// contribute newline text around their (inlined) contents; `script`, `style`, `svg`
    /// and `noscript` are pruned; every other element is unwrapped into its children.
    pub fn add_children_of_node(&mut self, node: NodeRef) {
        for child in node.children() {
            if let Some(element) = child.as_element() {
                match element.name.local.deref() {
                    "h1" => self.push_heading(DenseTree::Heading1, child),
                    "h2" => self.push_heading(DenseTree::Heading2, child),
                    "h3" => self.push_heading(DenseTree::Heading3, child),
                    "h4" => self.push_heading(DenseTree::Heading4, child),
                    "h5" => self.push_heading(DenseTree::Heading5, child),
                    "h6" => self.push_heading(DenseTree::Heading6, child),
                    "a" => {
                        let attrs = element.attributes.borrow();
                        let href = attrs.get("href").unwrap_or("").to_owned();
                        if href.starts_with("javascript:") || href.starts_with("data:") {
                            // Skip this link. Just unwrap it.
                            // (`child` is still borrowed through `attrs`, hence the clone.)
                            self.add_children_of_node(child.clone());
                            continue;
                        }
                        let nofollow = attrs.get("rel").map_or(false, |rel: &str| {
                            rel.split_whitespace()
                                .any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
                        });
                        // Release the attribute borrow before `child` is moved.
                        drop(attrs);
                        self.nodes.push(DenseTree::Link {
                            children: DenseTree::from_body(child),
                            href,
                            nofollow,
                        });
                        self.preceding_newlines = 0;
                    }
                    "img" => {
                        // TODO Decide if this is worth the space...
                        let attrs = element.attributes.borrow();
                        let src = attrs.get("src").unwrap_or("").to_owned();
                        if src.starts_with("javascript:") || src.starts_with("data:") {
                            // Skip this image.
                            continue;
                        }
                        let alt = simplify_whitespace(attrs.get("alt").unwrap_or("").trim());
                        self.nodes.push(DenseTree::Image { src, alt });
                        // An image is inline content, so the buffer no longer ends in
                        // newlines (mirrors the handling of links).
                        self.preceding_newlines = 0;
                    }
                    "p" | "pre" => {
                        // Paragraphs must have 2 preceding newlines.
                        self.ensure_trailing_newlines(2);
                        self.add_children_of_node(child);
                        // Paragraphs must have 2 trailing newlines.
                        self.ensure_trailing_newlines(2);
                    }
                    "br" => {
                        self.nodes.push(DenseTree::Text("\n".to_owned()));
                        self.preceding_newlines += 1;
                    }
                    "div" | "li" => {
                        // Divs must have 1 preceding newline.
                        self.ensure_trailing_newlines(1);
                        self.add_children_of_node(child);
                        // Divs must have 1 trailing newline.
                        self.ensure_trailing_newlines(1);
                    }
                    "script" | "style" | "svg" | "noscript" => {
                        // We just prune these, as we don't want them.
                        // (noscript tends just to be noisy 'enable JS now!!' messages, so prune those too.)
                        continue;
                    }
                    _ => {
                        // Simply unwrap the unknown element.
                        self.add_children_of_node(child);
                    }
                }
            } else if let Some(text) = child.as_text() {
                // Newlines in HTML source text are not significant, so they become spaces
                // before the text is normalised.
                let text_to_add =
                    simplify_whitespace(&simplify_newlines(&text.borrow().replace("\n", " ")));
                self.preceding_newlines =
                    text_to_add.chars().rev().take_while(|c| *c == '\n').count() as u32;
                self.nodes.push(DenseTree::Text(text_to_add));
            }
        }
    }

    /// Pushes a heading node built by `make_heading` from the children of `node`.
    ///
    /// A heading counts as being followed by two newlines.
    fn push_heading(&mut self, make_heading: fn(Vec<DenseTree>) -> DenseTree, node: NodeRef) {
        self.nodes.push(make_heading(DenseTree::from_body(node)));
        self.preceding_newlines = 2;
    }

    /// Ensures the buffer ends with at least `wanted` newlines, pushing a Text node holding
    /// the missing newlines if it currently ends with fewer.
    fn ensure_trailing_newlines(&mut self, wanted: u32) {
        if self.preceding_newlines < wanted {
            let missing = (wanted - self.preceding_newlines) as usize;
            self.nodes.push(DenseTree::Text("\n".repeat(missing)));
            self.preceding_newlines = wanted;
        }
    }
}
lazy_static! {
    /// A run of one or more spaces and tabs.
    static ref MANY_WHITESPACE: Regex = Regex::new(r"[ \t]+").unwrap();
    /// A newline, followed by at least one space/tab/newline, followed by a newline:
    /// i.e. three or more newlines, or two or more newlines separated by blank padding.
    static ref THREE_OR_MORE_NEWLINES: Regex = Regex::new(r"\n+[ \t\n]+\n+").unwrap();
    /// Whitespace at the start of a line (directly after a newline).
    ///
    /// `[^\S\n]` is "any whitespace except a newline". Newlines are deliberately excluded:
    /// a plain `\s` would also swallow the following newlines and collapse paragraph breaks
    /// (`"\n\n"`) into a single newline.
    static ref UNNECESSARY_LS_WHITESPACE: Regex = Regex::new(r"\n[^\S\n]+").unwrap();
    /// Whitespace at the end of a line (directly before a newline).
    ///
    /// Newlines are excluded from the class for the same reason as for the line-start pattern.
    static ref UNNECESSARY_LE_WHITESPACE: Regex = Regex::new(r"[^\S\n]+\n").unwrap();
}
/// Normalises whitespace in `input` and returns the result as a new `String`.
///
/// Three substitutions are applied in order:
/// 1. every match of `MANY_WHITESPACE` is replaced by a single space;
/// 2. every match of `UNNECESSARY_LS_WHITESPACE` is replaced by a single newline;
/// 3. every match of `UNNECESSARY_LE_WHITESPACE` is replaced by a single newline.
pub fn simplify_whitespace(input: &str) -> String {
    let collapsed = MANY_WHITESPACE.replace_all(input, " ");
    let line_starts_cleaned = UNNECESSARY_LS_WHITESPACE.replace_all(collapsed.borrow(), "\n");
    let line_ends_cleaned =
        UNNECESSARY_LE_WHITESPACE.replace_all(line_starts_cleaned.borrow(), "\n");
    line_ends_cleaned.into_owned()
}
/// Removes carriage returns from `input`, then replaces every match of
/// `THREE_OR_MORE_NEWLINES` with exactly two newlines.
pub fn simplify_newlines(input: &str) -> String {
    let without_carriage_returns = input.replace("\r", "");
    let collapsed = THREE_OR_MORE_NEWLINES.replace_all(&without_carriage_returns, "\n\n");
    collapsed.into_owned()
}
#[cfg(test)]
mod test {
    use crate::{simplify_newlines, simplify_whitespace};

    /// Mixed runs of spaces and tabs are reduced to single spaces.
    #[test]
    pub fn test_simplify_whitespace() {
        let input = "hello cat\tdog \t bat";
        let expected = "hello cat dog bat";
        assert_eq!(simplify_whitespace(input), expected);
    }

    /// Runs of three or more newlines (optionally padded with tabs) become exactly two.
    #[test]
    pub fn test_simplify_newlines() {
        let input = "hello\n\n\n\nare\n\n\nyou\n\n\n\n\n\n\t\n\n\nthere?";
        let expected = "hello\n\nare\n\nyou\n\nthere?";
        assert_eq!(simplify_newlines(input), expected);
    }
}