From eb899ac9a504b14fd2482e521ef36e93d7bb0cb6 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 27 Mar 2022 21:36:21 +0100 Subject: [PATCH] Simplify textification of indexed pages --- quickpeep_densedoc/src/lib.rs | 56 ++++++++++++------- quickpeep_indexer/src/bin/qp-indexer.rs | 5 +- quickpeep_raker/src/raking/page_extraction.rs | 4 +- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs index f9844a1..719176c 100644 --- a/quickpeep_densedoc/src/lib.rs +++ b/quickpeep_densedoc/src/lib.rs @@ -66,67 +66,85 @@ impl DenseTree { } } - pub fn generate_textual_format(nodes: &Vec) -> String { + pub fn generate_textual_format(nodes: &Vec, rich: bool) -> String { let mut buf = String::new(); for node in nodes { - node.append_in_textual_format(&mut buf); + node.append_in_textual_format(&mut buf, rich); } simplify_newlines(&buf) } - fn append_in_textual_format(&self, string: &mut String) { + fn append_in_textual_format(&self, string: &mut String, rich: bool) { match self { DenseTree::Heading1(children) => { - string.push_str("\n\n# "); + if rich { + string.push_str("\n\n# "); + }; for child in children { - child.append_in_textual_format(string); + child.append_in_textual_format(string, rich); } string.push_str("\n"); } DenseTree::Heading2(children) => { - string.push_str("\n\n## "); + if rich { + string.push_str("\n\n## "); + }; for child in children { - child.append_in_textual_format(string); + child.append_in_textual_format(string, rich); } string.push_str("\n"); } DenseTree::Heading3(children) => { - string.push_str("\n\n### "); + if rich { + string.push_str("\n\n### "); + }; for child in children { - child.append_in_textual_format(string); + child.append_in_textual_format(string, rich); } string.push_str("\n"); } DenseTree::Heading4(children) => { - string.push_str("\n\n#### "); + if rich { + string.push_str("\n\n#### "); + }; for child in children { - child.append_in_textual_format(string); + child.append_in_textual_format(string, rich); } string.push_str("\n"); } DenseTree::Heading5(children) => { - string.push_str("\n\n##### "); + if rich { + string.push_str("\n\n##### "); + }; for child in children { - child.append_in_textual_format(string); + child.append_in_textual_format(string, rich); } string.push_str("\n"); } DenseTree::Heading6(children) => { - string.push_str("\n\n###### "); + if rich { + string.push_str("\n\n###### "); + }; for child in children { - child.append_in_textual_format(string); + child.append_in_textual_format(string, rich); } string.push_str("\n"); } DenseTree::Link { children, href, .. } => { - string.push('['); + if rich { + string.push('['); + }; for child in children { - child.append_in_textual_format(string); + child.append_in_textual_format(string, rich); } - string.push_str(&format!("]({})", href)); + if rich { + string.push_str(&format!("]({})", href)); + }; } DenseTree::Image { .. } => { - string.push_str("[IMG]"); + if rich { + string.push_str("[IMG]"); + }; } DenseTree::Text(text) => { string.push_str(text); diff --git a/quickpeep_indexer/src/bin/qp-indexer.rs b/quickpeep_indexer/src/bin/qp-indexer.rs index 3f4ddf4..539019b 100644 --- a/quickpeep_indexer/src/bin/qp-indexer.rs +++ b/quickpeep_indexer/src/bin/qp-indexer.rs @@ -76,8 +76,9 @@ pub async fn main() -> anyhow::Result<()> { let document = page_record.record.document; - let article_body = DenseTree::generate_textual_format(&document.body_content); - let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder); + let article_body = DenseTree::generate_textual_format(&document.body_content, false); + let nonarticle_body = + DenseTree::generate_textual_format(&document.body_remainder, false); let tags = seed_lookup .look_up(&Url::parse(page_record.url.as_ref())?)? diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index 220f908..51175b3 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -181,7 +181,9 @@ impl PageExtractionServiceInternal { } let dense_doc = DenseTree::from_body(root_node.clone()); - let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc))); + let dense_doc_text = Lazy::new(Box::new(|| { + DenseTree::generate_textual_format(&dense_doc, true) + })); //eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text); if language.is_none() {