Simplify textification of indexed pages

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 21:36:21 +01:00
parent 8ec8003dbb
commit eb899ac9a5
3 changed files with 43 additions and 22 deletions

View File

@@ -66,67 +66,85 @@ impl DenseTree {
} }
} }
pub fn generate_textual_format(nodes: &Vec<DenseTree>) -> String { pub fn generate_textual_format(nodes: &Vec<DenseTree>, rich: bool) -> String {
let mut buf = String::new(); let mut buf = String::new();
for node in nodes { for node in nodes {
node.append_in_textual_format(&mut buf); node.append_in_textual_format(&mut buf, rich);
} }
simplify_newlines(&buf) simplify_newlines(&buf)
} }
fn append_in_textual_format(&self, string: &mut String) { fn append_in_textual_format(&self, string: &mut String, rich: bool) {
match self { match self {
DenseTree::Heading1(children) => { DenseTree::Heading1(children) => {
string.push_str("\n\n# "); if rich {
string.push_str("\n\n# ");
};
for child in children { for child in children {
child.append_in_textual_format(string); child.append_in_textual_format(string, rich);
} }
string.push_str("\n"); string.push_str("\n");
} }
DenseTree::Heading2(children) => { DenseTree::Heading2(children) => {
string.push_str("\n\n## "); if rich {
string.push_str("\n\n## ");
};
for child in children { for child in children {
child.append_in_textual_format(string); child.append_in_textual_format(string, rich);
} }
string.push_str("\n"); string.push_str("\n");
} }
DenseTree::Heading3(children) => { DenseTree::Heading3(children) => {
string.push_str("\n\n### "); if rich {
string.push_str("\n\n### ");
};
for child in children { for child in children {
child.append_in_textual_format(string); child.append_in_textual_format(string, rich);
} }
string.push_str("\n"); string.push_str("\n");
} }
DenseTree::Heading4(children) => { DenseTree::Heading4(children) => {
string.push_str("\n\n#### "); if rich {
string.push_str("\n\n#### ");
};
for child in children { for child in children {
child.append_in_textual_format(string); child.append_in_textual_format(string, rich);
} }
string.push_str("\n"); string.push_str("\n");
} }
DenseTree::Heading5(children) => { DenseTree::Heading5(children) => {
string.push_str("\n\n##### "); if rich {
string.push_str("\n\n##### ");
};
for child in children { for child in children {
child.append_in_textual_format(string); child.append_in_textual_format(string, rich);
} }
string.push_str("\n"); string.push_str("\n");
} }
DenseTree::Heading6(children) => { DenseTree::Heading6(children) => {
string.push_str("\n\n###### "); if rich {
string.push_str("\n\n###### ");
};
for child in children { for child in children {
child.append_in_textual_format(string); child.append_in_textual_format(string, rich);
} }
string.push_str("\n"); string.push_str("\n");
} }
DenseTree::Link { children, href, .. } => { DenseTree::Link { children, href, .. } => {
string.push('['); if rich {
string.push('[');
};
for child in children { for child in children {
child.append_in_textual_format(string); child.append_in_textual_format(string, rich);
} }
string.push_str(&format!("]({})", href)); if rich {
string.push_str(&format!("]({})", href));
};
} }
DenseTree::Image { .. } => { DenseTree::Image { .. } => {
string.push_str("[IMG]"); if rich {
string.push_str("[IMG]");
};
} }
DenseTree::Text(text) => { DenseTree::Text(text) => {
string.push_str(text); string.push_str(text);

View File

@@ -76,8 +76,9 @@ pub async fn main() -> anyhow::Result<()> {
let document = page_record.record.document; let document = page_record.record.document;
let article_body = DenseTree::generate_textual_format(&document.body_content); let article_body = DenseTree::generate_textual_format(&document.body_content, false);
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder); let nonarticle_body =
DenseTree::generate_textual_format(&document.body_remainder, false);
let tags = seed_lookup let tags = seed_lookup
.look_up(&Url::parse(page_record.url.as_ref())?)? .look_up(&Url::parse(page_record.url.as_ref())?)?

View File

@@ -181,7 +181,9 @@ impl PageExtractionServiceInternal {
} }
let dense_doc = DenseTree::from_body(root_node.clone()); let dense_doc = DenseTree::from_body(root_node.clone());
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc))); let dense_doc_text = Lazy::new(Box::new(|| {
DenseTree::generate_textual_format(&dense_doc, true)
}));
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text); //eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
if language.is_none() { if language.is_none() {