Simplify textification of indexed pages

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 21:36:21 +01:00
parent 8ec8003dbb
commit eb899ac9a5
3 changed files with 43 additions and 22 deletions

View File

@@ -66,67 +66,85 @@ impl DenseTree {
}
}
pub fn generate_textual_format(nodes: &Vec<DenseTree>) -> String {
pub fn generate_textual_format(nodes: &Vec<DenseTree>, rich: bool) -> String {
let mut buf = String::new();
for node in nodes {
node.append_in_textual_format(&mut buf);
node.append_in_textual_format(&mut buf, rich);
}
simplify_newlines(&buf)
}
fn append_in_textual_format(&self, string: &mut String) {
fn append_in_textual_format(&self, string: &mut String, rich: bool) {
match self {
DenseTree::Heading1(children) => {
if rich {
string.push_str("\n\n# ");
};
for child in children {
child.append_in_textual_format(string);
child.append_in_textual_format(string, rich);
}
string.push_str("\n");
}
DenseTree::Heading2(children) => {
if rich {
string.push_str("\n\n## ");
};
for child in children {
child.append_in_textual_format(string);
child.append_in_textual_format(string, rich);
}
string.push_str("\n");
}
DenseTree::Heading3(children) => {
if rich {
string.push_str("\n\n### ");
};
for child in children {
child.append_in_textual_format(string);
child.append_in_textual_format(string, rich);
}
string.push_str("\n");
}
DenseTree::Heading4(children) => {
if rich {
string.push_str("\n\n#### ");
};
for child in children {
child.append_in_textual_format(string);
child.append_in_textual_format(string, rich);
}
string.push_str("\n");
}
DenseTree::Heading5(children) => {
if rich {
string.push_str("\n\n##### ");
};
for child in children {
child.append_in_textual_format(string);
child.append_in_textual_format(string, rich);
}
string.push_str("\n");
}
DenseTree::Heading6(children) => {
if rich {
string.push_str("\n\n###### ");
};
for child in children {
child.append_in_textual_format(string);
child.append_in_textual_format(string, rich);
}
string.push_str("\n");
}
DenseTree::Link { children, href, .. } => {
if rich {
string.push('[');
};
for child in children {
child.append_in_textual_format(string);
child.append_in_textual_format(string, rich);
}
if rich {
string.push_str(&format!("]({})", href));
};
}
DenseTree::Image { .. } => {
if rich {
string.push_str("[IMG]");
};
}
DenseTree::Text(text) => {
string.push_str(text);

View File

@@ -76,8 +76,9 @@ pub async fn main() -> anyhow::Result<()> {
let document = page_record.record.document;
let article_body = DenseTree::generate_textual_format(&document.body_content);
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
let article_body = DenseTree::generate_textual_format(&document.body_content, false);
let nonarticle_body =
DenseTree::generate_textual_format(&document.body_remainder, false);
let tags = seed_lookup
.look_up(&Url::parse(page_record.url.as_ref())?)?

View File

@@ -181,7 +181,9 @@ impl PageExtractionServiceInternal {
}
let dense_doc = DenseTree::from_body(root_node.clone());
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
let dense_doc_text = Lazy::new(Box::new(|| {
DenseTree::generate_textual_format(&dense_doc, true)
}));
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
if language.is_none() {