Simplify textification of indexed pages
This commit is contained in:
parent
8ec8003dbb
commit
eb899ac9a5
|
@@ -66,67 +66,85 @@ impl DenseTree {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn generate_textual_format(nodes: &Vec<DenseTree>) -> String {
|
||||
pub fn generate_textual_format(nodes: &Vec<DenseTree>, rich: bool) -> String {
|
||||
let mut buf = String::new();
|
||||
for node in nodes {
|
||||
node.append_in_textual_format(&mut buf);
|
||||
node.append_in_textual_format(&mut buf, rich);
|
||||
}
|
||||
simplify_newlines(&buf)
|
||||
}
|
||||
|
||||
fn append_in_textual_format(&self, string: &mut String) {
|
||||
fn append_in_textual_format(&self, string: &mut String, rich: bool) {
|
||||
match self {
|
||||
DenseTree::Heading1(children) => {
|
||||
string.push_str("\n\n# ");
|
||||
if rich {
|
||||
string.push_str("\n\n# ");
|
||||
};
|
||||
for child in children {
|
||||
child.append_in_textual_format(string);
|
||||
child.append_in_textual_format(string, rich);
|
||||
}
|
||||
string.push_str("\n");
|
||||
}
|
||||
DenseTree::Heading2(children) => {
|
||||
string.push_str("\n\n## ");
|
||||
if rich {
|
||||
string.push_str("\n\n## ");
|
||||
};
|
||||
for child in children {
|
||||
child.append_in_textual_format(string);
|
||||
child.append_in_textual_format(string, rich);
|
||||
}
|
||||
string.push_str("\n");
|
||||
}
|
||||
DenseTree::Heading3(children) => {
|
||||
string.push_str("\n\n### ");
|
||||
if rich {
|
||||
string.push_str("\n\n### ");
|
||||
};
|
||||
for child in children {
|
||||
child.append_in_textual_format(string);
|
||||
child.append_in_textual_format(string, rich);
|
||||
}
|
||||
string.push_str("\n");
|
||||
}
|
||||
DenseTree::Heading4(children) => {
|
||||
string.push_str("\n\n#### ");
|
||||
if rich {
|
||||
string.push_str("\n\n#### ");
|
||||
};
|
||||
for child in children {
|
||||
child.append_in_textual_format(string);
|
||||
child.append_in_textual_format(string, rich);
|
||||
}
|
||||
string.push_str("\n");
|
||||
}
|
||||
DenseTree::Heading5(children) => {
|
||||
string.push_str("\n\n##### ");
|
||||
if rich {
|
||||
string.push_str("\n\n##### ");
|
||||
};
|
||||
for child in children {
|
||||
child.append_in_textual_format(string);
|
||||
child.append_in_textual_format(string, rich);
|
||||
}
|
||||
string.push_str("\n");
|
||||
}
|
||||
DenseTree::Heading6(children) => {
|
||||
string.push_str("\n\n###### ");
|
||||
if rich {
|
||||
string.push_str("\n\n###### ");
|
||||
};
|
||||
for child in children {
|
||||
child.append_in_textual_format(string);
|
||||
child.append_in_textual_format(string, rich);
|
||||
}
|
||||
string.push_str("\n");
|
||||
}
|
||||
DenseTree::Link { children, href, .. } => {
|
||||
string.push('[');
|
||||
if rich {
|
||||
string.push('[');
|
||||
};
|
||||
for child in children {
|
||||
child.append_in_textual_format(string);
|
||||
child.append_in_textual_format(string, rich);
|
||||
}
|
||||
string.push_str(&format!("]({})", href));
|
||||
if rich {
|
||||
string.push_str(&format!("]({})", href));
|
||||
};
|
||||
}
|
||||
DenseTree::Image { .. } => {
|
||||
string.push_str("[IMG]");
|
||||
if rich {
|
||||
string.push_str("[IMG]");
|
||||
};
|
||||
}
|
||||
DenseTree::Text(text) => {
|
||||
string.push_str(text);
|
||||
|
|
|
@@ -76,8 +76,9 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
let document = page_record.record.document;
|
||||
|
||||
let article_body = DenseTree::generate_textual_format(&document.body_content);
|
||||
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
|
||||
let article_body = DenseTree::generate_textual_format(&document.body_content, false);
|
||||
let nonarticle_body =
|
||||
DenseTree::generate_textual_format(&document.body_remainder, false);
|
||||
|
||||
let tags = seed_lookup
|
||||
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
||||
|
|
|
@@ -181,7 +181,9 @@ impl PageExtractionServiceInternal {
|
|||
}
|
||||
|
||||
let dense_doc = DenseTree::from_body(root_node.clone());
|
||||
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
|
||||
let dense_doc_text = Lazy::new(Box::new(|| {
|
||||
DenseTree::generate_textual_format(&dense_doc, true)
|
||||
}));
|
||||
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
|
||||
|
||||
if language.is_none() {
|
||||
|
|
Loading…
Reference in New Issue