Simplify textification of indexed pages
This commit is contained in:
parent
8ec8003dbb
commit
eb899ac9a5
|
@ -66,67 +66,85 @@ impl DenseTree {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn generate_textual_format(nodes: &Vec<DenseTree>) -> String {
|
pub fn generate_textual_format(nodes: &Vec<DenseTree>, rich: bool) -> String {
|
||||||
let mut buf = String::new();
|
let mut buf = String::new();
|
||||||
for node in nodes {
|
for node in nodes {
|
||||||
node.append_in_textual_format(&mut buf);
|
node.append_in_textual_format(&mut buf, rich);
|
||||||
}
|
}
|
||||||
simplify_newlines(&buf)
|
simplify_newlines(&buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn append_in_textual_format(&self, string: &mut String) {
|
fn append_in_textual_format(&self, string: &mut String, rich: bool) {
|
||||||
match self {
|
match self {
|
||||||
DenseTree::Heading1(children) => {
|
DenseTree::Heading1(children) => {
|
||||||
string.push_str("\n\n# ");
|
if rich {
|
||||||
|
string.push_str("\n\n# ");
|
||||||
|
};
|
||||||
for child in children {
|
for child in children {
|
||||||
child.append_in_textual_format(string);
|
child.append_in_textual_format(string, rich);
|
||||||
}
|
}
|
||||||
string.push_str("\n");
|
string.push_str("\n");
|
||||||
}
|
}
|
||||||
DenseTree::Heading2(children) => {
|
DenseTree::Heading2(children) => {
|
||||||
string.push_str("\n\n## ");
|
if rich {
|
||||||
|
string.push_str("\n\n## ");
|
||||||
|
};
|
||||||
for child in children {
|
for child in children {
|
||||||
child.append_in_textual_format(string);
|
child.append_in_textual_format(string, rich);
|
||||||
}
|
}
|
||||||
string.push_str("\n");
|
string.push_str("\n");
|
||||||
}
|
}
|
||||||
DenseTree::Heading3(children) => {
|
DenseTree::Heading3(children) => {
|
||||||
string.push_str("\n\n### ");
|
if rich {
|
||||||
|
string.push_str("\n\n### ");
|
||||||
|
};
|
||||||
for child in children {
|
for child in children {
|
||||||
child.append_in_textual_format(string);
|
child.append_in_textual_format(string, rich);
|
||||||
}
|
}
|
||||||
string.push_str("\n");
|
string.push_str("\n");
|
||||||
}
|
}
|
||||||
DenseTree::Heading4(children) => {
|
DenseTree::Heading4(children) => {
|
||||||
string.push_str("\n\n#### ");
|
if rich {
|
||||||
|
string.push_str("\n\n#### ");
|
||||||
|
};
|
||||||
for child in children {
|
for child in children {
|
||||||
child.append_in_textual_format(string);
|
child.append_in_textual_format(string, rich);
|
||||||
}
|
}
|
||||||
string.push_str("\n");
|
string.push_str("\n");
|
||||||
}
|
}
|
||||||
DenseTree::Heading5(children) => {
|
DenseTree::Heading5(children) => {
|
||||||
string.push_str("\n\n##### ");
|
if rich {
|
||||||
|
string.push_str("\n\n##### ");
|
||||||
|
};
|
||||||
for child in children {
|
for child in children {
|
||||||
child.append_in_textual_format(string);
|
child.append_in_textual_format(string, rich);
|
||||||
}
|
}
|
||||||
string.push_str("\n");
|
string.push_str("\n");
|
||||||
}
|
}
|
||||||
DenseTree::Heading6(children) => {
|
DenseTree::Heading6(children) => {
|
||||||
string.push_str("\n\n###### ");
|
if rich {
|
||||||
|
string.push_str("\n\n###### ");
|
||||||
|
};
|
||||||
for child in children {
|
for child in children {
|
||||||
child.append_in_textual_format(string);
|
child.append_in_textual_format(string, rich);
|
||||||
}
|
}
|
||||||
string.push_str("\n");
|
string.push_str("\n");
|
||||||
}
|
}
|
||||||
DenseTree::Link { children, href, .. } => {
|
DenseTree::Link { children, href, .. } => {
|
||||||
string.push('[');
|
if rich {
|
||||||
|
string.push('[');
|
||||||
|
};
|
||||||
for child in children {
|
for child in children {
|
||||||
child.append_in_textual_format(string);
|
child.append_in_textual_format(string, rich);
|
||||||
}
|
}
|
||||||
string.push_str(&format!("]({})", href));
|
if rich {
|
||||||
|
string.push_str(&format!("]({})", href));
|
||||||
|
};
|
||||||
}
|
}
|
||||||
DenseTree::Image { .. } => {
|
DenseTree::Image { .. } => {
|
||||||
string.push_str("[IMG]");
|
if rich {
|
||||||
|
string.push_str("[IMG]");
|
||||||
|
};
|
||||||
}
|
}
|
||||||
DenseTree::Text(text) => {
|
DenseTree::Text(text) => {
|
||||||
string.push_str(text);
|
string.push_str(text);
|
||||||
|
|
|
@ -76,8 +76,9 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
|
|
||||||
let document = page_record.record.document;
|
let document = page_record.record.document;
|
||||||
|
|
||||||
let article_body = DenseTree::generate_textual_format(&document.body_content);
|
let article_body = DenseTree::generate_textual_format(&document.body_content, false);
|
||||||
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
|
let nonarticle_body =
|
||||||
|
DenseTree::generate_textual_format(&document.body_remainder, false);
|
||||||
|
|
||||||
let tags = seed_lookup
|
let tags = seed_lookup
|
||||||
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
||||||
|
|
|
@ -181,7 +181,9 @@ impl PageExtractionServiceInternal {
|
||||||
}
|
}
|
||||||
|
|
||||||
let dense_doc = DenseTree::from_body(root_node.clone());
|
let dense_doc = DenseTree::from_body(root_node.clone());
|
||||||
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
|
let dense_doc_text = Lazy::new(Box::new(|| {
|
||||||
|
DenseTree::generate_textual_format(&dense_doc, true)
|
||||||
|
}));
|
||||||
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
|
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
|
||||||
|
|
||||||
if language.is_none() {
|
if language.is_none() {
|
||||||
|
|
Loading…
Reference in New Issue