Load documents and index them!

This commit is contained in:
Olivier 'reivilibre' 2022-03-24 23:45:36 +00:00
parent 5418afe8dd
commit 9866da2d16
3 changed files with 22 additions and 3 deletions

1
Cargo.lock generated
View File

@ -3465,6 +3465,7 @@ dependencies = [
"colour", "colour",
"env_logger", "env_logger",
"log", "log",
"quickpeep_densedoc",
"quickpeep_index", "quickpeep_index",
"quickpeep_structs", "quickpeep_structs",
"serde", "serde",

View File

@ -17,5 +17,6 @@ toml = "0.5.8"
clap = { version = "3.1.6", features = ["derive"] } clap = { version = "3.1.6", features = ["derive"] }
colour = "0.6.0" colour = "0.6.0"
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
quickpeep_index = { path = "../quickpeep_index" } quickpeep_index = { path = "../quickpeep_index" }
quickpeep_structs = { path = "../quickpeep_structs" } quickpeep_structs = { path = "../quickpeep_structs" }

View File

@ -5,6 +5,8 @@ use env_logger::Env;
use std::fs::File; use std::fs::File;
use std::io::{BufRead, BufReader}; use std::io::{BufRead, BufReader};
use quickpeep_densedoc::{DenseTree};
use quickpeep_index::backend::BackendIndependentDocument;
use quickpeep_indexer::config::IndexerConfig; use quickpeep_indexer::config::IndexerConfig;
use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES}; use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES};
use std::path::PathBuf; use std::path::PathBuf;
@ -28,7 +30,7 @@ pub fn main() -> anyhow::Result<()> {
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml")); .unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
let config = IndexerConfig::load(&config_path).context("Failed to load config")?; let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
let _indexer_backend = config.open_indexer_backend()?; let mut indexer_backend = config.open_indexer_backend()?;
for pack in opts.rakepacks { for pack in opts.rakepacks {
blue!("Indexing: "); blue!("Indexing: ");
@ -47,10 +49,25 @@ pub fn main() -> anyhow::Result<()> {
// TODO(unstable): this condition is `.has_data_left()` but it's unstable. // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
while buf_reader.fill_buf().map(|b| !b.is_empty())? { while buf_reader.fill_buf().map(|b| !b.is_empty())? {
let _page_record: PackRecord<RakedPageEntry> = let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(&mut buf_reader)?;
serde_bare::from_reader(&mut buf_reader)?;
let document = page_record.record.document;
let article_body = DenseTree::generate_textual_format(&document.body_content);
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
// TODO Store the actual structure of the document in the store?
indexer_backend.add_document(BackendIndependentDocument {
title: document.head.title,
article_body,
nonarticle_body,
// TODO populate tags & antifeatures
tags: vec![],
url: page_record.url.to_string(),
})?;
} }
} }
indexer_backend.flush()?;
Ok(()) Ok(())
} }