From 9866da2d1650a96470e2203e025fb9989aa03a3c Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Thu, 24 Mar 2022 23:45:36 +0000 Subject: [PATCH] Load documents and index them! --- Cargo.lock | 1 + quickpeep_indexer/Cargo.toml | 1 + quickpeep_indexer/src/bin/qp-indexer.rs | 23 ++++++++++++++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 67e1967..f07542e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3465,6 +3465,7 @@ dependencies = [ "colour", "env_logger", "log", + "quickpeep_densedoc", "quickpeep_index", "quickpeep_structs", "serde", diff --git a/quickpeep_indexer/Cargo.toml b/quickpeep_indexer/Cargo.toml index 4b26ba9..1a322c2 100644 --- a/quickpeep_indexer/Cargo.toml +++ b/quickpeep_indexer/Cargo.toml @@ -17,5 +17,6 @@ toml = "0.5.8" clap = { version = "3.1.6", features = ["derive"] } colour = "0.6.0" +quickpeep_densedoc = { path = "../quickpeep_densedoc" } quickpeep_index = { path = "../quickpeep_index" } quickpeep_structs = { path = "../quickpeep_structs" } diff --git a/quickpeep_indexer/src/bin/qp-indexer.rs b/quickpeep_indexer/src/bin/qp-indexer.rs index 43cd067..08aa0a5 100644 --- a/quickpeep_indexer/src/bin/qp-indexer.rs +++ b/quickpeep_indexer/src/bin/qp-indexer.rs @@ -5,6 +5,8 @@ use env_logger::Env; use std::fs::File; use std::io::{BufRead, BufReader}; +use quickpeep_densedoc::{DenseTree}; +use quickpeep_index::backend::BackendIndependentDocument; use quickpeep_indexer::config::IndexerConfig; use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES}; use std::path::PathBuf; @@ -28,7 +30,7 @@ pub fn main() -> anyhow::Result<()> { .unwrap_or_else(|| PathBuf::from("qp_indexer.toml")); let config = IndexerConfig::load(&config_path).context("Failed to load config")?; - let _indexer_backend = config.open_indexer_backend()?; + let mut indexer_backend = config.open_indexer_backend()?; for pack in opts.rakepacks { blue!("Indexing: "); @@ -47,10 +49,25 @@ pub fn main() -> anyhow::Result<()> { // TODO(unstable): this condition is `.has_data_left()` but it's unstable. while buf_reader.fill_buf().map(|b| !b.is_empty())? { - let _page_record: PackRecord = - serde_bare::from_reader(&mut buf_reader)?; + let page_record: PackRecord = serde_bare::from_reader(&mut buf_reader)?; + + let document = page_record.record.document; + + let article_body = DenseTree::generate_textual_format(&document.body_content); + let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder); + + // TODO Store the actual structure of the document in the store? + indexer_backend.add_document(BackendIndependentDocument { + title: document.head.title, + article_body, + nonarticle_body, + // TODO populate tags & antifeatures + tags: vec![], + url: page_record.url.to_string(), + })?; } } + indexer_backend.flush()?; Ok(()) }