diff --git a/quickpeep_index/src/backend.rs b/quickpeep_index/src/backend.rs
index 978c5b7..d71317a 100644
--- a/quickpeep_index/src/backend.rs
+++ b/quickpeep_index/src/backend.rs
@@ -1,3 +1,5 @@
+use std::ops::Range;
+
 pub mod meili;
 pub mod tantivy;
 
@@ -7,6 +9,9 @@ pub trait Backend {
     fn add_document(&mut self, document: BackendIndependentDocument) -> anyhow::Result<()>;
 
     fn flush(&mut self) -> anyhow::Result<()>;
+
+    /// Searches the index for `query` and returns the matching documents.
+    fn query(&self, query: String) -> anyhow::Result<Vec<SearchDocument>>;
 }
 
 /// A backend-independent document struct.
@@ -18,3 +23,14 @@ pub struct BackendIndependentDocument {
     pub tags: Vec<String>,
     pub url: String,
 }
+
+/// A backend-independent search result, as returned by [`Backend::query`].
+#[derive(Clone, Debug)]
+pub struct SearchDocument {
+    pub score: f32,
+    pub title: String,
+    pub excerpt: String,
+    pub excerpt_highlights: Vec<Range<usize>>,
+    pub tags: Vec<String>,
+    pub url: String,
+}
diff --git a/quickpeep_index/src/backend/tantivy.rs b/quickpeep_index/src/backend/tantivy.rs
index fd21aaf..806767c 100644
--- a/quickpeep_index/src/backend/tantivy.rs
+++ b/quickpeep_index/src/backend/tantivy.rs
@@ -1,4 +1,4 @@
-use crate::backend::{Backend, BackendIndependentDocument};
+use crate::backend::{Backend, BackendIndependentDocument, SearchDocument};
 use anyhow::Context;
 use fancy_mdbx::database::WrappedTable;
 use fancy_mdbx::environment::Env;
@@ -6,7 +6,10 @@ use fancy_mdbx::wrapper::{CompressorWrapper, SerdeBareWrapper};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::path::Path;
+use tantivy::collector::TopDocs;
+use tantivy::query::QueryParser;
 use tantivy::schema::{Facet, Field, SchemaBuilder, STORED, TEXT};
+use tantivy::tokenizer::TokenizerManager;
 use tantivy::{doc, Index, IndexWriter};
 
 pub struct Fields {
@@ -26,7 +29,11 @@ pub struct TantivyBackend {
 }
 
 #[derive(Serialize, Deserialize, Debug, Clone)]
-pub struct DocumentStoreRow {}
+pub struct DocumentStoreRow {
+    title: String,
+    body: String,
+    nonbody: String,
+}
 
 pub struct StoreTables {
     pub documents: WrappedTable<String, CompressorWrapper<SerdeBareWrapper<DocumentStoreRow>>>,
@@ -63,7 +70,7 @@ impl TantivyBackend {
         } else {
             let mut schema_builder = SchemaBuilder::new();
             let fields = Fields {
-                title: schema_builder.add_text_field("title", TEXT | STORED),
+                title: schema_builder.add_text_field("title", TEXT),
                 article: schema_builder.add_text_field("article", TEXT),
                 nonarticle: schema_builder.add_text_field("nonarticle", TEXT),
                 url: schema_builder.add_text_field("url", STORED),
@@ -118,10 +125,10 @@ impl Backend for TantivyBackend {
         } = &self.fields;
 
         let mut tantivy_doc = doc! {
-            *title => document.title,
-            *article => document.article_body,
-            *nonarticle => document.nonarticle_body,
-            *url => document.url
+            *title => document.title.clone(),
+            *article => document.article_body.clone(),
+            *nonarticle => document.nonarticle_body.clone(),
+            *url => document.url.clone(),
         };
         // TODO do we actually want facets? How about u64 tags or something...?
         for tag in &document.tags {
@@ -129,6 +136,17 @@ impl Backend for TantivyBackend {
         }
 
         index_writer.add_document(tantivy_doc)?;
+        self.env.rw_txn(|txn| {
+            self.tables.documents.put(
+                txn,
+                &document.url,
+                &DocumentStoreRow {
+                    title: document.title,
+                    body: document.article_body,
+                    nonbody: document.nonarticle_body,
+                },
+            )
+        })?;
 
         Ok(())
     }
@@ -138,4 +156,55 @@ impl Backend for TantivyBackend {
         }
         Ok(())
     }
+
+    /// Parses `query` with Tantivy's query parser and returns up to 50 of the
+    /// top-scoring documents.
+    ///
+    /// Each hit's title is read from the document store, keyed by the URL stored in
+    /// the Tantivy index. Excerpts, highlights and tags are not populated yet.
+    fn query(&self, query: String) -> anyhow::Result<Vec<SearchDocument>> {
+        let reader = self.index.reader()?;
+        let parser = QueryParser::new(
+            self.index.schema(),
+            // Default fields: searched when a query term names no field explicitly.
+            vec![self.fields.title, self.fields.article, self.fields.nonarticle],
+            TokenizerManager::default(),
+        );
+
+        let query = parser.parse_query(&query)?;
+
+        // TODO tweak scores?
+        let txn = self.env.ro_txn()?;
+        let searcher = reader.searcher();
+        let results = searcher.search(&query, &TopDocs::with_limit(50))?;
+        let mut out = Vec::with_capacity(results.len());
+
+        for (score, doc_address) in results {
+            let doc = searcher.doc(doc_address)?;
+            let url = doc
+                .get_first(self.fields.url)
+                .context("No URL field")?
+                .as_text()
+                .context("URL not text")?;
+            let doc_row = self
+                .tables
+                .documents
+                .get(&txn, url.to_owned())?
+                .context("Document row not found in doc store")?;
+
+            out.push(SearchDocument {
+                score,
+                title: doc_row.title,
+                // TODO
+                excerpt: "".to_string(),
+                // TODO
+                excerpt_highlights: vec![],
+                // TODO
+                tags: vec![],
+                url: url.to_owned(),
+            })
+        }
+
+        Ok(out)
+    }
 }
diff --git a/quickpeep_indexer/src/bin/qp-index-search.rs b/quickpeep_indexer/src/bin/qp-index-search.rs
new file mode 100644
index 0000000..6d61dd4
--- /dev/null
+++ b/quickpeep_indexer/src/bin/qp-index-search.rs
@@ -0,0 +1,45 @@
+use anyhow::Context;
+use clap::Parser;
+use colour::{grey_ln, yellow_ln};
+use env_logger::Env;
+
+use quickpeep_indexer::config::IndexerConfig;
+
+use std::path::PathBuf;
+
+/// Searches a QuickPeep index and prints the matching documents
+#[derive(Clone, Debug, Parser)]
+pub struct Opts {
+    /// Path to the indexer configuration file (defaults to `qp_indexer.toml`).
+    #[clap(long = "config")]
+    config: Option<PathBuf>,
+
+    /// The search query to run against the index.
+    query: String,
+}
+
+/// Loads the indexer config, opens the index backend, runs the query and prints each hit.
+pub fn main() -> anyhow::Result<()> {
+    env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_index_search=debug"))
+        .init();
+
+    let opts: Opts = Opts::parse();
+
+    let config_path = opts
+        .config
+        .unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
+    let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
+
+    let indexer_backend = config.open_indexer_backend()?;
+
+    let results = indexer_backend.query(opts.query)?;
+
+    for result in results {
+        yellow_ln!("{}", result.title);
+        grey_ln!("\t[{:.3}] {}", result.score, result.url);
+        // TODO Excerpts
+        println!();
+    }
+
+    Ok(())
+}