diff --git a/quickpeep_index/src/backend/tantivy.rs b/quickpeep_index/src/backend/tantivy.rs
index d5370da..ffd1198 100644
--- a/quickpeep_index/src/backend/tantivy.rs
+++ b/quickpeep_index/src/backend/tantivy.rs
@@ -1,40 +1,136 @@
 use crate::backend::{Backend, BackendIndependentDocument};
+use anyhow::Context;
+use std::collections::HashMap;
 use std::path::Path;
-use tantivy::schema::{Schema, STORED, TEXT};
-use tantivy::{Index, IndexWriter};
+use tantivy::schema::{Facet, Field, SchemaBuilder, STORED, TEXT};
+use tantivy::{doc, Index, IndexWriter};
 
-fn experiment_tantivy() -> anyhow::Result<()> {
-    let mut schema_builder = Schema::builder();
-    // TODO what should our schema look like? Should we have another database with stuff?
-    // (notably we could Zstd-compress things in another datastore, for reduced disk usage...)
-    schema_builder.add_text_field("title", TEXT | STORED);
-    schema_builder.add_text_field("article", TEXT);
-    schema_builder.add_text_field("nonarticle", TEXT);
-    schema_builder.add_text_field("url", STORED);
-    schema_builder.add_facet_field("tags", ());
-    // schema_builder.add_bytes_field()
-    let schema = schema_builder.build();
-    let index = tantivy::Index::create_in_dir(Path::new("/tmp/tindex"), schema)?;
-    let _writer = index.writer(100 * 1024 * 1024)?;
-
-    Ok(())
+/// Heap budget, in bytes, requested for the index writer (100 MiB).
+const WRITER_HEAP_BYTES: usize = 100 * 1024 * 1024;
+
+/// Handles to the schema fields of the index, resolved once when it is opened.
+pub struct Fields {
+    /// Document title; indexed as text and stored.
+    title: Field,
+    /// Article body; indexed as text.
+    article: Field,
+    /// Non-article body; indexed as text.
+    nonarticle: Field,
+    /// Document URL; stored only.
+    url: Field,
+    /// Document tags, kept as facets.
+    tags: Field,
 }
 
+/// Index backend built on a Tantivy index stored on disk.
 pub struct TantivyBackend {
     index: Index,
-    index_writer: IndexWriter,
+    fields: Fields,
+    /// Created on the first call to `add_document`; `None` until then.
+    index_writer: Option<IndexWriter>,
+}
+
+impl TantivyBackend {
+    /// Opens the index kept in the `tantivy` subdirectory of `path`.
+    ///
+    /// If that subdirectory does not exist it is created, together with any
+    /// missing parent directories, and a new index is created inside it.
+    /// The index writer is not created here.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the directory cannot be created, if the index cannot be
+    /// opened or created, or if an existing index lacks an expected field.
+    pub fn open(path: &Path) -> anyhow::Result<TantivyBackend> {
+        let dir_path = path.join("tantivy");
+
+        let (index, fields) = if dir_path.exists() {
+            let index = Index::open_in_dir(&dir_path)
+                .with_context(|| format!("Failed to open index in {}", dir_path.display()))?;
+
+            let schema = index.schema();
+            let mut field_map: HashMap<_, _> = schema
+                .fields()
+                .map(|(field, field_entry)| (field_entry.name(), field))
+                .collect();
+
+            let fields = Fields {
+                title: field_map.remove("title").context("No title field")?,
+                article: field_map.remove("article").context("No article field")?,
+                nonarticle: field_map
+                    .remove("nonarticle")
+                    .context("No nonarticle field")?,
+                url: field_map.remove("url").context("No url field")?,
+                tags: field_map.remove("tags").context("No tags field")?,
+            };
+
+            (index, fields)
+        } else {
+            // `Index::create_in_dir` needs the directory to exist already.
+            std::fs::create_dir_all(&dir_path)
+                .with_context(|| format!("Failed to create {}", dir_path.display()))?;
+
+            let mut schema_builder = SchemaBuilder::new();
+            let fields = Fields {
+                title: schema_builder.add_text_field("title", TEXT | STORED),
+                article: schema_builder.add_text_field("article", TEXT),
+                nonarticle: schema_builder.add_text_field("nonarticle", TEXT),
+                url: schema_builder.add_text_field("url", STORED),
+                tags: schema_builder.add_facet_field("tags", ()),
+            };
+            let schema = schema_builder.build();
+
+            let index = Index::create_in_dir(&dir_path, schema)
+                .with_context(|| format!("Failed to create index in {}", dir_path.display()))?;
+
+            (index, fields)
+        };
+
+        Ok(TantivyBackend {
+            index,
+            fields,
+            index_writer: None,
+        })
+    }
 }
 
 impl Backend for TantivyBackend {
-    fn add_document(&mut self, _document: BackendIndependentDocument) -> anyhow::Result<()> {
-        // self.index_writer.add_document(doc! {
-        //     "title" => document.title,
-        //     "article" => document.article_body,
-        //     "nonarticle" => document.nonarticle_body,
-        //     "url" => document.url,
-        //     "tags" => document.tags
-        // })?;
-        todo!()
+    /// Adds `document` to the index.
+    ///
+    /// The index writer is created on the first call and reused afterwards.
+    /// Every tag is attached as the facet `/<tag>`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the writer cannot be created or if adding the document fails.
+    fn add_document(&mut self, document: BackendIndependentDocument) -> anyhow::Result<()> {
+        let index_writer: &IndexWriter = match self.index_writer.as_ref() {
+            Some(index_writer) => index_writer,
+            // `insert` returns a reference to the new writer, so no `unwrap` is needed.
+            None => &*self.index_writer.insert(self.index.writer(WRITER_HEAP_BYTES)?),
+        };
+
+        let Fields {
+            title,
+            article,
+            nonarticle,
+            url,
+            tags,
+        } = &self.fields;
+
+        let mut tantivy_doc = doc! {
+            *title => document.title,
+            *article => document.article_body,
+            *nonarticle => document.nonarticle_body,
+            *url => document.url
+        };
+        // TODO do we actually want facets? How about u64 tags or something...?
+        for tag in &document.tags {
+            tantivy_doc.add_facet(*tags, Facet::from(&format!("/{}", tag)));
+        }
+
+        index_writer.add_document(tantivy_doc)?;
+        Ok(())
     }
 
     fn flush(&mut self) -> anyhow::Result<()> {