Add an open function for the Tantivy Backend

This commit is contained in:
Olivier 'reivilibre' 2022-03-24 22:55:21 +00:00
parent 73154e7e34
commit f43424de94
1 changed file with 94 additions and 27 deletions

View File

@ -1,40 +1,107 @@
use crate::backend::{Backend, BackendIndependentDocument};
use anyhow::Context;
use std::collections::HashMap;
use std::path::Path;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{Index, IndexWriter};
use tantivy::schema::{Facet, Field, SchemaBuilder, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter};
fn experiment_tantivy() -> anyhow::Result<()> {
let mut schema_builder = Schema::builder();
// TODO what should our schema look like? Should we have another database with stuff?
// (notably we could Zstd-compress things in another datastore, for reduced disk usage...)
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("article", TEXT);
schema_builder.add_text_field("nonarticle", TEXT);
schema_builder.add_text_field("url", STORED);
schema_builder.add_facet_field("tags", ());
// schema_builder.add_bytes_field()
let schema = schema_builder.build();
let index = tantivy::Index::create_in_dir(Path::new("/tmp/tindex"), schema)?;
let _writer = index.writer(100 * 1024 * 1024)?;
Ok(())
/// Handles to the schema fields this backend reads and writes.
///
/// `TantivyBackend::open` fills this in, either directly from the schema
/// builder (new index) or by looking each field up by name in the schema of
/// an existing index.
pub struct Fields {
/// Field named `title`; declared as `TEXT | STORED` when the index is created.
title: Field,
/// Field named `article`; declared as `TEXT` when the index is created.
article: Field,
/// Field named `nonarticle`; declared as `TEXT` when the index is created.
nonarticle: Field,
/// Field named `url`; declared as `STORED` when the index is created.
url: Field,
/// Facet field named `tags`; receives one facet per document tag.
tags: Field,
}
/// A `Backend` implementation that stores documents in a Tantivy index.
pub struct TantivyBackend {
/// The underlying Tantivy index, opened or created by `open`.
index: Index,
// NOTE(review): this declaration duplicates the `index_writer` field below
// and looks like the pre-change line of the diff — confirm and drop one.
index_writer: IndexWriter,
/// Handles to the schema fields used when building documents.
fields: Fields,
/// Writer for `index`. `open` leaves this as `None`; `add_document`
/// creates it on first use.
index_writer: Option<IndexWriter>,
}
impl TantivyBackend {
    /// Opens the Tantivy-backed index stored beneath `path`, creating it on
    /// first use.
    ///
    /// The index lives in the `tantivy` subdirectory of `path`.
    ///
    /// * If that subdirectory already exists, the index inside it is opened
    ///   and the five expected fields (`title`, `article`, `nonarticle`,
    ///   `url`, `tags`) are looked up by name in its schema.
    /// * Otherwise the directory (and any missing parents, including `path`
    ///   itself) is created, and a fresh index with that schema is created
    ///   inside it.
    ///
    /// No index writer is created here; `index_writer` starts out as `None`.
    ///
    /// # Errors
    ///
    /// Returns an error if the directory cannot be created, if Tantivy fails
    /// to open or create the index, or if an existing index is missing one of
    /// the expected fields (`"No <name> field"`).
    pub fn open(path: &Path) -> anyhow::Result<TantivyBackend> {
        let dir_path = path.join("tantivy");

        let (index, fields) = if dir_path.exists() {
            let index = Index::open_in_dir(&dir_path)?;
            let schema = index.schema();

            // Index the schema's fields by name so each one can be resolved
            // with a single lookup.
            let mut field_map: HashMap<_, _> = schema
                .fields()
                .map(|(field, field_entry)| (field_entry.name(), field))
                .collect();
            let mut take_field = |name: &str| {
                field_map
                    .remove(name)
                    .with_context(|| format!("No {} field", name))
            };

            let fields = Fields {
                title: take_field("title")?,
                article: take_field("article")?,
                nonarticle: take_field("nonarticle")?,
                url: take_field("url")?,
                tags: take_field("tags")?,
            };
            (index, fields)
        } else {
            // `Index::create_in_dir` expects the target directory to exist
            // already, so create it (together with `path` and any missing
            // parents) before building the index.
            std::fs::create_dir_all(&dir_path)?;

            let mut schema_builder = SchemaBuilder::new();
            let fields = Fields {
                title: schema_builder.add_text_field("title", TEXT | STORED),
                article: schema_builder.add_text_field("article", TEXT),
                nonarticle: schema_builder.add_text_field("nonarticle", TEXT),
                url: schema_builder.add_text_field("url", STORED),
                tags: schema_builder.add_facet_field("tags", ()),
            };
            let schema = schema_builder.build();
            let index = Index::create_in_dir(&dir_path, schema)?;
            (index, fields)
        };

        Ok(TantivyBackend {
            index,
            fields,
            index_writer: None,
        })
    }
}
impl Backend for TantivyBackend {
fn add_document(&mut self, _document: BackendIndependentDocument) -> anyhow::Result<()> {
// self.index_writer.add_document(doc! {
// "title" => document.title,
// "article" => document.article_body,
// "nonarticle" => document.nonarticle_body,
// "url" => document.url,
// "tags" => document.tags
// })?;
todo!()
/// Adds `document` to the index through the backend's index writer.
///
/// The writer is created on the first call (with a `100 * 1024 * 1024`
/// argument to `Index::writer`) and reused afterwards. Title, article body,
/// non-article body and URL are written to their respective fields, and
/// every tag is attached as a facet of the form `/<tag>`.
///
/// # Errors
///
/// Returns an error if the writer cannot be created or if Tantivy rejects
/// the document.
fn add_document(&mut self, document: BackendIndependentDocument) -> anyhow::Result<()> {
    // Create the writer lazily: `open` leaves it unset.
    if self.index_writer.is_none() {
        let new_writer = self.index.writer(100 * 1024 * 1024)?;
        self.index_writer = Some(new_writer);
    }
    let writer = self.index_writer.as_ref().unwrap();

    let fields = &self.fields;
    let mut tantivy_doc = doc! {
        fields.title => document.title,
        fields.article => document.article_body,
        fields.nonarticle => document.nonarticle_body,
        fields.url => document.url
    };

    // TODO do we actually want facets? How about u64 tags or something...?
    for tag in &document.tags {
        let facet_path = format!("/{}", tag);
        tantivy_doc.add_facet(fields.tags, Facet::from(&facet_path));
    }

    writer.add_document(tantivy_doc)?;
    Ok(())
}
fn flush(&mut self) -> anyhow::Result<()> {