Add a way to search for documents at the CLI (proof of concept)

This commit is contained in:
Olivier 'reivilibre' 2022-03-25 19:28:57 +00:00
parent c01740113f
commit 3a6f2fdf7a
3 changed files with 127 additions and 7 deletions

View File

@ -1,3 +1,5 @@
use std::ops::Range;
pub mod meili;
pub mod tantivy;
@ -7,6 +9,8 @@ pub trait Backend {
/// Adds a single document to the backend, consuming it.
///
/// Returns an error if the backend fails to accept the document.
/// NOTE(review): whether the document is durably stored before `flush` is
/// called is backend-specific — confirm against each implementation.
fn add_document(&mut self, document: BackendIndependentDocument) -> anyhow::Result<()>;
/// Flushes the backend.
///
/// NOTE(review): presumably this commits any writes buffered by
/// `add_document`; the implementations are not fully visible here — confirm.
fn flush(&mut self) -> anyhow::Result<()>;
/// Runs a search for `query` and returns the matching documents as
/// backend-independent [`SearchDocument`]s.
///
/// Returns an error if the backend cannot execute the query.
fn query(&self, query: String) -> anyhow::Result<Vec<SearchDocument>>;
}
/// A backend-independent document struct.
@ -18,3 +22,14 @@ pub struct BackendIndependentDocument {
pub tags: Vec<String>,
pub url: String,
}
/// A backend-independent search result, as returned by `Backend::query`.
///
/// Unlike `BackendIndependentDocument` (which is what gets indexed), this
/// carries the relevance score and the data needed to present one hit.
#[derive(Clone, Debug)]
pub struct SearchDocument {
/// Relevance score assigned to this hit by the search backend.
pub score: f32,
/// Title of the matched document.
pub title: String,
/// Excerpt of the document to show alongside the result.
pub excerpt: String,
/// Ranges within `excerpt` that should be highlighted.
/// NOTE(review): presumably byte offsets into `excerpt` — confirm once excerpts are implemented.
pub excerpt_highlights: Vec<Range<usize>>,
/// Tags associated with the matched document.
pub tags: Vec<String>,
/// URL of the matched document.
pub url: String,
}

View File

@ -1,4 +1,4 @@
use crate::backend::{Backend, BackendIndependentDocument};
use crate::backend::{Backend, BackendIndependentDocument, SearchDocument};
use anyhow::Context;
use fancy_mdbx::database::WrappedTable;
use fancy_mdbx::environment::Env;
@ -6,7 +6,10 @@ use fancy_mdbx::wrapper::{CompressorWrapper, SerdeBareWrapper};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Facet, Field, SchemaBuilder, STORED, TEXT};
use tantivy::tokenizer::TokenizerManager;
use tantivy::{doc, Index, IndexWriter};
pub struct Fields {
@ -26,7 +29,11 @@ pub struct TantivyBackend {
}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct DocumentStoreRow {}
/// Row kept in the `documents` table of the document store, keyed by the
/// document's URL. It holds the text that the Tantivy index itself does not
/// hand back, so that search results can be presented.
pub struct DocumentStoreRow {
/// Document title; written from the document's `title` on add and read back
/// to fill in the title of a search result.
title: String,
/// Article body text; written from the document's `article_body` on add.
body: String,
/// Non-article body text; written from the document's `nonarticle_body` on add.
nonbody: String,
}
pub struct StoreTables {
pub documents: WrappedTable<String, CompressorWrapper<SerdeBareWrapper<DocumentStoreRow>>>,
@ -63,7 +70,7 @@ impl TantivyBackend {
} else {
let mut schema_builder = SchemaBuilder::new();
let fields = Fields {
title: schema_builder.add_text_field("title", TEXT | STORED),
title: schema_builder.add_text_field("title", TEXT),
article: schema_builder.add_text_field("article", TEXT),
nonarticle: schema_builder.add_text_field("nonarticle", TEXT),
url: schema_builder.add_text_field("url", STORED),
@ -118,10 +125,10 @@ impl Backend for TantivyBackend {
} = &self.fields;
let mut tantivy_doc = doc! {
*title => document.title,
*article => document.article_body,
*nonarticle => document.nonarticle_body,
*url => document.url
*title => document.title.clone(),
*article => document.article_body.clone(),
*nonarticle => document.nonarticle_body.clone(),
*url => document.url.clone(),
};
// TODO do we actually want facets? How about u64 tags or something...?
for tag in &document.tags {
@ -129,6 +136,17 @@ impl Backend for TantivyBackend {
}
index_writer.add_document(tantivy_doc)?;
self.env.rw_txn(|txn| {
self.tables.documents.put(
txn,
&document.url,
&DocumentStoreRow {
title: document.title,
body: document.article_body,
nonbody: document.nonarticle_body,
},
)
})?;
Ok(())
}
@ -138,4 +156,49 @@ impl Backend for TantivyBackend {
}
Ok(())
}
/// Searches the Tantivy index for `query` and returns up to 50 hits.
///
/// The query string is parsed with Tantivy's `QueryParser`; terms without an
/// explicit field are matched against the `title`, `article` and `nonarticle`
/// fields. For every hit, the stored URL is read from the index and used as
/// the key to look up the document's row in the document store, which
/// supplies the title.
///
/// Excerpts, highlights and tags are not populated yet (see the TODOs).
///
/// # Errors
///
/// Returns an error if the index reader cannot be opened, the query fails to
/// parse, the search itself fails, a hit has no textual `url` field, or a
/// hit's URL has no corresponding row in the document store.
fn query(&self, query: String) -> anyhow::Result<Vec<SearchDocument>> {
    let reader = self.index.reader()?;

    // Default fields searched when the query does not name a field explicitly.
    // `nonarticle` must be listed here: previously `article` was listed twice,
    // which left the non-article text unsearchable.
    let parser = QueryParser::new(
        self.index.schema(),
        vec![
            self.fields.title,
            self.fields.article,
            self.fields.nonarticle,
        ],
        TokenizerManager::default(),
    );

    let query = parser.parse_query(&query)?;

    // TODO tweak scores?

    // One transaction is shared by all document-store lookups below.
    let txn = self.env.ro_txn()?;

    let searcher = reader.searcher();
    let results = searcher.search(&query, &TopDocs::with_limit(50))?;

    let mut out = Vec::with_capacity(results.len());
    for (score, doc_address) in results {
        let doc = searcher.doc(doc_address)?;

        // The URL is the only field read back from the index; it doubles as
        // the key into the document store.
        let url = doc
            .get_first(self.fields.url)
            .context("No URL field")?
            .as_text()
            .context("URL not text")?;

        let doc_row = self
            .tables
            .documents
            .get(&txn, url.to_owned())?
            .context("Document row not found in doc store")?;

        out.push(SearchDocument {
            score,
            title: doc_row.title,
            // TODO
            excerpt: "".to_string(),
            // TODO
            excerpt_highlights: vec![],
            // TODO
            tags: vec![],
            url: url.to_owned(),
        })
    }

    Ok(out)
}
}

View File

@ -0,0 +1,42 @@
use anyhow::Context;
use clap::Parser;
use colour::{grey_ln, yellow_ln};
use env_logger::Env;
use quickpeep_indexer::config::IndexerConfig;
use std::path::PathBuf;
// clap's derive turns the doc comments below into the `--help` text, so they
// must describe this tool and its arguments accurately.
/// Searches the index for documents matching a query (proof of concept)
#[derive(Clone, Debug, Parser)]
pub struct Opts {
    /// Path to the indexer config file; defaults to qp_indexer.toml
    #[clap(long = "config")]
    config: Option<PathBuf>,

    /// The search query to run against the index
    query: String,
}
pub fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_index_search=debug"))
.init();
let opts: Opts = Opts::parse();
let config_path = opts
.config
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
let indexer_backend = config.open_indexer_backend()?;
let results = indexer_backend.query(opts.query)?;
for result in results {
yellow_ln!("{}", result.title);
grey_ln!("\t[{:.3}] {}", result.score, result.url);
// TODO Excerpts
println!();
}
Ok(())
}