Add a way to search for documents at the CLI (proof of concept)
This commit is contained in:
parent
c01740113f
commit
3a6f2fdf7a
@ -1,3 +1,5 @@
|
||||
use std::ops::Range;
|
||||
|
||||
pub mod meili;
|
||||
pub mod tantivy;
|
||||
|
||||
@ -7,6 +9,8 @@ pub trait Backend {
|
||||
fn add_document(&mut self, document: BackendIndependentDocument) -> anyhow::Result<()>;
|
||||
|
||||
fn flush(&mut self) -> anyhow::Result<()>;
|
||||
|
||||
fn query(&self, query: String) -> anyhow::Result<Vec<SearchDocument>>;
|
||||
}
|
||||
|
||||
/// A backend-independent document struct.
|
||||
@ -18,3 +22,14 @@ pub struct BackendIndependentDocument {
|
||||
pub tags: Vec<String>,
|
||||
pub url: String,
|
||||
}
|
||||
|
||||
/// A single search result returned by `Backend::query`, independent of the backend that produced it.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SearchDocument {
|
||||
pub score: f32,
|
||||
pub title: String,
|
||||
pub excerpt: String,
|
||||
pub excerpt_highlights: Vec<Range<usize>>,
|
||||
pub tags: Vec<String>,
|
||||
pub url: String,
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
use crate::backend::{Backend, BackendIndependentDocument};
|
||||
use crate::backend::{Backend, BackendIndependentDocument, SearchDocument};
|
||||
use anyhow::Context;
|
||||
use fancy_mdbx::database::WrappedTable;
|
||||
use fancy_mdbx::environment::Env;
|
||||
@ -6,7 +6,10 @@ use fancy_mdbx::wrapper::{CompressorWrapper, SerdeBareWrapper};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Facet, Field, SchemaBuilder, STORED, TEXT};
|
||||
use tantivy::tokenizer::TokenizerManager;
|
||||
use tantivy::{doc, Index, IndexWriter};
|
||||
|
||||
pub struct Fields {
|
||||
@ -26,7 +29,11 @@ pub struct TantivyBackend {
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct DocumentStoreRow {}
|
||||
pub struct DocumentStoreRow {
|
||||
title: String,
|
||||
body: String,
|
||||
nonbody: String,
|
||||
}
|
||||
|
||||
pub struct StoreTables {
|
||||
pub documents: WrappedTable<String, CompressorWrapper<SerdeBareWrapper<DocumentStoreRow>>>,
|
||||
@ -63,7 +70,7 @@ impl TantivyBackend {
|
||||
} else {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let fields = Fields {
|
||||
title: schema_builder.add_text_field("title", TEXT | STORED),
|
||||
title: schema_builder.add_text_field("title", TEXT),
|
||||
article: schema_builder.add_text_field("article", TEXT),
|
||||
nonarticle: schema_builder.add_text_field("nonarticle", TEXT),
|
||||
url: schema_builder.add_text_field("url", STORED),
|
||||
@ -118,10 +125,10 @@ impl Backend for TantivyBackend {
|
||||
} = &self.fields;
|
||||
|
||||
let mut tantivy_doc = doc! {
|
||||
*title => document.title,
|
||||
*article => document.article_body,
|
||||
*nonarticle => document.nonarticle_body,
|
||||
*url => document.url
|
||||
*title => document.title.clone(),
|
||||
*article => document.article_body.clone(),
|
||||
*nonarticle => document.nonarticle_body.clone(),
|
||||
*url => document.url.clone(),
|
||||
};
|
||||
// TODO do we actually want facets? How about u64 tags or something...?
|
||||
for tag in &document.tags {
|
||||
@ -129,6 +136,17 @@ impl Backend for TantivyBackend {
|
||||
}
|
||||
|
||||
index_writer.add_document(tantivy_doc)?;
|
||||
self.env.rw_txn(|txn| {
|
||||
self.tables.documents.put(
|
||||
txn,
|
||||
&document.url,
|
||||
&DocumentStoreRow {
|
||||
title: document.title,
|
||||
body: document.article_body,
|
||||
nonbody: document.nonarticle_body,
|
||||
},
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -138,4 +156,49 @@ impl Backend for TantivyBackend {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn query(&self, query: String) -> anyhow::Result<Vec<SearchDocument>> {
|
||||
let reader = self.index.reader()?;
|
||||
let parser = QueryParser::new(
|
||||
self.index.schema(),
|
||||
vec![self.fields.title, self.fields.article, self.fields.article],
|
||||
TokenizerManager::default(),
|
||||
);
|
||||
|
||||
let query = parser.parse_query(&query)?;
|
||||
|
||||
// TODO tweak scores?
|
||||
let txn = self.env.ro_txn()?;
|
||||
let searcher = reader.searcher();
|
||||
let results = searcher.search(&query, &TopDocs::with_limit(50))?;
|
||||
let mut out = Vec::with_capacity(results.len());
|
||||
|
||||
for (score, doc_address) in results {
|
||||
let doc = searcher.doc(doc_address)?;
|
||||
let url = doc
|
||||
.get_first(self.fields.url)
|
||||
.context("No URL field")?
|
||||
.as_text()
|
||||
.context("URL not text")?;
|
||||
let doc_row = self
|
||||
.tables
|
||||
.documents
|
||||
.get(&txn, url.to_owned())?
|
||||
.context("Document row not found in doc store")?;
|
||||
|
||||
out.push(SearchDocument {
|
||||
score,
|
||||
title: doc_row.title,
|
||||
// TODO
|
||||
excerpt: "".to_string(),
|
||||
// TODO
|
||||
excerpt_highlights: vec![],
|
||||
// TODO
|
||||
tags: vec![],
|
||||
url: url.to_owned(),
|
||||
})
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
42
quickpeep_indexer/src/bin/qp-index-search.rs
Normal file
42
quickpeep_indexer/src/bin/qp-index-search.rs
Normal file
@ -0,0 +1,42 @@
|
||||
use anyhow::Context;
|
||||
use clap::Parser;
|
||||
use colour::{grey_ln, yellow_ln};
|
||||
use env_logger::Env;
|
||||
|
||||
use quickpeep_indexer::config::IndexerConfig;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Searches the index for documents matching a query and prints the results
|
||||
#[derive(Clone, Debug, Parser)]
|
||||
pub struct Opts {
|
||||
#[clap(long = "config")]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
query: String,
|
||||
}
|
||||
|
||||
pub fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_index_search=debug"))
|
||||
.init();
|
||||
|
||||
let opts: Opts = Opts::parse();
|
||||
|
||||
let config_path = opts
|
||||
.config
|
||||
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
|
||||
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
|
||||
|
||||
let indexer_backend = config.open_indexer_backend()?;
|
||||
|
||||
let results = indexer_backend.query(opts.query)?;
|
||||
|
||||
for result in results {
|
||||
yellow_ln!("{}", result.title);
|
||||
grey_ln!("\t[{:.3}] {}", result.score, result.url);
|
||||
// TODO Excerpts
|
||||
println!();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
Loading…
Reference in New Issue
Block a user