Add a way to search for documents at the CLI (proof of concept)
commit 3a6f2fdf7a
parent c01740113f
First changed file (the backend module declaring pub mod meili, pub mod tantivy, and the Backend trait):

@@ -1,3 +1,5 @@
+use std::ops::Range;
+
 pub mod meili;
 pub mod tantivy;
 
@@ -7,6 +9,8 @@ pub trait Backend {
     fn add_document(&mut self, document: BackendIndependentDocument) -> anyhow::Result<()>;
 
     fn flush(&mut self) -> anyhow::Result<()>;
+
+    fn query(&self, query: String) -> anyhow::Result<Vec<SearchDocument>>;
 }
 
 /// A backend-independent document struct.
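
The trait now exposes querying alongside indexing. For orientation only (this is not part of the commit), a caller generic over Backend could drive the three methods as below; the helper name and batch shape are invented for illustration, and SearchDocument is the result struct added just after this hunk:

// Hypothetical helper, not in this commit: index a batch, flush, then query.
fn index_and_search<B: Backend>(
    backend: &mut B,
    documents: Vec<BackendIndependentDocument>,
    query: String,
) -> anyhow::Result<Vec<SearchDocument>> {
    for document in documents {
        backend.add_document(document)?;
    }
    // Persist what was just added before querying.
    backend.flush()?;
    backend.query(query)
}
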
@@ -18,3 +22,14 @@ pub struct BackendIndependentDocument {
     pub tags: Vec<String>,
     pub url: String,
 }
+
+/// A backend-independent document struct.
+#[derive(Clone, Debug)]
+pub struct SearchDocument {
+    pub score: f32,
+    pub title: String,
+    pub excerpt: String,
+    pub excerpt_highlights: Vec<Range<usize>>,
+    pub tags: Vec<String>,
+    pub url: String,
+}
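
SearchDocument carries highlight positions as byte ranges into excerpt rather than pre-rendered markup, so the consumer decides how to display them. A minimal sketch (not part of the commit) of rendering for a terminal, assuming the ranges are sorted, non-overlapping, and fall on character boundaries:

// Hypothetical renderer: wrap each highlighted range in ANSI bold.
fn render_excerpt(excerpt: &str, highlights: &[Range<usize>]) -> String {
    let mut out = String::new();
    let mut cursor = 0;
    for range in highlights {
        out.push_str(&excerpt[cursor..range.start]); // text before the highlight
        out.push_str("\x1b[1m"); // bold on
        out.push_str(&excerpt[range.clone()]);
        out.push_str("\x1b[0m"); // bold off
        cursor = range.end;
    }
    out.push_str(&excerpt[cursor..]); // trailing text
    out
}
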
Second changed file (the Tantivy backend, impl Backend for TantivyBackend):

@@ -1,4 +1,4 @@
-use crate::backend::{Backend, BackendIndependentDocument};
+use crate::backend::{Backend, BackendIndependentDocument, SearchDocument};
 use anyhow::Context;
 use fancy_mdbx::database::WrappedTable;
 use fancy_mdbx::environment::Env;
@@ -6,7 +9,10 @@ use fancy_mdbx::wrapper::{CompressorWrapper, SerdeBareWrapper};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::path::Path;
+use tantivy::collector::TopDocs;
+use tantivy::query::QueryParser;
 use tantivy::schema::{Facet, Field, SchemaBuilder, STORED, TEXT};
+use tantivy::tokenizer::TokenizerManager;
 use tantivy::{doc, Index, IndexWriter};
 
 pub struct Fields {
@@ -26,7 +29,11 @@ pub struct TantivyBackend {
 }
 
 #[derive(Serialize, Deserialize, Debug, Clone)]
-pub struct DocumentStoreRow {}
+pub struct DocumentStoreRow {
+    title: String,
+    body: String,
+    nonbody: String,
+}
 
 pub struct StoreTables {
     pub documents: WrappedTable<String, CompressorWrapper<SerdeBareWrapper<DocumentStoreRow>>>,
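
DocumentStoreRow now holds the document text itself, keyed by URL in the mdbx table, which is what the query path reads back (and what excerpt generation would eventually need). A rough round-trip sketch, not part of the commit, assuming it sits in this same module (the row fields are private) and reusing only the fancy_mdbx calls that already appear in this diff:

// Hypothetical round-trip: write one row keyed by URL, then read it back.
fn store_and_fetch(env: &Env, tables: &StoreTables, url: String) -> anyhow::Result<DocumentStoreRow> {
    let row = DocumentStoreRow {
        title: "Example title".to_owned(),
        body: "Example article body".to_owned(),
        nonbody: "Example non-article text".to_owned(),
    };
    // Same shape as the write in add_document below: put inside a read-write txn.
    env.rw_txn(|txn| tables.documents.put(txn, &url, &row))?;

    // Same shape as the read in query below: get inside a read-only txn.
    let txn = env.ro_txn()?;
    tables
        .documents
        .get(&txn, url)?
        .context("Document row not found in doc store")
}
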
@@ -63,7 +70,7 @@ impl TantivyBackend {
         } else {
             let mut schema_builder = SchemaBuilder::new();
             let fields = Fields {
-                title: schema_builder.add_text_field("title", TEXT | STORED),
+                title: schema_builder.add_text_field("title", TEXT),
                 article: schema_builder.add_text_field("article", TEXT),
                 nonarticle: schema_builder.add_text_field("nonarticle", TEXT),
                 url: schema_builder.add_text_field("url", STORED),
@@ -118,10 +125,10 @@ impl Backend for TantivyBackend {
         } = &self.fields;
 
         let mut tantivy_doc = doc! {
-            *title => document.title,
-            *article => document.article_body,
-            *nonarticle => document.nonarticle_body,
-            *url => document.url
+            *title => document.title.clone(),
+            *article => document.article_body.clone(),
+            *nonarticle => document.nonarticle_body.clone(),
+            *url => document.url.clone(),
         };
         // TODO do we actually want facets? How about u64 tags or something...?
         for tag in &document.tags {
@@ -129,6 +136,17 @@ impl Backend for TantivyBackend {
         }
 
         index_writer.add_document(tantivy_doc)?;
+        self.env.rw_txn(|txn| {
+            self.tables.documents.put(
+                txn,
+                &document.url,
+                &DocumentStoreRow {
+                    title: document.title,
+                    body: document.article_body,
+                    nonbody: document.nonarticle_body,
+                },
+            )
+        })?;
         Ok(())
     }
 
@@ -138,4 +156,49 @@ impl Backend for TantivyBackend {
         }
         Ok(())
     }
+
+    fn query(&self, query: String) -> anyhow::Result<Vec<SearchDocument>> {
+        let reader = self.index.reader()?;
+        let parser = QueryParser::new(
+            self.index.schema(),
+            vec![self.fields.title, self.fields.article, self.fields.article],
+            TokenizerManager::default(),
+        );
+
+        let query = parser.parse_query(&query)?;
+
+        // TODO tweak scores?
+        let txn = self.env.ro_txn()?;
+        let searcher = reader.searcher();
+        let results = searcher.search(&query, &TopDocs::with_limit(50))?;
+        let mut out = Vec::with_capacity(results.len());
+
+        for (score, doc_address) in results {
+            let doc = searcher.doc(doc_address)?;
+            let url = doc
+                .get_first(self.fields.url)
+                .context("No URL field")?
+                .as_text()
+                .context("URL not text")?;
+            let doc_row = self
+                .tables
+                .documents
+                .get(&txn, url.to_owned())?
+                .context("Document row not found in doc store")?;
+
+            out.push(SearchDocument {
+                score,
+                title: doc_row.title,
+                // TODO
+                excerpt: "".to_string(),
+                // TODO
+                excerpt_highlights: vec![],
+                // TODO
+                tags: vec![],
+                url: url.to_owned(),
+            })
+        }
+
+        Ok(out)
+    }
 }
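
The search path re-reads each hit's row from the doc store by URL, but excerpt, excerpt_highlights and tags are still stubbed out with TODOs. The raw text needed for excerpts already sits in doc_row.body, so one entirely hypothetical way to fill those fields later could look like the helper below (deliberately naive: case-sensitive, first match only); tantivy's own snippet generator might also fit here:

// Hypothetical excerpt builder, not part of this commit: take a window of text
// around the first occurrence of `term` and report the highlight as a byte range
// relative to the returned excerpt.
fn make_excerpt(body: &str, term: &str, context: usize) -> (String, Vec<std::ops::Range<usize>>) {
    match body.find(term) {
        Some(pos) => {
            // Snap the window edges to char boundaries so slicing cannot panic.
            let mut start = pos.saturating_sub(context);
            while !body.is_char_boundary(start) {
                start -= 1;
            }
            let mut end = (pos + term.len() + context).min(body.len());
            while !body.is_char_boundary(end) {
                end += 1;
            }
            let highlight = (pos - start)..(pos - start + term.len());
            (body[start..end].to_string(), vec![highlight])
        }
        // No match: fall back to the start of the body with no highlights.
        None => (body.chars().take(context).collect(), Vec::new()),
    }
}
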
quickpeep_indexer/src/bin/qp-index-search.rs (new file, 42 lines)
@@ -0,0 +1,42 @@
+use anyhow::Context;
+use clap::Parser;
+use colour::{grey_ln, yellow_ln};
+use env_logger::Env;
+
+use quickpeep_indexer::config::IndexerConfig;
+
+use std::path::PathBuf;
+
+/// Searches the index for documents matching a query (proof of concept)
+#[derive(Clone, Debug, Parser)]
+pub struct Opts {
+    #[clap(long = "config")]
+    config: Option<PathBuf>,
+
+    query: String,
+}
+
+pub fn main() -> anyhow::Result<()> {
+    env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_index_search=debug"))
+        .init();
+
+    let opts: Opts = Opts::parse();
+
+    let config_path = opts
+        .config
+        .unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
+    let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
+
+    let indexer_backend = config.open_indexer_backend()?;
+
+    let results = indexer_backend.query(opts.query)?;
+
+    for result in results {
+        yellow_ln!("{}", result.title);
+        grey_ln!("\t[{:.3}] {}", result.score, result.url);
+        // TODO Excerpts
+        println!();
+    }
+
+    Ok(())
+}
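
Usage note (an assumption, not documented in the commit): from a workspace checkout with a qp_indexer.toml in the working directory, or with an explicit --config path, this should be runnable as something like cargo run --bin qp-index-search -- "some search terms". Each hit prints its title in yellow and its score plus URL in grey; excerpts are still a TODO, hence the blank line between results.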