Add facility to qp-indexer that lets it download rakepacks from a feed
ci/woodpecker/push/check Pipeline was successful Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

This commit is contained in:
Olivier 'reivilibre' 2022-11-26 20:48:06 +00:00
parent bd16f58d9e
commit 54a468d079
4 changed files with 163 additions and 37 deletions

2
Cargo.lock generated
View File

@ -3766,11 +3766,13 @@ dependencies = [
"quickpeep_seed_parser", "quickpeep_seed_parser",
"quickpeep_structs", "quickpeep_structs",
"quickpeep_utils", "quickpeep_utils",
"reqwest",
"ron", "ron",
"serde", "serde",
"serde_bare", "serde_bare",
"serde_json", "serde_json",
"smartstring", "smartstring",
"tempfile",
"tokio", "tokio",
"url", "url",
"zstd", "zstd",

View File

@ -30,6 +30,11 @@ patricia_tree = "0.3.1"
# For decompression of emitted packs. 0.11.1+zstd.1.5.2 # For decompression of emitted packs. 0.11.1+zstd.1.5.2
zstd = "0.11.1" zstd = "0.11.1"
# HTTP Requests
reqwest = { version = "0.11.9", features = ["blocking"] }
tempfile = "3.3.0"
quickpeep_densedoc = { path = "../quickpeep_densedoc" } quickpeep_densedoc = { path = "../quickpeep_densedoc" }
quickpeep_index = { path = "../quickpeep_index" } quickpeep_index = { path = "../quickpeep_index" }
quickpeep_structs = { path = "../quickpeep_structs" } quickpeep_structs = { path = "../quickpeep_structs" }

View File

@ -2,9 +2,9 @@ use anyhow::{bail, Context};
use clap::Parser; use clap::Parser;
use colour::{blue, yellow_ln}; use colour::{blue, yellow_ln};
use env_logger::Env; use env_logger::Env;
use std::collections::HashMap; use std::collections::{BTreeSet, HashMap};
use std::fs::File; use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader}; use std::io::{BufRead, BufReader, Write};
use patricia_tree::PatriciaMap; use patricia_tree::PatriciaMap;
use quickpeep_densedoc::DenseTree; use quickpeep_densedoc::DenseTree;
@ -20,7 +20,8 @@ use quickpeep_structs::rake_entries::{
}; };
use quickpeep_utils::urls::get_reduced_domain; use quickpeep_utils::urls::get_reduced_domain;
use smartstring::alias::CompactString; use smartstring::alias::CompactString;
use std::path::PathBuf; use std::path::{Path, PathBuf};
use tempfile::NamedTempFile;
use tokio::sync::mpsc::Receiver; use tokio::sync::mpsc::Receiver;
use url::Url; use url::Url;
@ -30,6 +31,11 @@ pub struct Opts {
#[clap(long = "config")] #[clap(long = "config")]
config: Option<PathBuf>, config: Option<PathBuf>,
/// If specified, rakepacks from a feed will automatically be fetched and indexed.
/// The rakepacks are tracked as having been processed.
#[clap(long = "feed")]
feed: Option<Url>,
rakepacks: Vec<PathBuf>, rakepacks: Vec<PathBuf>,
} }
@ -62,7 +68,110 @@ pub async fn main() -> anyhow::Result<()> {
let mut indexer_backend = config.open_indexer_backend()?; let mut indexer_backend = config.open_indexer_backend()?;
if let Some(feed) = opts.feed {
let processed_rakepack_path = config
.processed_rakepack_path()
.context("can't get a suitable location to track processed rakepacks")?;
handle_pack_feed(
feed,
&mut indexer_backend,
processed_rakepack_path,
&seed_lookup,
&icon_store,
)
.context("failed to handle pack feed")?;
}
for pack in opts.rakepacks { for pack in opts.rakepacks {
handle_pack(&pack, &mut indexer_backend, &seed_lookup, &icon_store)
.with_context(|| format!("Whilst handling pack: {pack:?}"))?;
}
indexer_backend.flush()?;
Ok(())
}
/// Fetches every not-yet-processed rakepack listed in a feed and indexes it.
///
/// The feed at `feed_url` is scanned for pack names that are not yet present in the processed
/// list at `processed_list_path`. Each new name is resolved relative to `feed_url`, downloaded
/// to a temporary file, indexed with `handle_pack` and then appended to the processed list.
///
/// # Errors
///
/// Returns an error if the new packs can't be determined, if the processed list can't be opened
/// or updated, or if a pack can't be downloaded or indexed. A pack is recorded in the processed
/// list only after it has been indexed, and each record is written before the next pack is
/// started.
///
/// NOTE(review): downloads use `reqwest::blocking`, and the visible caller is an `async fn` —
/// confirm that blocking there is acceptable.
pub fn handle_pack_feed(
    feed_url: Url,
    indexer_backend: &mut Box<dyn Backend>,
    processed_list_path: PathBuf,
    seed_lookup: &SeedLookupTable,
    icon_store: &IconStore,
) -> anyhow::Result<()> {
    blue!("Scanning feed: ");
    yellow_ln!("{:?}", feed_url);

    let new_packs =
        find_new_packs(feed_url.clone(), &processed_list_path).context("finding new packs")?;

    let mut processed_log = OpenOptions::new()
        .append(true)
        .create(true)
        .open(&processed_list_path)
        .context("can't open processed list for append")?;

    for pack_name in new_packs {
        let pack_url = feed_url
            .join(&pack_name)
            .context("Can't resolve URL of new pack")?;

        blue!("Downloading: ");
        yellow_ln!("{:?}", pack_url);

        // The temp file is removed when `temp_file` is dropped at the end of this iteration.
        let mut temp_file = NamedTempFile::new().context("opening temp file")?;
        reqwest::blocking::get(pack_url.clone())
            .context("failed to request pack")?
            .error_for_status()
            .context("pack request returned an error status")?
            .copy_to(temp_file.as_file_mut())
            .context("failed to download pack to temp file")?;

        handle_pack(temp_file.path(), indexer_backend, seed_lookup, icon_store).with_context(
            || {
                format!(
                    "Whilst handling pack: {:?} ({:?})",
                    temp_file.path(),
                    pack_url
                )
            },
        )?;

        // `write_all` rather than `write`: a plain `write` may store only part of the buffer,
        // which would leave a truncated pack name in the list.
        // The leading newline (rather than a trailing one) is kept so that entries appended to
        // an existing list never run into its last line.
        processed_log
            .write_all(format!("\n{}", &pack_name).as_bytes())
            .context("failed to record pack in processed list")?;
        processed_log
            .flush()
            .context("failed to flush processed list")?;
    }

    Ok(())
}
fn find_new_packs(feed_url: Url, processed_list_path: &Path) -> anyhow::Result<BTreeSet<String>> {
let processed_file = OpenOptions::new()
.read(true)
.create(true)
.open(processed_list_path)?;
let br = BufReader::new(processed_file);
let processed: Result<BTreeSet<String>, _> = br.lines().collect();
let processed = processed.context("failed to read local processed list")?;
let mut unprocessed: BTreeSet<String> = BTreeSet::new();
let feed_lines = BufReader::new(reqwest::blocking::get(feed_url)?.error_for_status()?).lines();
for line in feed_lines {
let line = line?;
if line.is_empty() {
continue;
}
if processed.contains(&line) {
continue;
}
unprocessed.insert(line.to_owned());
}
Ok(unprocessed)
}
pub fn handle_pack(
pack: &Path,
indexer_backend: &mut Box<dyn Backend>,
seed_lookup: &SeedLookupTable,
icon_store: &IconStore,
) -> anyhow::Result<()> {
blue!("Indexing: "); blue!("Indexing: ");
yellow_ln!("{:?}", pack); yellow_ln!("{:?}", pack);
@ -77,7 +186,7 @@ pub async fn main() -> anyhow::Result<()> {
SCHEMA_RAKED_PAGES => { SCHEMA_RAKED_PAGES => {
// TODO(unstable): this condition is `.has_data_left()` but it's unstable. // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
while buf_reader.fill_buf().map(|b| !b.is_empty())? { while buf_reader.fill_buf().map(|b| !b.is_empty())? {
handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend) handle_page_pack(&mut buf_reader, &seed_lookup, indexer_backend)
.context("failed to handle page pack")?; .context("failed to handle page pack")?;
} }
} }
@ -96,9 +205,6 @@ pub async fn main() -> anyhow::Result<()> {
); );
} }
} }
}
indexer_backend.flush()?;
Ok(()) Ok(())
} }

View File

@ -55,4 +55,17 @@ impl IndexerConfig {
} }
} }
} }
/// Returns the path to a text file which can be used for storing a list of processed rakepacks
/// (needed for following rakepack streams over a network).
///
/// For the Tantivy backend this is `processed_rakepacks.lst` inside the index directory.
///
/// # Errors
///
/// Returns an error for the Meili backend, for which no location has been defined yet.
pub fn processed_rakepack_path(&self) -> anyhow::Result<PathBuf> {
    match &self.index.backend {
        BackendConfig::Tantivy(tantivy) => Ok(tantivy.index_dir.join("processed_rakepacks.lst")),
        // Report this as an error instead of panicking: the function already returns a
        // `Result`, so the caller can handle it.
        BackendConfig::Meili(_) => Err(anyhow::anyhow!(
            "tracking processed rakepacks is not yet supported for the Meili backend"
        )),
    }
}
} }