Add facility to qp-indexer that lets it download rakepacks from a feed
parent bd16f58d9e
commit 54a468d079
@@ -3766,11 +3766,13 @@ dependencies = [
  "quickpeep_seed_parser",
  "quickpeep_structs",
  "quickpeep_utils",
+ "reqwest",
  "ron",
  "serde",
  "serde_bare",
  "serde_json",
  "smartstring",
+ "tempfile",
  "tokio",
  "url",
  "zstd",
@@ -30,6 +30,11 @@ patricia_tree = "0.3.1"
 # For decompression of emitted packs. 0.11.1+zstd.1.5.2
 zstd = "0.11.1"
 
+# HTTP Requests
+reqwest = { version = "0.11.9", features = ["blocking"] }
+
+tempfile = "3.3.0"
+
 quickpeep_densedoc = { path = "../quickpeep_densedoc" }
 quickpeep_index = { path = "../quickpeep_index" }
 quickpeep_structs = { path = "../quickpeep_structs" }
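The two new dependencies work together further down: each pack is downloaded with reqwest's blocking client into a named temporary file, so the existing path-based indexing code can be reused unchanged. A minimal standalone sketch of why NamedTempFile fits here (the contents and message are made up, not from the commit):

use std::io::Write;
use tempfile::NamedTempFile;

fn main() -> std::io::Result<()> {
    // NamedTempFile yields a real filesystem path, so path-based code
    // (like handle_pack below) can consume the download unchanged.
    let mut temp_file = NamedTempFile::new()?;
    temp_file.write_all(b"downloaded pack bytes")?;
    println!("pack staged at {:?}", temp_file.path());
    // The file is deleted automatically when `temp_file` is dropped.
    Ok(())
}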
@@ -2,9 +2,9 @@ use anyhow::{bail, Context};
 use clap::Parser;
 use colour::{blue, yellow_ln};
 use env_logger::Env;
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::{BufRead, BufReader};
+use std::collections::{BTreeSet, HashMap};
+use std::fs::{File, OpenOptions};
+use std::io::{BufRead, BufReader, Write};
 
 use patricia_tree::PatriciaMap;
 use quickpeep_densedoc::DenseTree;
@@ -20,7 +20,8 @@ use quickpeep_structs::rake_entries::{
 };
 use quickpeep_utils::urls::get_reduced_domain;
 use smartstring::alias::CompactString;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
+use tempfile::NamedTempFile;
 use tokio::sync::mpsc::Receiver;
 use url::Url;
 
@@ -30,6 +31,11 @@ pub struct Opts {
     #[clap(long = "config")]
     config: Option<PathBuf>,
 
+    /// If specified, rakepacks from a feed will automatically be fetched and indexed.
+    /// The rakepacks are tracked as having been processed.
+    #[clap(long = "feed")]
+    feed: Option<Url>,
+
     rakepacks: Vec<PathBuf>,
 }
 
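For context, a self-contained sketch of the clap pattern the new field relies on: an optional `--feed` flag parsed straight into `Option<Url>` through `Url`'s `FromStr` impl, so a malformed URL is rejected at argument-parsing time. The struct and binary names here are hypothetical; only the attribute style matches the commit:

use clap::Parser;
use url::Url;

#[derive(Parser)]
struct DemoOpts {
    /// e.g. `demo --feed https://example.org/rakepacks/feed.txt`
    #[clap(long = "feed")]
    feed: Option<Url>,
}

fn main() {
    let opts = DemoOpts::parse();
    if let Some(feed) = opts.feed {
        println!("would follow feed: {feed}");
    }
}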
@@ -62,46 +68,146 @@ pub async fn main() -> anyhow::Result<()> {
     let mut indexer_backend = config.open_indexer_backend()?;
 
+    if let Some(feed) = opts.feed {
+        let processed_rakepack_path = config
+            .processed_rakepack_path()
+            .context("can't get a suitable location to track processed rakepacks")?;
+        handle_pack_feed(
+            feed,
+            &mut indexer_backend,
+            processed_rakepack_path,
+            &seed_lookup,
+            &icon_store,
+        )
+        .context("failed to handle pack feed")?;
+    }
+
     for pack in opts.rakepacks {
-        blue!("Indexing: ");
-        yellow_ln!("{:?}", pack);
-
-        let file = File::open(&pack)?;
-        let decompressor = zstd::stream::Decoder::new(file)?;
-        // TODO the decompressor has a buffer already, but we need this to see the end
-        let mut buf_reader = BufReader::new(decompressor);
-        let schema: String =
-            serde_bare::from_reader(&mut buf_reader).context("failed to read schema ver")?;
-
-        match schema.as_ref() {
-            SCHEMA_RAKED_PAGES => {
-                // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
-                while buf_reader.fill_buf().map(|b| !b.is_empty())? {
-                    handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend)
-                        .context("failed to handle page pack")?;
-                }
-            }
-            SCHEMA_RAKED_ICONS => {
-                // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
-                while buf_reader.fill_buf().map(|b| !b.is_empty())? {
-                    handle_icon_pack(&mut buf_reader, &icon_store)
-                        .context("failed to handle icon pack")?;
-                }
-            }
-            _ => {
-                bail!(
-                    "Wrong schema version: wanted e.g. {:?}, got {:?}",
-                    SCHEMA_RAKED_PAGES,
-                    &schema
-                );
-            }
-        }
+        handle_pack(&pack, &mut indexer_backend, &seed_lookup, &icon_store)
+            .with_context(|| format!("Whilst handling pack: {pack:?}"))?;
     }
     indexer_backend.flush()?;
 
     Ok(())
 }
 
+pub fn handle_pack_feed(
+    feed_url: Url,
+    indexer_backend: &mut Box<dyn Backend>,
+    processed_list_path: PathBuf,
+    seed_lookup: &SeedLookupTable,
+    icon_store: &IconStore,
+) -> anyhow::Result<()> {
+    blue!("Scanning feed: ");
+    yellow_ln!("{:?}", feed_url);
+
+    let new_packs =
+        find_new_packs(feed_url.clone(), &processed_list_path).context("finding new packs")?;
+    let mut processed_log = OpenOptions::new()
+        .append(true)
+        .create(true)
+        .open(&processed_list_path)
+        .context("can't open processed list for append")?;
+
+    for pack_name in new_packs {
+        let pack_url = feed_url
+            .join(&pack_name)
+            .context("Can't resolve URL of new pack")?;
+
+        blue!("Downloading: ");
+        yellow_ln!("{:?}", pack_url);
+        let mut temp_file = NamedTempFile::new().context("opening temp file")?;
+
+        reqwest::blocking::get(pack_url.clone())
+            .context("failed to request pack")?
+            .error_for_status()?
+            .copy_to(temp_file.as_file_mut())
+            .context("failed to download pack to temp file")?;
+
+        handle_pack(temp_file.path(), indexer_backend, seed_lookup, icon_store).with_context(
+            || {
+                format!(
+                    "Whilst handling pack: {:?} ({:?})",
+                    temp_file.path(),
+                    pack_url
+                )
+            },
+        )?;
+
+        processed_log.write(format!("\n{}", &pack_name).as_bytes())?;
+        processed_log.flush()?;
+    }
+    Ok(())
+}
+
+fn find_new_packs(feed_url: Url, processed_list_path: &Path) -> anyhow::Result<BTreeSet<String>> {
+    let processed_file = OpenOptions::new()
+        .read(true)
+        .create(true)
+        .open(processed_list_path)?;
+    let br = BufReader::new(processed_file);
+    let processed: Result<BTreeSet<String>, _> = br.lines().collect();
+    let processed = processed.context("failed to read local processed list")?;
+
+    let mut unprocessed: BTreeSet<String> = BTreeSet::new();
+
+    let feed_lines = BufReader::new(reqwest::blocking::get(feed_url)?.error_for_status()?).lines();
+    for line in feed_lines {
+        let line = line?;
+        if line.is_empty() {
+            continue;
+        }
+        if processed.contains(&line) {
+            continue;
+        }
+        unprocessed.insert(line.to_owned());
+    }
+
+    Ok(unprocessed)
+}
+
+pub fn handle_pack(
+    pack: &Path,
+    indexer_backend: &mut Box<dyn Backend>,
+    seed_lookup: &SeedLookupTable,
+    icon_store: &IconStore,
+) -> anyhow::Result<()> {
+    blue!("Indexing: ");
+    yellow_ln!("{:?}", pack);
+
+    let file = File::open(&pack)?;
+    let decompressor = zstd::stream::Decoder::new(file)?;
+    // TODO the decompressor has a buffer already, but we need this to see the end
+    let mut buf_reader = BufReader::new(decompressor);
+    let schema: String =
+        serde_bare::from_reader(&mut buf_reader).context("failed to read schema ver")?;
+
+    match schema.as_ref() {
+        SCHEMA_RAKED_PAGES => {
+            // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
+            while buf_reader.fill_buf().map(|b| !b.is_empty())? {
+                handle_page_pack(&mut buf_reader, &seed_lookup, indexer_backend)
+                    .context("failed to handle page pack")?;
+            }
+        }
+        SCHEMA_RAKED_ICONS => {
+            // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
+            while buf_reader.fill_buf().map(|b| !b.is_empty())? {
+                handle_icon_pack(&mut buf_reader, &icon_store)
+                    .context("failed to handle icon pack")?;
+            }
+        }
+        _ => {
+            bail!(
+                "Wrong schema version: wanted e.g. {:?}, got {:?}",
+                SCHEMA_RAKED_PAGES,
+                &schema
+            );
+        }
+    }
+    Ok(())
+}
+
 pub fn handle_page_pack(
     buf_reader: &mut impl BufRead,
     seed_lookup: &SeedLookupTable,
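Worth noting from find_new_packs above: the feed is just a newline-separated list of pack names, each resolved against the feed URL with `Url::join`. A minimal standalone sketch of that resolution (the feed URL and pack names are hypothetical):

use url::Url;

fn main() -> Result<(), url::ParseError> {
    let feed_url = Url::parse("https://example.org/rakepacks/feed.txt")?;
    // The feed body is one pack name per line; blank lines are skipped.
    let feed_body = "pack-0001.rakepack\n\npack-0002.rakepack\n";
    for name in feed_body.lines().filter(|l| !l.is_empty()) {
        // Relative resolution replaces the last path segment of the feed URL,
        // yielding e.g. https://example.org/rakepacks/pack-0001.rakepack
        let pack_url = feed_url.join(name)?;
        println!("{}", pack_url);
    }
    Ok(())
}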
@@ -55,4 +55,17 @@ impl IndexerConfig {
             }
         }
     }
+
+    /// Returns the path to a text file which can be used for storing a list of processed rakepacks
+    /// (needed for following rakepack streams over a network).
+    pub fn processed_rakepack_path(&self) -> anyhow::Result<PathBuf> {
+        match &self.index.backend {
+            BackendConfig::Tantivy(tantivy) => {
+                Ok(tantivy.index_dir.join("processed_rakepacks.lst"))
+            }
+            BackendConfig::Meili(_) => {
+                todo!()
+            }
+        }
+    }
 }
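One detail of the tracking scheme that handle_pack_feed and find_new_packs imply: names are appended to this list as "\n{name}", so the file begins with a blank line, which is harmless because the feed-side loop skips empty lines before consulting the set. A small sketch of the read-back (file contents hypothetical):

use std::collections::BTreeSet;
use std::io::{BufRead, BufReader, Cursor};

fn main() -> std::io::Result<()> {
    // Appending "\n{name}" each time yields contents like this:
    let log_contents = "\npack-0001.rakepack\npack-0002.rakepack";
    // Read line-by-line into a BTreeSet, as find_new_packs does:
    let processed: BTreeSet<String> = BufReader::new(Cursor::new(log_contents))
        .lines()
        .collect::<Result<_, _>>()?;
    // The leading newline contributes an empty entry, but it is never
    // looked up, since empty feed lines are skipped first.
    assert!(processed.contains("pack-0001.rakepack"));
    Ok(())
}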