diff --git a/quickpeep_indexer/src/bin/qp-indexer.rs b/quickpeep_indexer/src/bin/qp-indexer.rs index 73035e6..80ca084 100644 --- a/quickpeep_indexer/src/bin/qp-indexer.rs +++ b/quickpeep_indexer/src/bin/qp-indexer.rs @@ -29,11 +29,6 @@ pub struct Opts { rakepacks: Vec, } -pub struct SeedLookupTable { - pub by_prefix: PatriciaMap, - pub by_reduced_domain: HashMap, -} - #[tokio::main] pub async fn main() -> anyhow::Result<()> { env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init(); @@ -84,13 +79,18 @@ pub async fn main() -> anyhow::Result<()> { let article_body = DenseTree::generate_textual_format(&document.body_content); let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder); + let tags = seed_lookup + .look_up(&Url::parse(page_record.url.as_ref())?)? + .map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect()) + .unwrap_or_else(|| Vec::with_capacity(0)); + // TODO Store the actual structure of the document in the store? indexer_backend.add_document(BackendIndependentDocument { title: document.head.title, article_body, nonarticle_body, // TODO populate tags & antifeatures - tags: vec![], + tags, url: page_record.url.to_string(), })?; } @@ -100,6 +100,11 @@ pub async fn main() -> anyhow::Result<()> { Ok(()) } +pub struct SeedLookupTable { + pub by_prefix: PatriciaMap, + pub by_reduced_domain: HashMap, +} + pub async fn build_seed_lookup_table( mut seed_rx: Receiver, ) -> anyhow::Result { @@ -125,3 +130,18 @@ pub async fn build_seed_lookup_table( Ok(seed_lookup) } + +impl SeedLookupTable { + pub fn look_up(&self, url: &Url) -> anyhow::Result> { + if let Some((_prefix, seed)) = self.by_prefix.get_longest_common_prefix(&url.as_str()) { + return Ok(Some(seed)); + } + + let domain = get_reduced_domain(url)?; + if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) { + return Ok(Some(seed)); + } + + Ok(None) + } +} diff --git a/quickpeep_seed_parser/src/loader.rs b/quickpeep_seed_parser/src/loader.rs index a5dcb9e..e4e4c84 100644 --- a/quickpeep_seed_parser/src/loader.rs +++ b/quickpeep_seed_parser/src/loader.rs @@ -1,6 +1,8 @@ use crate::parse_seeds; use anyhow::{anyhow, bail}; use log::warn; +use smartstring::alias::CompactString; +use std::collections::BTreeSet; use std::ffi::OsStr; use std::path::PathBuf; use tokio::sync::mpsc::Sender; @@ -10,7 +12,7 @@ pub const WEED_EXTENSION: &'static str = ".weed"; pub struct Seed { pub url: UrlOrUrlPattern, - // TODO(later) These make more sense at the indexer stage. tags: BTreeSet, + pub tags: BTreeSet, } /// Either a URL or a URL prefix. @@ -38,17 +40,16 @@ pub async fn seed_loader(seed_files: Vec, send: &Sender) -> anyho Ok(seedblocks) => { for seedblock in seedblocks { for seed in seedblock.seeds { - /* let tags: BTreeSet = seedblock .tags .iter() .chain(seed.extra_tags.iter()) .cloned() .collect(); - */ + send.send(Seed { url: seed_url_parse_pattern(seed.url), - // tags, + tags, }) .await .map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;