Load tags and index them as necessary

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 21:03:55 +01:00
parent 0a955bb2f5
commit 945a1504ca
2 changed files with 31 additions and 10 deletions

View File

@ -29,11 +29,6 @@ pub struct Opts {
rakepacks: Vec<PathBuf>,
}
/// Seeds indexed two ways so that a URL can be resolved to a seed
/// (see `SeedLookupTable::look_up`).
pub struct SeedLookupTable {
/// Seeds held in a Patricia map. `look_up` queries this first, calling
/// `get_longest_common_prefix` with the URL's string form.
/// NOTE(review): keys are presumably seed URL prefixes — confirm against
/// `build_seed_lookup_table`.
pub by_prefix: PatriciaMap<Seed>,
/// Seeds keyed by a string. `look_up` falls back to this map, using the value
/// returned by `get_reduced_domain(url)` as the key, when `by_prefix` has no match.
pub by_reduced_domain: HashMap<CompactString, Seed>,
}
#[tokio::main]
pub async fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init();
@ -84,13 +79,18 @@ pub async fn main() -> anyhow::Result<()> {
let article_body = DenseTree::generate_textual_format(&document.body_content);
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
let tags = seed_lookup
.look_up(&Url::parse(page_record.url.as_ref())?)?
.map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
.unwrap_or_else(|| Vec::with_capacity(0));
// TODO Store the actual structure of the document in the store?
indexer_backend.add_document(BackendIndependentDocument {
title: document.head.title,
article_body,
nonarticle_body,
// TODO populate tags & antifeatures
tags: vec![],
tags,
url: page_record.url.to_string(),
})?;
}
@ -100,6 +100,11 @@ pub async fn main() -> anyhow::Result<()> {
Ok(())
}
/// Seeds indexed two ways so that a URL can be resolved to a seed
/// (see `SeedLookupTable::look_up`).
pub struct SeedLookupTable {
/// Seeds held in a Patricia map. `look_up` queries this first, calling
/// `get_longest_common_prefix` with the URL's string form.
/// NOTE(review): keys are presumably seed URL prefixes — confirm against
/// `build_seed_lookup_table`.
pub by_prefix: PatriciaMap<Seed>,
/// Seeds keyed by a string. `look_up` falls back to this map, using the value
/// returned by `get_reduced_domain(url)` as the key, when `by_prefix` has no match.
pub by_reduced_domain: HashMap<CompactString, Seed>,
}
pub async fn build_seed_lookup_table(
mut seed_rx: Receiver<Seed>,
) -> anyhow::Result<SeedLookupTable> {
@ -125,3 +130,18 @@ pub async fn build_seed_lookup_table(
Ok(seed_lookup)
}
impl SeedLookupTable {
    /// Resolves `url` to a seed in this table, if one applies.
    ///
    /// The prefix map is tried first via `get_longest_common_prefix` on the URL's
    /// string form. Only when that yields nothing is the URL's reduced domain
    /// computed and looked up in `by_reduced_domain`.
    ///
    /// Returns `Ok(None)` when neither map has an entry.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `get_reduced_domain`. That function is
    /// only called when the prefix lookup misses.
    pub fn look_up(&self, url: &Url) -> anyhow::Result<Option<&Seed>> {
        // The matched prefix itself is not needed, only the seed stored under it.
        let prefix_hit = self
            .by_prefix
            .get_longest_common_prefix(&url.as_str())
            .map(|(_matched_prefix, seed)| seed);
        if prefix_hit.is_some() {
            return Ok(prefix_hit);
        }

        // Fallback: match on the reduced domain of the URL.
        let reduced_domain = get_reduced_domain(url)?;
        let domain_hit = self.by_reduced_domain.get(reduced_domain.as_ref());
        Ok(domain_hit)
    }
}

View File

@ -1,6 +1,8 @@
use crate::parse_seeds;
use anyhow::{anyhow, bail};
use log::warn;
use smartstring::alias::CompactString;
use std::collections::BTreeSet;
use std::ffi::OsStr;
use std::path::PathBuf;
use tokio::sync::mpsc::Sender;
@ -10,7 +12,7 @@ pub const WEED_EXTENSION: &'static str = ".weed";
/// A single seed as sent over the channel by `seed_loader`: a URL (or URL
/// pattern) together with its tags.
pub struct Seed {
/// The seed's URL or URL pattern, as produced by `seed_url_parse_pattern`.
pub url: UrlOrUrlPattern,
/// Tags attached to this seed. `seed_loader` builds the set from the enclosing
/// seed block's tags chained with the seed's own `extra_tags`.
pub tags: BTreeSet<CompactString>,
}
/// Either a URL or a URL prefix.
@ -38,17 +40,16 @@ pub async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyho
Ok(seedblocks) => {
for seedblock in seedblocks {
for seed in seedblock.seeds {
/*
let tags: BTreeSet<CompactString> = seedblock
.tags
.iter()
.chain(seed.extra_tags.iter())
.cloned()
.collect();
*/
send.send(Seed {
url: seed_url_parse_pattern(seed.url),
// tags,
tags,
})
.await
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;