Load tags and index them as necessary

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 21:03:55 +01:00
parent 0a955bb2f5
commit 945a1504ca
2 changed files with 31 additions and 10 deletions

View File

@ -29,11 +29,6 @@ pub struct Opts {
rakepacks: Vec<PathBuf>, rakepacks: Vec<PathBuf>,
} }
/// Lookup table used to find the `Seed` that applies to a URL.
///
/// `SeedLookupTable::look_up` consults `by_prefix` first and only falls back
/// to `by_reduced_domain` when no prefix entry matches.
pub struct SeedLookupTable {
/// Seeds stored in a Patricia map; `look_up` queries it with the full URL
/// string through `get_longest_common_prefix`.
pub by_prefix: PatriciaMap<Seed>,
/// Seeds stored by domain; `look_up` queries it with the value that
/// `get_reduced_domain` returns for the URL.
pub by_reduced_domain: HashMap<CompactString, Seed>,
}
#[tokio::main] #[tokio::main]
pub async fn main() -> anyhow::Result<()> { pub async fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init(); env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init();
@ -84,13 +79,18 @@ pub async fn main() -> anyhow::Result<()> {
let article_body = DenseTree::generate_textual_format(&document.body_content); let article_body = DenseTree::generate_textual_format(&document.body_content);
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder); let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
let tags = seed_lookup
.look_up(&Url::parse(page_record.url.as_ref())?)?
.map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
.unwrap_or_else(|| Vec::with_capacity(0));
// TODO Store the actual structure of the document in the store? // TODO Store the actual structure of the document in the store?
indexer_backend.add_document(BackendIndependentDocument { indexer_backend.add_document(BackendIndependentDocument {
title: document.head.title, title: document.head.title,
article_body, article_body,
nonarticle_body, nonarticle_body,
// TODO populate tags & antifeatures // TODO populate tags & antifeatures
tags: vec![], tags,
url: page_record.url.to_string(), url: page_record.url.to_string(),
})?; })?;
} }
@ -100,6 +100,11 @@ pub async fn main() -> anyhow::Result<()> {
Ok(()) Ok(())
} }
/// Lookup table used to find the `Seed` that applies to a URL.
///
/// `SeedLookupTable::look_up` consults `by_prefix` first and only falls back
/// to `by_reduced_domain` when no prefix entry matches.
pub struct SeedLookupTable {
/// Seeds stored in a Patricia map; `look_up` queries it with the full URL
/// string through `get_longest_common_prefix`.
pub by_prefix: PatriciaMap<Seed>,
/// Seeds stored by domain; `look_up` queries it with the value that
/// `get_reduced_domain` returns for the URL.
pub by_reduced_domain: HashMap<CompactString, Seed>,
}
pub async fn build_seed_lookup_table( pub async fn build_seed_lookup_table(
mut seed_rx: Receiver<Seed>, mut seed_rx: Receiver<Seed>,
) -> anyhow::Result<SeedLookupTable> { ) -> anyhow::Result<SeedLookupTable> {
@ -125,3 +130,18 @@ pub async fn build_seed_lookup_table(
Ok(seed_lookup) Ok(seed_lookup)
} }
impl SeedLookupTable {
    /// Finds the seed that applies to `url`, if there is one.
    ///
    /// The prefix map is tried first, matching against the URL's string form.
    /// When that yields nothing, the URL's reduced domain is computed and
    /// looked up in the domain map instead.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `get_reduced_domain`. That function is
    /// only called when the prefix map has no match.
    pub fn look_up(&self, url: &Url) -> anyhow::Result<Option<&Seed>> {
        // Only the seed matters to callers; the matched prefix bytes are dropped.
        let prefix_hit = self
            .by_prefix
            .get_longest_common_prefix(&url.as_str())
            .map(|(_matched, seed)| seed);

        match prefix_hit {
            Some(seed) => Ok(Some(seed)),
            None => {
                let reduced = get_reduced_domain(url)?;
                let domain_hit = self.by_reduced_domain.get(reduced.as_ref());
                Ok(domain_hit)
            }
        }
    }
}

View File

@ -1,6 +1,8 @@
use crate::parse_seeds; use crate::parse_seeds;
use anyhow::{anyhow, bail}; use anyhow::{anyhow, bail};
use log::warn; use log::warn;
use smartstring::alias::CompactString;
use std::collections::BTreeSet;
use std::ffi::OsStr; use std::ffi::OsStr;
use std::path::PathBuf; use std::path::PathBuf;
use tokio::sync::mpsc::Sender; use tokio::sync::mpsc::Sender;
@ -10,7 +12,7 @@ pub const WEED_EXTENSION: &'static str = ".weed";
pub struct Seed { pub struct Seed {
pub url: UrlOrUrlPattern, pub url: UrlOrUrlPattern,
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>, pub tags: BTreeSet<CompactString>,
} }
/// Either a URL or a URL prefix. /// Either a URL or a URL prefix.
@ -38,17 +40,16 @@ pub async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyho
Ok(seedblocks) => { Ok(seedblocks) => {
for seedblock in seedblocks { for seedblock in seedblocks {
for seed in seedblock.seeds { for seed in seedblock.seeds {
/*
let tags: BTreeSet<CompactString> = seedblock let tags: BTreeSet<CompactString> = seedblock
.tags .tags
.iter() .iter()
.chain(seed.extra_tags.iter()) .chain(seed.extra_tags.iter())
.cloned() .cloned()
.collect(); .collect();
*/
send.send(Seed { send.send(Seed {
url: seed_url_parse_pattern(seed.url), url: seed_url_parse_pattern(seed.url),
// tags, tags,
}) })
.await .await
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?; .map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;