Load tags and index them as necessary
This commit is contained in:
parent
0a955bb2f5
commit
945a1504ca
|
@ -29,11 +29,6 @@ pub struct Opts {
|
|||
rakepacks: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
/// Lookup table used to find the seed that applies to a URL.
///
/// Holds two indexes over the same kind of value (`Seed`): one queried by
/// URL prefix and one queried by reduced domain.
pub struct SeedLookupTable {
|
||||
/// Seeds stored in a patricia map; queried with the full URL string.
pub by_prefix: PatriciaMap<Seed>,
|
||||
/// Seeds keyed by reduced domain.
pub by_reduced_domain: HashMap<CompactString, Seed>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init();
|
||||
|
@ -84,13 +79,18 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
let article_body = DenseTree::generate_textual_format(&document.body_content);
|
||||
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
|
||||
|
||||
let tags = seed_lookup
|
||||
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
||||
.map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
|
||||
.unwrap_or_else(|| Vec::with_capacity(0));
|
||||
|
||||
// TODO Store the actual structure of the document in the store?
|
||||
indexer_backend.add_document(BackendIndependentDocument {
|
||||
title: document.head.title,
|
||||
article_body,
|
||||
nonarticle_body,
|
||||
// TODO populate tags & antifeatures
|
||||
tags: vec![],
|
||||
tags,
|
||||
url: page_record.url.to_string(),
|
||||
})?;
|
||||
}
|
||||
|
@ -100,6 +100,11 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Lookup table used to find the seed that applies to a URL.
///
/// `look_up` consults `by_prefix` first and falls back to
/// `by_reduced_domain` when no prefix entry matches.
pub struct SeedLookupTable {
|
||||
/// Seeds stored in a patricia map; `look_up` queries it with the full URL
/// string using longest-common-prefix matching.
pub by_prefix: PatriciaMap<Seed>,
|
||||
/// Seeds keyed by reduced domain (the value produced by
/// `get_reduced_domain` for a URL).
pub by_reduced_domain: HashMap<CompactString, Seed>,
|
||||
}
|
||||
|
||||
pub async fn build_seed_lookup_table(
|
||||
mut seed_rx: Receiver<Seed>,
|
||||
) -> anyhow::Result<SeedLookupTable> {
|
||||
|
@ -125,3 +130,18 @@ pub async fn build_seed_lookup_table(
|
|||
|
||||
Ok(seed_lookup)
|
||||
}
|
||||
|
||||
impl SeedLookupTable {
|
||||
/// Finds the seed, if any, that applies to `url`.
///
/// The prefix map is tried first: if `by_prefix` has an entry sharing a
/// longest common prefix with the URL string, that seed is returned.
/// Otherwise the URL's reduced domain is computed and looked up in
/// `by_reduced_domain`.
///
/// Returns `Ok(None)` when neither index has a match.
///
/// # Errors
///
/// Propagates any error returned by `get_reduced_domain`. That call is
/// only reached when the prefix lookup found nothing.
pub fn look_up(&self, url: &Url) -> anyhow::Result<Option<&Seed>> {
|
||||
// A prefix match takes priority over a domain match; the matched prefix
// itself is not needed, only the seed stored under it.
if let Some((_prefix, seed)) = self.by_prefix.get_longest_common_prefix(&url.as_str()) {
|
||||
return Ok(Some(seed));
|
||||
}
|
||||
|
||||
// Fall back to a lookup by the URL's reduced domain.
let domain = get_reduced_domain(url)?;
|
||||
if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
|
||||
return Ok(Some(seed));
|
||||
}
|
||||
|
||||
// No seed covers this URL.
Ok(None)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
use crate::parse_seeds;
|
||||
use anyhow::{anyhow, bail};
|
||||
use log::warn;
|
||||
use smartstring::alias::CompactString;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ffi::OsStr;
|
||||
use std::path::PathBuf;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
@ -10,7 +12,7 @@ pub const WEED_EXTENSION: &'static str = ".weed";
|
|||
|
||||
/// A seed entry: a URL (or URL pattern) together with the tags attached to it.
pub struct Seed {
|
||||
/// The URL or URL pattern this seed refers to.
pub url: UrlOrUrlPattern,
|
||||
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
||||
/// Tags attached to this seed. The indexer converts these to strings and
/// stores them on each document whose URL resolves to this seed.
pub tags: BTreeSet<CompactString>,
|
||||
}
|
||||
|
||||
/// Either a URL or a URL prefix.
|
||||
|
@ -38,17 +40,16 @@ pub async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyho
|
|||
Ok(seedblocks) => {
|
||||
for seedblock in seedblocks {
|
||||
for seed in seedblock.seeds {
|
||||
/*
|
||||
let tags: BTreeSet<CompactString> = seedblock
|
||||
.tags
|
||||
.iter()
|
||||
.chain(seed.extra_tags.iter())
|
||||
.cloned()
|
||||
.collect();
|
||||
*/
|
||||
|
||||
send.send(Seed {
|
||||
url: seed_url_parse_pattern(seed.url),
|
||||
// tags,
|
||||
tags,
|
||||
})
|
||||
.await
|
||||
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
||||
|
|
Loading…
Reference in New Issue