Load tags and index them as necessary
This commit is contained in:
parent
0a955bb2f5
commit
945a1504ca
@ -29,11 +29,6 @@ pub struct Opts {
|
|||||||
rakepacks: Vec<PathBuf>,
|
rakepacks: Vec<PathBuf>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct SeedLookupTable {
|
|
||||||
pub by_prefix: PatriciaMap<Seed>,
|
|
||||||
pub by_reduced_domain: HashMap<CompactString, Seed>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
pub async fn main() -> anyhow::Result<()> {
|
pub async fn main() -> anyhow::Result<()> {
|
||||||
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init();
|
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init();
|
||||||
@ -84,13 +79,18 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
let article_body = DenseTree::generate_textual_format(&document.body_content);
|
let article_body = DenseTree::generate_textual_format(&document.body_content);
|
||||||
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
|
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder);
|
||||||
|
|
||||||
|
let tags = seed_lookup
|
||||||
|
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
||||||
|
.map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
|
||||||
|
.unwrap_or_else(|| Vec::with_capacity(0));
|
||||||
|
|
||||||
// TODO Store the actual structure of the document in the store?
|
// TODO Store the actual structure of the document in the store?
|
||||||
indexer_backend.add_document(BackendIndependentDocument {
|
indexer_backend.add_document(BackendIndependentDocument {
|
||||||
title: document.head.title,
|
title: document.head.title,
|
||||||
article_body,
|
article_body,
|
||||||
nonarticle_body,
|
nonarticle_body,
|
||||||
// TODO populate tags & antifeatures
|
// TODO populate tags & antifeatures
|
||||||
tags: vec![],
|
tags,
|
||||||
url: page_record.url.to_string(),
|
url: page_record.url.to_string(),
|
||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
@ -100,6 +100,11 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct SeedLookupTable {
|
||||||
|
pub by_prefix: PatriciaMap<Seed>,
|
||||||
|
pub by_reduced_domain: HashMap<CompactString, Seed>,
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn build_seed_lookup_table(
|
pub async fn build_seed_lookup_table(
|
||||||
mut seed_rx: Receiver<Seed>,
|
mut seed_rx: Receiver<Seed>,
|
||||||
) -> anyhow::Result<SeedLookupTable> {
|
) -> anyhow::Result<SeedLookupTable> {
|
||||||
@ -125,3 +130,18 @@ pub async fn build_seed_lookup_table(
|
|||||||
|
|
||||||
Ok(seed_lookup)
|
Ok(seed_lookup)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl SeedLookupTable {
|
||||||
|
pub fn look_up(&self, url: &Url) -> anyhow::Result<Option<&Seed>> {
|
||||||
|
if let Some((_prefix, seed)) = self.by_prefix.get_longest_common_prefix(&url.as_str()) {
|
||||||
|
return Ok(Some(seed));
|
||||||
|
}
|
||||||
|
|
||||||
|
let domain = get_reduced_domain(url)?;
|
||||||
|
if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
|
||||||
|
return Ok(Some(seed));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
use crate::parse_seeds;
|
use crate::parse_seeds;
|
||||||
use anyhow::{anyhow, bail};
|
use anyhow::{anyhow, bail};
|
||||||
use log::warn;
|
use log::warn;
|
||||||
|
use smartstring::alias::CompactString;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
use std::ffi::OsStr;
|
use std::ffi::OsStr;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
@ -10,7 +12,7 @@ pub const WEED_EXTENSION: &'static str = ".weed";
|
|||||||
|
|
||||||
pub struct Seed {
|
pub struct Seed {
|
||||||
pub url: UrlOrUrlPattern,
|
pub url: UrlOrUrlPattern,
|
||||||
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
pub tags: BTreeSet<CompactString>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Either a URL or a URL prefix.
|
/// Either a URL or a URL prefix.
|
||||||
@ -38,17 +40,16 @@ pub async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyho
|
|||||||
Ok(seedblocks) => {
|
Ok(seedblocks) => {
|
||||||
for seedblock in seedblocks {
|
for seedblock in seedblocks {
|
||||||
for seed in seedblock.seeds {
|
for seed in seedblock.seeds {
|
||||||
/*
|
|
||||||
let tags: BTreeSet<CompactString> = seedblock
|
let tags: BTreeSet<CompactString> = seedblock
|
||||||
.tags
|
.tags
|
||||||
.iter()
|
.iter()
|
||||||
.chain(seed.extra_tags.iter())
|
.chain(seed.extra_tags.iter())
|
||||||
.cloned()
|
.cloned()
|
||||||
.collect();
|
.collect();
|
||||||
*/
|
|
||||||
send.send(Seed {
|
send.send(Seed {
|
||||||
url: seed_url_parse_pattern(seed.url),
|
url: seed_url_parse_pattern(seed.url),
|
||||||
// tags,
|
tags,
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
||||||
|
Loading…
Reference in New Issue
Block a user