diff --git a/quickpeep_indexer/src/bin/qp-indexer.rs b/quickpeep_indexer/src/bin/qp-indexer.rs index ba79aa7..cfdd78f 100644 --- a/quickpeep_indexer/src/bin/qp-indexer.rs +++ b/quickpeep_indexer/src/bin/qp-indexer.rs @@ -318,7 +318,8 @@ pub async fn build_seed_lookup_table( match &seed.url { UrlOrUrlPattern::Url(url_str) => { let url = Url::parse(url_str)?; - let reduced_domain = get_reduced_domain(&url)?; + let reduced_domain = get_reduced_domain(&url) + .with_context(|| format!("No domain in seed: '{url}'!"))?; seed_lookup .by_reduced_domain .insert(reduced_domain.into(), seed); @@ -338,7 +339,8 @@ impl SeedLookupTable { return Ok(Some(seed)); } - let domain = get_reduced_domain(url)?; + let domain = get_reduced_domain(url) + .with_context(|| format!("No domain in looked up URL: '{url}'"))?; if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) { return Ok(Some(seed)); } diff --git a/quickpeep_raker/src/bin/qp-seedrake.rs b/quickpeep_raker/src/bin/qp-seedrake.rs index ecfcff8..8cff450 100644 --- a/quickpeep_raker/src/bin/qp-seedrake.rs +++ b/quickpeep_raker/src/bin/qp-seedrake.rs @@ -169,7 +169,8 @@ async fn import_and_flush_batch_seeds( for seed in buf.drain(..) { let as_url = Url::parse(seed.url.as_str()) .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?; - let domain = get_reduced_domain(&as_url)?; + let domain = get_reduced_domain(&as_url) + .with_context(|| format!("No domain in seed URL '{as_url}'!"))?; let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?; @@ -244,7 +245,8 @@ async fn import_and_flush_batch_weeds( for seed in buf.drain(..) 
{ let as_url = Url::parse(seed.url.as_str()) .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?; - let domain = get_reduced_domain(&as_url)?; + let domain = get_reduced_domain(&as_url) + .with_context(|| format!("No domain in weed URL '{as_url}'!"))?; let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?; diff --git a/quickpeep_raker/src/raking/references.rs b/quickpeep_raker/src/raking/references.rs index 678d938..d787c05 100644 --- a/quickpeep_raker/src/raking/references.rs +++ b/quickpeep_raker/src/raking/references.rs @@ -45,6 +45,12 @@ pub fn find_references( } => { if !nofollow { if let Ok(full_url) = page_url.join(&href) { + if full_url.domain().is_none() { + // Skip URLs that don't have a domain after being made absolute. + // This also skips IP addresses: we probably don't want to bother + // indexing content from explicit IP addresses. + continue; + } if SUPPORTED_SCHEMES.contains(&full_url.scheme()) { refs.insert(RakedReference { target: clean_url(&full_url).to_string(), @@ -72,6 +78,10 @@ pub fn find_references( add_link_refs(&doc, &mut refs, &page_url); for feed in feeds { + if feed.domain().is_none() { + // same rationale as above. + continue; + } refs.insert(RakedReference { target: clean_url(feed).as_str().to_owned(), kind: ReferenceKind::HeaderLinkedFeed, diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index accb206..3a327eb 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -454,7 +454,9 @@ impl TaskContext { // TODO(future) do we want to log this somewhere? // or at least a metric - let domain = get_reduced_domain(url)?; + let domain = get_reduced_domain(url).with_context(|| { + format!("No domain in URL '{url}' for which we are processing the outcome!") + })?; let url = url.clone(); // TODO(feature) add 1.1× the previous backoff, if there was one. 
@@ -518,7 +520,9 @@ impl EventProcessor<'_> { self.store .as_ref() .async_rw_txn(move |txn| { - let domain = get_reduced_domain(&url)?; + let domain = get_reduced_domain(&url).with_context(|| { + format!("No domain for URL '{url}' for which we are processing the page!") + })?; txn.mark_url_as_visited( domain.as_ref(), url.as_ref(), @@ -546,7 +550,9 @@ impl EventProcessor<'_> { self.store .as_ref() .async_rw_txn(move |txn| { - let domain = get_reduced_domain(&url)?; + let domain = get_reduced_domain(&url).with_context(|| { + format!("No domain for URL '{url}' for which we are processing an icon!") + })?; txn.mark_url_as_visited( domain.as_ref(), url.as_ref(), @@ -577,7 +583,9 @@ impl EventProcessor<'_> { self.store .as_ref() .async_rw_txn(move |txn| { - let domain = get_reduced_domain(&url)?; + let domain = get_reduced_domain(&url).with_context(|| { + format!("No domain for URL '{url}' for which we are processing refs!") + })?; txn.mark_url_as_visited( domain.as_ref(), url.as_ref(), @@ -638,7 +646,9 @@ impl EventProcessor<'_> { self.store .as_ref() .async_rw_txn(move |txn| { - let domain = get_reduced_domain(&url)?; + let domain = get_reduced_domain(&url).with_context(|| { + format!("No domain for URL '{url}' for which we are processing a rejection!") + })?; txn.mark_url_as_visited( domain.as_ref(), url.as_ref(), diff --git a/quickpeep_raker/src/storage.rs b/quickpeep_raker/src/storage.rs index 82df6cf..ddf057a 100644 --- a/quickpeep_raker/src/storage.rs +++ b/quickpeep_raker/src/storage.rs @@ -516,7 +516,8 @@ impl<'a> RakerTxn<'a, RW> { let visited_urls = &self.mdbx.borrow_dbs().visited_urls; let url = Url::parse(url_str)?; - let url_domain = get_reduced_domain(&url)?; + let url_domain = get_reduced_domain(&url) + .with_context(|| format!("No domain for to-be-enqueued URL: '{url}'!"))?; let queue_key = format!("{}\n{}", url_domain, url); @@ -568,7 +569,8 @@ impl<'a> RakerTxn<'a, RW> { let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold; let url = 
Url::parse(url_str)?; - let url_domain = get_reduced_domain(&url)?; + let url_domain = get_reduced_domain(&url) + .with_context(|| format!("No domain for to-be-put-on-hold URL: '{url}'!"))?; let queue_key = format!("{}\n{}", url_domain, url); diff --git a/quickpeep_utils/src/urls.rs b/quickpeep_utils/src/urls.rs index a8ef851..5a555d8 100644 --- a/quickpeep_utils/src/urls.rs +++ b/quickpeep_utils/src/urls.rs @@ -1,11 +1,11 @@ -use anyhow::Context; use std::borrow::Cow; use url::Url; -pub fn get_reduced_domain(url: &Url) -> anyhow::Result> { - let domain = url.domain().context("URLs must have domains")?; +pub fn get_reduced_domain(url: &Url) -> Option> { + // If the URL has no host, or its host is not a domain name (e.g. an IP address), return None here. + let domain = url.domain()?; - Ok(Cow::Borrowed(match domain.strip_prefix("www.") { + Some(Cow::Borrowed(match domain.strip_prefix("www.") { Some(stripped) => stripped, None => domain, }))