Clarify and handle 'No domain for URL' error in a better way

2023-03-21 23:36:47 +00:00 · 2023-03-21 23:36:47 +00:00 · 6d37a07d3e
commit 6d37a07d3e
parent 73c72bce25
6 changed files with 41 additions and 15 deletions
--- a/quickpeep_indexer/src/bin/qp-indexer.rs
+++ b/quickpeep_indexer/src/bin/qp-indexer.rs
@ -318,7 +318,8 @@ pub async fn build_seed_lookup_table(
        match &seed.url {
            UrlOrUrlPattern::Url(url_str) => {
                let url = Url::parse(url_str)?;
-                let reduced_domain = get_reduced_domain(&url)?;
+                let reduced_domain = get_reduced_domain(&url)
+                    .with_context(|| format!("No domain in seed: '{url}'!"))?;
                seed_lookup
                    .by_reduced_domain
                    .insert(reduced_domain.into(), seed);
@ -338,7 +339,8 @@ impl SeedLookupTable {
            return Ok(Some(seed));
        }

-        let domain = get_reduced_domain(url)?;
+        let domain = get_reduced_domain(url)
+            .with_context(|| format!("No domain in looked up URL: '{url}'"))?;
        if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
            return Ok(Some(seed));
        }
--- a/quickpeep_raker/src/bin/qp-seedrake.rs
+++ b/quickpeep_raker/src/bin/qp-seedrake.rs
@ -169,7 +169,8 @@ async fn import_and_flush_batch_seeds(
    for seed in buf.drain(..) {
        let as_url = Url::parse(seed.url.as_str())
            .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
-        let domain = get_reduced_domain(&as_url)?;
+        let domain = get_reduced_domain(&as_url)
+            .with_context(|| format!("No domain in seed URL '{as_url}'!"))?;

        let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;

@ -244,7 +245,8 @@ async fn import_and_flush_batch_weeds(
    for seed in buf.drain(..) {
        let as_url = Url::parse(seed.url.as_str())
            .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
-        let domain = get_reduced_domain(&as_url)?;
+        let domain = get_reduced_domain(&as_url)
+            .with_context(|| format!("No domain in weed URL '{as_url}'!"))?;

        let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;

--- a/quickpeep_raker/src/raking/references.rs
+++ b/quickpeep_raker/src/raking/references.rs
@ -45,6 +45,12 @@ pub fn find_references(
                } => {
                    if !nofollow {
                        if let Ok(full_url) = page_url.join(&href) {
+                            if full_url.domain().is_none() {
+                                // Skip URLs that don't have a domain after being made absolute.
+                                // This also skips IP addresses: we probably don't want to bother
+                                // indexing content from explicit IP addresses.
+                                continue;
+                            }
                            if SUPPORTED_SCHEMES.contains(&full_url.scheme()) {
                                refs.insert(RakedReference {
                                    target: clean_url(&full_url).to_string(),
@ -72,6 +78,10 @@ pub fn find_references(
    add_link_refs(&doc, &mut refs, &page_url);

    for feed in feeds {
+        if feed.domain().is_none() {
+            // Skip feeds whose URL has no domain (this includes IP addresses), as for links above.
+            continue;
+        }
        refs.insert(RakedReference {
            target: clean_url(feed).as_str().to_owned(),
            kind: ReferenceKind::HeaderLinkedFeed,
--- a/quickpeep_raker/src/raking/task.rs
+++ b/quickpeep_raker/src/raking/task.rs
@ -454,7 +454,9 @@ impl TaskContext {
                // TODO(future) do we want to log this somewhere?
                // or at least a metric

-                let domain = get_reduced_domain(url)?;
+                let domain = get_reduced_domain(url).with_context(|| {
+                    format!("No domain in URL '{url}' for which we are processing the outcome!")
+                })?;
                let url = url.clone();

                // TODO(feature) add 1.1× the previous backoff, if there was one.
@ -518,7 +520,9 @@ impl EventProcessor<'_> {
        self.store
            .as_ref()
            .async_rw_txn(move |txn| {
-                let domain = get_reduced_domain(&url)?;
+                let domain = get_reduced_domain(&url).with_context(|| {
+                    format!("No domain for URL '{url}' for which we are processing the page!")
+                })?;
                txn.mark_url_as_visited(
                    domain.as_ref(),
                    url.as_ref(),
@ -546,7 +550,9 @@ impl EventProcessor<'_> {
        self.store
            .as_ref()
            .async_rw_txn(move |txn| {
-                let domain = get_reduced_domain(&url)?;
+                let domain = get_reduced_domain(&url).with_context(|| {
+                    format!("No domain for URL '{url}' for which we are processing an icon!")
+                })?;
                txn.mark_url_as_visited(
                    domain.as_ref(),
                    url.as_ref(),
@ -577,7 +583,9 @@ impl EventProcessor<'_> {
        self.store
            .as_ref()
            .async_rw_txn(move |txn| {
-                let domain = get_reduced_domain(&url)?;
+                let domain = get_reduced_domain(&url).with_context(|| {
+                    format!("No domain for URL '{url}' for which we are processing refs!")
+                })?;
                txn.mark_url_as_visited(
                    domain.as_ref(),
                    url.as_ref(),
@ -638,7 +646,9 @@ impl EventProcessor<'_> {
        self.store
            .as_ref()
            .async_rw_txn(move |txn| {
-                let domain = get_reduced_domain(&url)?;
+                let domain = get_reduced_domain(&url).with_context(|| {
+                    format!("No domain for URL '{url}' for which we are processing a rejection!")
+                })?;
                txn.mark_url_as_visited(
                    domain.as_ref(),
                    url.as_ref(),
--- a/quickpeep_raker/src/storage.rs
+++ b/quickpeep_raker/src/storage.rs
@ -516,7 +516,8 @@ impl<'a> RakerTxn<'a, RW> {
        let visited_urls = &self.mdbx.borrow_dbs().visited_urls;

        let url = Url::parse(url_str)?;
-        let url_domain = get_reduced_domain(&url)?;
+        let url_domain = get_reduced_domain(&url)
+            .with_context(|| format!("No domain for to-be-enqueued URL: '{url}'!"))?;

        let queue_key = format!("{}\n{}", url_domain, url);

@ -568,7 +569,8 @@ impl<'a> RakerTxn<'a, RW> {
        let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold;

        let url = Url::parse(url_str)?;
-        let url_domain = get_reduced_domain(&url)?;
+        let url_domain = get_reduced_domain(&url)
+            .with_context(|| format!("No domain for to-be-put-on-hold URL: '{url}'!"))?;

        let queue_key = format!("{}\n{}", url_domain, url);

--- a/quickpeep_utils/src/urls.rs
+++ b/quickpeep_utils/src/urls.rs
@ -1,11 +1,11 @@
-use anyhow::Context;
 use std::borrow::Cow;
 use url::Url;

-pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
-    let domain = url.domain().context("URLs must have domains")?;
+pub fn get_reduced_domain(url: &Url) -> Option<Cow<'_, str>> {
+    // If the URL has no host, or its host is not a domain (e.g. an IP address), return None here.
+    let domain = url.domain()?;

-    Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
+    Some(Cow::Borrowed(match domain.strip_prefix("www.") {
        Some(stripped) => stripped,
        None => domain,
    }))