Clarify and handle 'No domain for URL' error in a better way
This commit is contained in:
parent
73c72bce25
commit
6d37a07d3e
|
@ -318,7 +318,8 @@ pub async fn build_seed_lookup_table(
|
|||
match &seed.url {
|
||||
UrlOrUrlPattern::Url(url_str) => {
|
||||
let url = Url::parse(url_str)?;
|
||||
let reduced_domain = get_reduced_domain(&url)?;
|
||||
let reduced_domain = get_reduced_domain(&url)
|
||||
.with_context(|| format!("No domain in seed: '{url}'!"))?;
|
||||
seed_lookup
|
||||
.by_reduced_domain
|
||||
.insert(reduced_domain.into(), seed);
|
||||
|
@ -338,7 +339,8 @@ impl SeedLookupTable {
|
|||
return Ok(Some(seed));
|
||||
}
|
||||
|
||||
let domain = get_reduced_domain(url)?;
|
||||
let domain = get_reduced_domain(url)
|
||||
.with_context(|| format!("No domain in looked up URL: '{url}'"))?;
|
||||
if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
|
||||
return Ok(Some(seed));
|
||||
}
|
||||
|
|
|
@ -169,7 +169,8 @@ async fn import_and_flush_batch_seeds(
|
|||
for seed in buf.drain(..) {
|
||||
let as_url = Url::parse(seed.url.as_str())
|
||||
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
||||
let domain = get_reduced_domain(&as_url)?;
|
||||
let domain = get_reduced_domain(&as_url)
|
||||
.with_context(|| format!("No domain in seed URL '{as_url}'!"))?;
|
||||
|
||||
let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;
|
||||
|
||||
|
@ -244,7 +245,8 @@ async fn import_and_flush_batch_weeds(
|
|||
for seed in buf.drain(..) {
|
||||
let as_url = Url::parse(seed.url.as_str())
|
||||
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
||||
let domain = get_reduced_domain(&as_url)?;
|
||||
let domain = get_reduced_domain(&as_url)
|
||||
.with_context(|| format!("No domain in weed URL '{as_url}'!"))?;
|
||||
|
||||
let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;
|
||||
|
||||
|
|
|
@ -45,6 +45,12 @@ pub fn find_references(
|
|||
} => {
|
||||
if !nofollow {
|
||||
if let Ok(full_url) = page_url.join(&href) {
|
||||
if full_url.domain().is_none() {
|
||||
// Skip URLs that don't have a domain after being made absolute.
|
||||
// This also skips IP addresses: we probably don't want to bother
|
||||
// indexing content from explicit IP addresses.
|
||||
continue;
|
||||
}
|
||||
if SUPPORTED_SCHEMES.contains(&full_url.scheme()) {
|
||||
refs.insert(RakedReference {
|
||||
target: clean_url(&full_url).to_string(),
|
||||
|
@ -72,6 +78,10 @@ pub fn find_references(
|
|||
add_link_refs(&doc, &mut refs, &page_url);
|
||||
|
||||
for feed in feeds {
|
||||
if feed.domain().is_none() {
|
||||
// Same rationale as for page links: skip feed URLs that have no domain (this includes IP-address hosts).
|
||||
continue;
|
||||
}
|
||||
refs.insert(RakedReference {
|
||||
target: clean_url(feed).as_str().to_owned(),
|
||||
kind: ReferenceKind::HeaderLinkedFeed,
|
||||
|
|
|
@ -454,7 +454,9 @@ impl TaskContext {
|
|||
// TODO(future) do we want to log this somewhere?
|
||||
// or at least a metric
|
||||
|
||||
let domain = get_reduced_domain(url)?;
|
||||
let domain = get_reduced_domain(url).with_context(|| {
|
||||
format!("No domain in URL '{url}' for which we are processing the outcome!")
|
||||
})?;
|
||||
let url = url.clone();
|
||||
|
||||
// TODO(feature) add 1.1× the previous backoff, if there was one.
|
||||
|
@ -518,7 +520,9 @@ impl EventProcessor<'_> {
|
|||
self.store
|
||||
.as_ref()
|
||||
.async_rw_txn(move |txn| {
|
||||
let domain = get_reduced_domain(&url)?;
|
||||
let domain = get_reduced_domain(&url).with_context(|| {
|
||||
format!("No domain for URL '{url}' for which we are processing the page!")
|
||||
})?;
|
||||
txn.mark_url_as_visited(
|
||||
domain.as_ref(),
|
||||
url.as_ref(),
|
||||
|
@ -546,7 +550,9 @@ impl EventProcessor<'_> {
|
|||
self.store
|
||||
.as_ref()
|
||||
.async_rw_txn(move |txn| {
|
||||
let domain = get_reduced_domain(&url)?;
|
||||
let domain = get_reduced_domain(&url).with_context(|| {
|
||||
format!("No domain for URL '{url}' for which we are processing an icon!")
|
||||
})?;
|
||||
txn.mark_url_as_visited(
|
||||
domain.as_ref(),
|
||||
url.as_ref(),
|
||||
|
@ -577,7 +583,9 @@ impl EventProcessor<'_> {
|
|||
self.store
|
||||
.as_ref()
|
||||
.async_rw_txn(move |txn| {
|
||||
let domain = get_reduced_domain(&url)?;
|
||||
let domain = get_reduced_domain(&url).with_context(|| {
|
||||
format!("No domain for URL '{url}' for which we are processing refs!")
|
||||
})?;
|
||||
txn.mark_url_as_visited(
|
||||
domain.as_ref(),
|
||||
url.as_ref(),
|
||||
|
@ -638,7 +646,9 @@ impl EventProcessor<'_> {
|
|||
self.store
|
||||
.as_ref()
|
||||
.async_rw_txn(move |txn| {
|
||||
let domain = get_reduced_domain(&url)?;
|
||||
let domain = get_reduced_domain(&url).with_context(|| {
|
||||
format!("No domain for URL '{url}' for which we are processing a rejection!")
|
||||
})?;
|
||||
txn.mark_url_as_visited(
|
||||
domain.as_ref(),
|
||||
url.as_ref(),
|
||||
|
|
|
@ -516,7 +516,8 @@ impl<'a> RakerTxn<'a, RW> {
|
|||
let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
|
||||
|
||||
let url = Url::parse(url_str)?;
|
||||
let url_domain = get_reduced_domain(&url)?;
|
||||
let url_domain = get_reduced_domain(&url)
|
||||
.with_context(|| format!("No domain for to-be-enqueued URL: '{url}'!"))?;
|
||||
|
||||
let queue_key = format!("{}\n{}", url_domain, url);
|
||||
|
||||
|
@ -568,7 +569,8 @@ impl<'a> RakerTxn<'a, RW> {
|
|||
let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold;
|
||||
|
||||
let url = Url::parse(url_str)?;
|
||||
let url_domain = get_reduced_domain(&url)?;
|
||||
let url_domain = get_reduced_domain(&url)
|
||||
.with_context(|| format!("No domain for to-be-put-on-hold URL: '{url}'!"))?;
|
||||
|
||||
let queue_key = format!("{}\n{}", url_domain, url);
|
||||
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
use anyhow::Context;
|
||||
use std::borrow::Cow;
|
||||
use url::Url;
|
||||
|
||||
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
|
||||
let domain = url.domain().context("URLs must have domains")?;
|
||||
pub fn get_reduced_domain(url: &Url) -> Option<Cow<'_, str>> {
|
||||
// If the URL has no host, or its host is not a domain name (e.g. it is an IP address), this returns None here.
|
||||
let domain = url.domain()?;
|
||||
|
||||
Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
|
||||
Some(Cow::Borrowed(match domain.strip_prefix("www.") {
|
||||
Some(stripped) => stripped,
|
||||
None => domain,
|
||||
}))
|
||||
|
|
Loading…
Reference in New Issue