Clarify and handle 'No domain for URL' error in a better way
ci/woodpecker/push/check Pipeline failed Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

This commit is contained in:
Olivier 'reivilibre' 2023-03-21 23:36:47 +00:00
parent 73c72bce25
commit 6d37a07d3e
6 changed files with 41 additions and 15 deletions

View File

@ -318,7 +318,8 @@ pub async fn build_seed_lookup_table(
match &seed.url {
UrlOrUrlPattern::Url(url_str) => {
let url = Url::parse(url_str)?;
let reduced_domain = get_reduced_domain(&url)?;
let reduced_domain = get_reduced_domain(&url)
.with_context(|| format!("No domain in seed: '{url}'!"))?;
seed_lookup
.by_reduced_domain
.insert(reduced_domain.into(), seed);
@ -338,7 +339,8 @@ impl SeedLookupTable {
return Ok(Some(seed));
}
let domain = get_reduced_domain(url)?;
let domain = get_reduced_domain(url)
.with_context(|| format!("No domain in looked up URL: '{url}'"))?;
if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
return Ok(Some(seed));
}

View File

@ -169,7 +169,8 @@ async fn import_and_flush_batch_seeds(
for seed in buf.drain(..) {
let as_url = Url::parse(seed.url.as_str())
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
let domain = get_reduced_domain(&as_url)?;
let domain = get_reduced_domain(&as_url)
.with_context(|| format!("No domain in seed URL '{as_url}'!"))?;
let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;
@ -244,7 +245,8 @@ async fn import_and_flush_batch_weeds(
for seed in buf.drain(..) {
let as_url = Url::parse(seed.url.as_str())
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
let domain = get_reduced_domain(&as_url)?;
let domain = get_reduced_domain(&as_url)
.with_context(|| format!("No domain in weed URL '{as_url}'!"))?;
let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;

View File

@ -45,6 +45,12 @@ pub fn find_references(
} => {
if !nofollow {
if let Ok(full_url) = page_url.join(&href) {
if full_url.domain().is_none() {
// Skip URLs that don't have a domain after being made absolute.
// This also skips IP addresses: we probably don't want to bother
// indexing content from explicit IP addresses.
continue;
}
if SUPPORTED_SCHEMES.contains(&full_url.scheme()) {
refs.insert(RakedReference {
target: clean_url(&full_url).to_string(),
@ -72,6 +78,10 @@ pub fn find_references(
add_link_refs(&doc, &mut refs, &page_url);
for feed in feeds {
if feed.domain().is_none() {
// same rationale as above.
continue;
}
refs.insert(RakedReference {
target: clean_url(feed).as_str().to_owned(),
kind: ReferenceKind::HeaderLinkedFeed,

View File

@ -454,7 +454,9 @@ impl TaskContext {
// TODO(future) do we want to log this somewhere?
// or at least a metric
let domain = get_reduced_domain(url)?;
let domain = get_reduced_domain(url).with_context(|| {
format!("No domain in URL '{url}' for which we are processing the outcome!")
})?;
let url = url.clone();
// TODO(feature) add 1.1× the previous backoff, if there was one.
@ -518,7 +520,9 @@ impl EventProcessor<'_> {
self.store
.as_ref()
.async_rw_txn(move |txn| {
let domain = get_reduced_domain(&url)?;
let domain = get_reduced_domain(&url).with_context(|| {
format!("No domain for URL '{url}' for which we are processing the page!")
})?;
txn.mark_url_as_visited(
domain.as_ref(),
url.as_ref(),
@ -546,7 +550,9 @@ impl EventProcessor<'_> {
self.store
.as_ref()
.async_rw_txn(move |txn| {
let domain = get_reduced_domain(&url)?;
let domain = get_reduced_domain(&url).with_context(|| {
format!("No domain for URL '{url}' for which we are processing an icon!")
})?;
txn.mark_url_as_visited(
domain.as_ref(),
url.as_ref(),
@ -577,7 +583,9 @@ impl EventProcessor<'_> {
self.store
.as_ref()
.async_rw_txn(move |txn| {
let domain = get_reduced_domain(&url)?;
let domain = get_reduced_domain(&url).with_context(|| {
format!("No domain for URL '{url}' for which we are processing refs!")
})?;
txn.mark_url_as_visited(
domain.as_ref(),
url.as_ref(),
@ -638,7 +646,9 @@ impl EventProcessor<'_> {
self.store
.as_ref()
.async_rw_txn(move |txn| {
let domain = get_reduced_domain(&url)?;
let domain = get_reduced_domain(&url).with_context(|| {
format!("No domain for URL '{url}' for which we are processing a rejection!")
})?;
txn.mark_url_as_visited(
domain.as_ref(),
url.as_ref(),

View File

@ -516,7 +516,8 @@ impl<'a> RakerTxn<'a, RW> {
let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
let url = Url::parse(url_str)?;
let url_domain = get_reduced_domain(&url)?;
let url_domain = get_reduced_domain(&url)
.with_context(|| format!("No domain for to-be-enqueued URL: '{url}'!"))?;
let queue_key = format!("{}\n{}", url_domain, url);
@ -568,7 +569,8 @@ impl<'a> RakerTxn<'a, RW> {
let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold;
let url = Url::parse(url_str)?;
let url_domain = get_reduced_domain(&url)?;
let url_domain = get_reduced_domain(&url)
.with_context(|| format!("No domain for to-be-put-on-hold URL: '{url}'!"))?;
let queue_key = format!("{}\n{}", url_domain, url);

View File

@ -1,11 +1,11 @@
use anyhow::Context;
use std::borrow::Cow;
use url::Url;
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
let domain = url.domain().context("URLs must have domains")?;
pub fn get_reduced_domain(url: &Url) -> Option<Cow<'_, str>> {
// If the URL has no host, or its host is not a domain name (e.g. it is an IP address), return None here.
let domain = url.domain()?;
Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
Some(Cow::Borrowed(match domain.strip_prefix("www.") {
Some(stripped) => stripped,
None => domain,
}))