Clarify and handle 'No domain for URL' error in a better way
This commit is contained in:
parent
73c72bce25
commit
6d37a07d3e
@ -318,7 +318,8 @@ pub async fn build_seed_lookup_table(
|
|||||||
match &seed.url {
|
match &seed.url {
|
||||||
UrlOrUrlPattern::Url(url_str) => {
|
UrlOrUrlPattern::Url(url_str) => {
|
||||||
let url = Url::parse(url_str)?;
|
let url = Url::parse(url_str)?;
|
||||||
let reduced_domain = get_reduced_domain(&url)?;
|
let reduced_domain = get_reduced_domain(&url)
|
||||||
|
.with_context(|| format!("No domain in seed: '{url}'!"))?;
|
||||||
seed_lookup
|
seed_lookup
|
||||||
.by_reduced_domain
|
.by_reduced_domain
|
||||||
.insert(reduced_domain.into(), seed);
|
.insert(reduced_domain.into(), seed);
|
||||||
@ -338,7 +339,8 @@ impl SeedLookupTable {
|
|||||||
return Ok(Some(seed));
|
return Ok(Some(seed));
|
||||||
}
|
}
|
||||||
|
|
||||||
let domain = get_reduced_domain(url)?;
|
let domain = get_reduced_domain(url)
|
||||||
|
.with_context(|| format!("No domain in looked up URL: '{url}'"))?;
|
||||||
if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
|
if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
|
||||||
return Ok(Some(seed));
|
return Ok(Some(seed));
|
||||||
}
|
}
|
||||||
|
@ -169,7 +169,8 @@ async fn import_and_flush_batch_seeds(
|
|||||||
for seed in buf.drain(..) {
|
for seed in buf.drain(..) {
|
||||||
let as_url = Url::parse(seed.url.as_str())
|
let as_url = Url::parse(seed.url.as_str())
|
||||||
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
||||||
let domain = get_reduced_domain(&as_url)?;
|
let domain = get_reduced_domain(&as_url)
|
||||||
|
.with_context(|| format!("No domain in seed URL '{as_url}'!"))?;
|
||||||
|
|
||||||
let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;
|
let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;
|
||||||
|
|
||||||
@ -244,7 +245,8 @@ async fn import_and_flush_batch_weeds(
|
|||||||
for seed in buf.drain(..) {
|
for seed in buf.drain(..) {
|
||||||
let as_url = Url::parse(seed.url.as_str())
|
let as_url = Url::parse(seed.url.as_str())
|
||||||
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
||||||
let domain = get_reduced_domain(&as_url)?;
|
let domain = get_reduced_domain(&as_url)
|
||||||
|
.with_context(|| format!("No domain in weed URL '{as_url}'!"))?;
|
||||||
|
|
||||||
let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;
|
let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;
|
||||||
|
|
||||||
|
@ -45,6 +45,12 @@ pub fn find_references(
|
|||||||
} => {
|
} => {
|
||||||
if !nofollow {
|
if !nofollow {
|
||||||
if let Ok(full_url) = page_url.join(&href) {
|
if let Ok(full_url) = page_url.join(&href) {
|
||||||
|
if full_url.domain().is_none() {
|
||||||
|
// Skip URLs that don't have a domain after being made absolute.
|
||||||
|
// This also skips IP addresses: we probably don't want to bother
|
||||||
|
// indexing content from explicit IP addresses.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if SUPPORTED_SCHEMES.contains(&full_url.scheme()) {
|
if SUPPORTED_SCHEMES.contains(&full_url.scheme()) {
|
||||||
refs.insert(RakedReference {
|
refs.insert(RakedReference {
|
||||||
target: clean_url(&full_url).to_string(),
|
target: clean_url(&full_url).to_string(),
|
||||||
@ -72,6 +78,10 @@ pub fn find_references(
|
|||||||
add_link_refs(&doc, &mut refs, &page_url);
|
add_link_refs(&doc, &mut refs, &page_url);
|
||||||
|
|
||||||
for feed in feeds {
|
for feed in feeds {
|
||||||
|
if feed.domain().is_none() {
|
||||||
|
// Same rationale as for links above: skip feeds whose URL has no domain (this also skips IP addresses).
|
||||||
|
continue;
|
||||||
|
}
|
||||||
refs.insert(RakedReference {
|
refs.insert(RakedReference {
|
||||||
target: clean_url(feed).as_str().to_owned(),
|
target: clean_url(feed).as_str().to_owned(),
|
||||||
kind: ReferenceKind::HeaderLinkedFeed,
|
kind: ReferenceKind::HeaderLinkedFeed,
|
||||||
|
@ -454,7 +454,9 @@ impl TaskContext {
|
|||||||
// TODO(future) do we want to log this somewhere?
|
// TODO(future) do we want to log this somewhere?
|
||||||
// or at least a metric
|
// or at least a metric
|
||||||
|
|
||||||
let domain = get_reduced_domain(url)?;
|
let domain = get_reduced_domain(url).with_context(|| {
|
||||||
|
format!("No domain in URL '{url}' for which we are processing the outcome!")
|
||||||
|
})?;
|
||||||
let url = url.clone();
|
let url = url.clone();
|
||||||
|
|
||||||
// TODO(feature) add 1.1× the previous backoff, if there was one.
|
// TODO(feature) add 1.1× the previous backoff, if there was one.
|
||||||
@ -518,7 +520,9 @@ impl EventProcessor<'_> {
|
|||||||
self.store
|
self.store
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.async_rw_txn(move |txn| {
|
.async_rw_txn(move |txn| {
|
||||||
let domain = get_reduced_domain(&url)?;
|
let domain = get_reduced_domain(&url).with_context(|| {
|
||||||
|
format!("No domain for URL '{url}' for which we are processing the page!")
|
||||||
|
})?;
|
||||||
txn.mark_url_as_visited(
|
txn.mark_url_as_visited(
|
||||||
domain.as_ref(),
|
domain.as_ref(),
|
||||||
url.as_ref(),
|
url.as_ref(),
|
||||||
@ -546,7 +550,9 @@ impl EventProcessor<'_> {
|
|||||||
self.store
|
self.store
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.async_rw_txn(move |txn| {
|
.async_rw_txn(move |txn| {
|
||||||
let domain = get_reduced_domain(&url)?;
|
let domain = get_reduced_domain(&url).with_context(|| {
|
||||||
|
format!("No domain for URL '{url}' for which we are processing an icon!")
|
||||||
|
})?;
|
||||||
txn.mark_url_as_visited(
|
txn.mark_url_as_visited(
|
||||||
domain.as_ref(),
|
domain.as_ref(),
|
||||||
url.as_ref(),
|
url.as_ref(),
|
||||||
@ -577,7 +583,9 @@ impl EventProcessor<'_> {
|
|||||||
self.store
|
self.store
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.async_rw_txn(move |txn| {
|
.async_rw_txn(move |txn| {
|
||||||
let domain = get_reduced_domain(&url)?;
|
let domain = get_reduced_domain(&url).with_context(|| {
|
||||||
|
format!("No domain for URL '{url}' for which we are processing refs!")
|
||||||
|
})?;
|
||||||
txn.mark_url_as_visited(
|
txn.mark_url_as_visited(
|
||||||
domain.as_ref(),
|
domain.as_ref(),
|
||||||
url.as_ref(),
|
url.as_ref(),
|
||||||
@ -638,7 +646,9 @@ impl EventProcessor<'_> {
|
|||||||
self.store
|
self.store
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.async_rw_txn(move |txn| {
|
.async_rw_txn(move |txn| {
|
||||||
let domain = get_reduced_domain(&url)?;
|
let domain = get_reduced_domain(&url).with_context(|| {
|
||||||
|
format!("No domain for URL '{url}' for which we are processing a rejection!")
|
||||||
|
})?;
|
||||||
txn.mark_url_as_visited(
|
txn.mark_url_as_visited(
|
||||||
domain.as_ref(),
|
domain.as_ref(),
|
||||||
url.as_ref(),
|
url.as_ref(),
|
||||||
|
@ -516,7 +516,8 @@ impl<'a> RakerTxn<'a, RW> {
|
|||||||
let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
|
let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
|
||||||
|
|
||||||
let url = Url::parse(url_str)?;
|
let url = Url::parse(url_str)?;
|
||||||
let url_domain = get_reduced_domain(&url)?;
|
let url_domain = get_reduced_domain(&url)
|
||||||
|
.with_context(|| format!("No domain for to-be-enqueued URL: '{url}'!"))?;
|
||||||
|
|
||||||
let queue_key = format!("{}\n{}", url_domain, url);
|
let queue_key = format!("{}\n{}", url_domain, url);
|
||||||
|
|
||||||
@ -568,7 +569,8 @@ impl<'a> RakerTxn<'a, RW> {
|
|||||||
let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold;
|
let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold;
|
||||||
|
|
||||||
let url = Url::parse(url_str)?;
|
let url = Url::parse(url_str)?;
|
||||||
let url_domain = get_reduced_domain(&url)?;
|
let url_domain = get_reduced_domain(&url)
|
||||||
|
.with_context(|| format!("No domain for to-be-put-on-hold URL: '{url}'!"))?;
|
||||||
|
|
||||||
let queue_key = format!("{}\n{}", url_domain, url);
|
let queue_key = format!("{}\n{}", url_domain, url);
|
||||||
|
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
use anyhow::Context;
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
|
pub fn get_reduced_domain(url: &Url) -> Option<Cow<'_, str>> {
|
||||||
let domain = url.domain().context("URLs must have domains")?;
|
// If the URL has no host, or its host is not a domain (e.g. an IP address), return None here.
|
||||||
|
let domain = url.domain()?;
|
||||||
|
|
||||||
Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
|
Some(Cow::Borrowed(match domain.strip_prefix("www.") {
|
||||||
Some(stripped) => stripped,
|
Some(stripped) => stripped,
|
||||||
None => domain,
|
None => domain,
|
||||||
}))
|
}))
|
||||||
|
Loading…
Reference in New Issue
Block a user