Fix some places where we enqueue URLs without checking that they have supported schemes
This commit is contained in:
parent
1e8aa95e7a
commit
1c10cb203a
|
@ -12,6 +12,7 @@ use tokio::sync::mpsc;
|
||||||
use tokio::sync::mpsc::Receiver;
|
use tokio::sync::mpsc::Receiver;
|
||||||
|
|
||||||
use quickpeep_raker::config::RakerConfig;
|
use quickpeep_raker::config::RakerConfig;
|
||||||
|
use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
|
||||||
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
||||||
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
||||||
use quickpeep_raker::storage::{maintenance, RakerStore};
|
use quickpeep_raker::storage::{maintenance, RakerStore};
|
||||||
|
@ -226,8 +227,10 @@ async fn import_and_flush_batch_seeds(
|
||||||
// look at robots.txt and discover sitemaps!
|
// look at robots.txt and discover sitemaps!
|
||||||
if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
|
if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
|
||||||
for sitemap in robots_txt.sitemaps {
|
for sitemap in robots_txt.sitemaps {
|
||||||
txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
|
if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) {
|
||||||
stats.new_sitemaps += 1;
|
txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
|
||||||
|
stats.new_sitemaps += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -61,7 +61,7 @@ pub fn find_references(
|
||||||
debug!(
|
debug!(
|
||||||
"ignoring reference {:?}: not a supported scheme",
|
"ignoring reference {:?}: not a supported scheme",
|
||||||
full_url.as_str()
|
full_url.as_str()
|
||||||
)
|
);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
debug!("Can't join {:?} + {:?} to get full URL", page_url, href);
|
debug!("Can't join {:?} + {:?} to get full URL", page_url, href);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
use crate::config::RerakeTimings;
|
use crate::config::RerakeTimings;
|
||||||
use crate::raking::references::{clean_url, references_from_urlrakes};
|
use crate::raking::references::{clean_url, references_from_urlrakes, SUPPORTED_SCHEMES};
|
||||||
use crate::raking::{
|
use crate::raking::{
|
||||||
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
||||||
RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
|
RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
|
||||||
|
@ -489,7 +489,9 @@ impl EventProcessor<'_> {
|
||||||
// If there's a favicon to be tried, add it to the list...
|
// If there's a favicon to be tried, add it to the list...
|
||||||
let favicon_url_rel = page.document.head.effective_favicon_url();
|
let favicon_url_rel = page.document.head.effective_favicon_url();
|
||||||
if let Ok(favicon_url) = url.join(favicon_url_rel) {
|
if let Ok(favicon_url) = url.join(favicon_url_rel) {
|
||||||
txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
|
if SUPPORTED_SCHEMES.contains(&favicon_url.scheme()) {
|
||||||
|
txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
txn.commit()?;
|
txn.commit()?;
|
||||||
|
|
Loading…
Reference in New Issue