Dodge some places where we enqueue URLs without checking they have supported schemes
ci/woodpecker/push/check Pipeline failed Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

rei/rakerstore_postgres_overhaul
Olivier 'reivilibre' 2023-03-30 23:40:43 +01:00
parent 1e8aa95e7a
commit 1c10cb203a
3 changed files with 10 additions and 5 deletions

View File

@@ -12,6 +12,7 @@ use tokio::sync::mpsc;
 use tokio::sync::mpsc::Receiver;
 use quickpeep_raker::config::RakerConfig;
+use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
 use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
 use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
 use quickpeep_raker::storage::{maintenance, RakerStore};
@@ -226,8 +227,10 @@ async fn import_and_flush_batch_seeds(
 // look at robots.txt and discover sitemaps!
 if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
 for sitemap in robots_txt.sitemaps {
-txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
-stats.new_sitemaps += 1;
+if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) {
+txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
+stats.new_sitemaps += 1;
+}
 }
 }
 }

View File

@@ -61,7 +61,7 @@ pub fn find_references(
 debug!(
 "ignoring reference {:?}: not a supported scheme",
 full_url.as_str()
-)
+);
 }
 } else {
 debug!("Can't join {:?} + {:?} to get full URL", page_url, href);

View File

@@ -1,5 +1,5 @@
 use crate::config::RerakeTimings;
-use crate::raking::references::{clean_url, references_from_urlrakes};
+use crate::raking::references::{clean_url, references_from_urlrakes, SUPPORTED_SCHEMES};
 use crate::raking::{
 get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
 RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
@@ -489,7 +489,9 @@ impl EventProcessor<'_> {
 // If there's a favicon to be tried, add it to the list...
 let favicon_url_rel = page.document.head.effective_favicon_url();
 if let Ok(favicon_url) = url.join(favicon_url_rel) {
-txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
+if SUPPORTED_SCHEMES.contains(&favicon_url.scheme()) {
+txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
+}
 }
 txn.commit()?;