Dodge some places where we enqueue URLs without checking they have supported schemes
ci/woodpecker/push/check Pipeline failed Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

This commit is contained in:
Olivier 'reivilibre' 2023-03-30 23:40:43 +01:00
parent 1e8aa95e7a
commit 1c10cb203a
3 changed files with 10 additions and 5 deletions

View File

@@ -12,6 +12,7 @@ use tokio::sync::mpsc;
use tokio::sync::mpsc::Receiver; use tokio::sync::mpsc::Receiver;
use quickpeep_raker::config::RakerConfig; use quickpeep_raker::config::RakerConfig;
use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent}; use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord}; use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
use quickpeep_raker::storage::{maintenance, RakerStore}; use quickpeep_raker::storage::{maintenance, RakerStore};
@@ -226,12 +227,14 @@ async fn import_and_flush_batch_seeds(
// look at robots.txt and discover sitemaps! // look at robots.txt and discover sitemaps!
if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? { if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
for sitemap in robots_txt.sitemaps { for sitemap in robots_txt.sitemaps {
if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) {
txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?; txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
stats.new_sitemaps += 1; stats.new_sitemaps += 1;
} }
} }
} }
} }
}
txn.commit()?; txn.commit()?;
Ok(()) Ok(())
} }

View File

@@ -61,7 +61,7 @@ pub fn find_references(
debug!( debug!(
"ignoring reference {:?}: not a supported scheme", "ignoring reference {:?}: not a supported scheme",
full_url.as_str() full_url.as_str()
) );
} }
} else { } else {
debug!("Can't join {:?} + {:?} to get full URL", page_url, href); debug!("Can't join {:?} + {:?} to get full URL", page_url, href);

View File

@@ -1,5 +1,5 @@
use crate::config::RerakeTimings; use crate::config::RerakeTimings;
use crate::raking::references::{clean_url, references_from_urlrakes}; use crate::raking::references::{clean_url, references_from_urlrakes, SUPPORTED_SCHEMES};
use crate::raking::{ use crate::raking::{
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent, get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason, RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
@@ -489,8 +489,10 @@ impl EventProcessor<'_> {
// If there's a favicon to be tried, add it to the list... // If there's a favicon to be tried, add it to the list...
let favicon_url_rel = page.document.head.effective_favicon_url(); let favicon_url_rel = page.document.head.effective_favicon_url();
if let Ok(favicon_url) = url.join(favicon_url_rel) { if let Ok(favicon_url) = url.join(favicon_url_rel) {
if SUPPORTED_SCHEMES.contains(&favicon_url.scheme()) {
txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?; txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
} }
}
txn.commit()?; txn.commit()?;
Ok(()) Ok(())