From 1c10cb203af61f7fa40816c797d4b12110cb18c6 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Thu, 30 Mar 2023 23:40:43 +0100 Subject: [PATCH] Dodge some places where we enqueue URLs without checking they have supported schemes --- quickpeep_raker/src/bin/qp-seedrake.rs | 7 +++++-- quickpeep_raker/src/raking/references.rs | 2 +- quickpeep_raker/src/raking/task.rs | 6 ++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/quickpeep_raker/src/bin/qp-seedrake.rs b/quickpeep_raker/src/bin/qp-seedrake.rs index 8cff450..570eb6a 100644 --- a/quickpeep_raker/src/bin/qp-seedrake.rs +++ b/quickpeep_raker/src/bin/qp-seedrake.rs @@ -12,6 +12,7 @@ use tokio::sync::mpsc; use tokio::sync::mpsc::Receiver; use quickpeep_raker::config::RakerConfig; +use quickpeep_raker::raking::references::SUPPORTED_SCHEMES; use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent}; use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord}; use quickpeep_raker::storage::{maintenance, RakerStore}; @@ -226,8 +227,10 @@ async fn import_and_flush_batch_seeds( // look at robots.txt and discover sitemaps! if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? { for sitemap in robots_txt.sitemaps { - txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?; - stats.new_sitemaps += 1; + if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) { + txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?; + stats.new_sitemaps += 1; + } } } } diff --git a/quickpeep_raker/src/raking/references.rs b/quickpeep_raker/src/raking/references.rs index d787c05..bb7563a 100644 --- a/quickpeep_raker/src/raking/references.rs +++ b/quickpeep_raker/src/raking/references.rs @@ -61,7 +61,7 @@ pub fn find_references( debug!( "ignoring reference {:?}: not a supported scheme", full_url.as_str() - ) + ); } } else { debug!("Can't join {:?} + {:?} to get full URL", page_url, href); diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index 2c2c87d..8bbddb0 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -1,5 +1,5 @@ use crate::config::RerakeTimings; -use crate::raking::references::{clean_url, references_from_urlrakes}; +use crate::raking::references::{clean_url, references_from_urlrakes, SUPPORTED_SCHEMES}; use crate::raking::{ get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent, RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason, @@ -489,7 +489,9 @@ impl EventProcessor<'_> { // If there's a favicon to be tried, add it to the list... let favicon_url_rel = page.document.head.effective_favicon_url(); if let Ok(favicon_url) = url.join(favicon_url_rel) { - txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?; + if SUPPORTED_SCHEMES.contains(&favicon_url.scheme()) { + txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?; + } } txn.commit()?;