diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs index a8ed0db..6aed87e 100644 --- a/quickpeep_raker/src/raking.rs +++ b/quickpeep_raker/src/raking.rs @@ -37,24 +37,6 @@ pub const SIZE_LIMIT: usize = 4 * 1024 * 1024; pub const TIME_LIMIT: Duration = Duration::from_secs(10); pub const RAKER_USER_AGENT: &'static str = "QuickPeepBot"; -lazy_static! { - pub static ref IMAGE_MIME_TYPES: HashMap<&'static str, ImageFormat> = { - [ - ("image/png", ImageFormat::Png), - ("image/webp", ImageFormat::WebP), - ("image/jpeg", ImageFormat::Jpeg), - ("image/gif", ImageFormat::Gif), - ("image/vnd.microsoft.icon", ImageFormat::Ico), - ("image/x-icon", ImageFormat::Ico), - ("image/icon", ImageFormat::Ico), - ("image/ico", ImageFormat::Ico), - ("application/ico", ImageFormat::Ico), - ] - .into_iter() - .collect() - }; -} - pub enum RakeOutcome { RakedPage(RakedPage), RakedFeed(Vec), @@ -186,7 +168,22 @@ impl From for RakeIntent { } } +impl RakeIntent { + pub fn supports_mime_type(&self, mime_type: &str) -> bool { + match self { + RakeIntent::Any => ALL_MIME_TYPES.contains(mime_type), + RakeIntent::Page => PAGE_MIME_TYPES.contains(mime_type), + RakeIntent::Feed => FEED_MIME_TYPES.contains(mime_type), + RakeIntent::SiteMap => SITEMAP_MIME_TYPES.contains(mime_type), + RakeIntent::Icon => IMAGE_MIME_TYPES.contains_key(mime_type), + } + } +} + lazy_static! { + static ref PAGE_MIME_TYPES: HashSet<&'static str> = + HashSet::from_iter(vec!["text/html", "text/gemini",]); + static ref SITEMAP_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec!["text/xml", "application/xml",]); @@ -208,6 +205,30 @@ lazy_static! { "application/rdf+xml", "application/feed+json" ]); + + pub static ref IMAGE_MIME_TYPES: HashMap<&'static str, ImageFormat> = { + [ + ("image/png", ImageFormat::Png), + ("image/webp", ImageFormat::WebP), + ("image/jpeg", ImageFormat::Jpeg), + ("image/gif", ImageFormat::Gif), + ("image/vnd.microsoft.icon", ImageFormat::Ico), + ("image/x-icon", ImageFormat::Ico), + ("image/icon", ImageFormat::Ico), + ("image/ico", ImageFormat::Ico), + ("application/ico", ImageFormat::Ico), + ] + .into_iter() + .collect() + }; + + pub static ref ALL_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter( + PAGE_MIME_TYPES.iter().cloned() + .chain(SITEMAP_MIME_TYPES.iter().cloned()) + .chain(FEED_MIME_TYPES.iter().cloned()) + .chain(FEED_LINK_MIME_TYPES.iter().cloned()) + .chain(IMAGE_MIME_TYPES.keys().cloned()) + ); } async fn response_to_bytes_limited( @@ -348,7 +369,12 @@ impl Raker { let content_type = content_type .to_str() .context("Can't convert content-type to str")?; - content_type.split(";").next().unwrap().trim().to_owned() + content_type + .split(";") + .next() + .unwrap() + .trim() + .to_lowercase() } else { increment_counter!("qprake_rake_specific_fail_count", "reason" => "NoCT"); return Ok(RakeOutcome::PermanentFailure(PermanentFailure { @@ -356,6 +382,13 @@ impl Raker { })); }; + if !intent.supports_mime_type(&content_type) { + increment_counter!("qprake_rake_specific_fail_count", "reason" => "OtherCT"); + return Ok(RakeOutcome::PermanentFailure(PermanentFailure { + reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()), + })); + } + let headers = response.headers().clone(); let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;