diff --git a/quickpeep_raker/src/bin/qp-rake1.rs b/quickpeep_raker/src/bin/qp-rake1.rs
index 7e738ed..bd004bd 100644
--- a/quickpeep_raker/src/bin/qp-rake1.rs
+++ b/quickpeep_raker/src/bin/qp-rake1.rs
@@ -43,11 +43,18 @@ pub async fn main() -> anyhow::Result<()> {
 
     let client = reqwest::ClientBuilder::new()
         .timeout(TIME_LIMIT)
-        .default_headers(header_map)
+        .default_headers(header_map.clone())
         // We want to handle redirects ourselves so we can track them...
         .redirect(Policy::none())
         .build()?;
 
+    let redirect_following_client = reqwest::ClientBuilder::new()
+        .timeout(TIME_LIMIT)
+        .default_headers(header_map)
+        // We want to handle redirects ourselves so we can track them...
+        .redirect(Policy::limited(5))
+        .build()?;
+
     let mut adblock_engines = Vec::new();
 
     for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
@@ -76,9 +83,14 @@ pub async fn main() -> anyhow::Result<()> {
         page_extraction: PageExtractionService::new(adblock_engines)?,
     };
 
-    let outcome = raker
-        .rake(&opts.url, opts.intent.unwrap_or(RakeIntent::Any), &client)
-        .await?;
+    let intent = opts.intent.unwrap_or(RakeIntent::Any);
+    let client_to_use = if intent == RakeIntent::Icon {
+        &redirect_following_client
+    } else {
+        &client
+    };
+
+    let outcome = raker.rake(&opts.url, intent, client_to_use).await?;
 
     match outcome {
         RakeOutcome::RakedPage(page) => {
diff --git a/quickpeep_raker/src/bin/qp-raker.rs b/quickpeep_raker/src/bin/qp-raker.rs
index 0a8686a..7c5d3fc 100644
--- a/quickpeep_raker/src/bin/qp-raker.rs
+++ b/quickpeep_raker/src/bin/qp-raker.rs
@@ -81,13 +81,22 @@ pub async fn main() -> anyhow::Result<()> {
     let mut header_map = HeaderMap::new();
     header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
 
-    let _client = reqwest::ClientBuilder::new()
+    let client = reqwest::ClientBuilder::new()
+        .pool_idle_timeout(Duration::from_secs(90))
+        .pool_max_idle_per_host(1)
+        .timeout(TIME_LIMIT)
+        .default_headers(header_map.clone())
+        // We want to handle redirects ourselves so we can track them...
+        .redirect(Policy::none())
+        .build()?;
+
+    let redirect_following_client = reqwest::ClientBuilder::new()
         .pool_idle_timeout(Duration::from_secs(90))
         .pool_max_idle_per_host(1)
         .timeout(TIME_LIMIT)
         .default_headers(header_map)
         // We want to handle redirects ourselves so we can track them...
-        .redirect(Policy::none())
+        .redirect(Policy::limited(5))
         .build()?;
 
     let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
@@ -201,7 +210,8 @@ pub async fn main() -> anyhow::Result<()> {
     let graceful_stop = Arc::new(AtomicBool::new(false));
     let task_context = TaskContext {
         store: store.clone(),
-        client: Default::default(),
+        client,
+        redirect_following_client,
         raker: Arc::new(raker),
         busy_domains: Arc::new(Mutex::new(Default::default())),
         robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),
diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs
index b9d88f0..da8910c 100644
--- a/quickpeep_raker/src/raking.rs
+++ b/quickpeep_raker/src/raking.rs
@@ -236,6 +236,10 @@ pub struct Raker {
 }
 
 impl Raker {
+    /// Rakes a resource by URL.
+    ///
+    /// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
+    /// most notably when picking up favicons.
     pub async fn rake(
         &self,
         url: &Url,
@@ -262,6 +266,12 @@ impl Raker {
                     )
                     .context("Failed to resolve Location header target")?;
 
+                if intent == RakeIntent::Icon {
+                    // Icons have special handling around redirects: we dereference them by using
+                    // a separate client, but don't store the redirect if we hit the limit!
+                    bail!("Ran out of redirects to fetch icon with.");
+                }
+
                 return Ok(RakeOutcome::Redirect {
                     reason: RedirectReason::Redirected { http_code },
                     new_url,
diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs
index f635236..9c83597 100644
--- a/quickpeep_raker/src/raking/task.rs
+++ b/quickpeep_raker/src/raking/task.rs
@@ -54,6 +54,9 @@ pub struct TaskContext {
     /// HTTP client
     pub client: Client,
 
+    /// HTTP client that follows redirects automatically. Only used for favicons so far.
+    pub redirect_following_client: Client,
+
     /// The raker
     pub raker: Arc<Raker>,
 
@@ -220,7 +223,12 @@ impl TaskContext {
         // Now acquire a permit to go and fetch the desired URL
         let permit = self.semaphore.acquire().await?;
 
-        let raked = self.raker.rake(&url, url_record.intent, &self.client).await;
+        let client = if url_record.intent == RakeIntent::Icon {
+            &self.redirect_following_client
+        } else {
+            &self.client
+        };
+        let raked = self.raker.rake(&url, url_record.intent, client).await;
         drop(permit);
 
         // Next time, we need to wait before our request.