Allow redirects for favicon rakes

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 19:15:24 +01:00
parent 7bdd7d4fc6
commit 6d6e3c52e3
4 changed files with 48 additions and 8 deletions

View File

@ -43,11 +43,18 @@ pub async fn main() -> anyhow::Result<()> {
let client = reqwest::ClientBuilder::new() let client = reqwest::ClientBuilder::new()
.timeout(TIME_LIMIT) .timeout(TIME_LIMIT)
.default_headers(header_map) .default_headers(header_map.clone())
// We want to handle redirects ourselves so we can track them... // We want to handle redirects ourselves so we can track them...
.redirect(Policy::none()) .redirect(Policy::none())
.build()?; .build()?;
let redirect_following_client = reqwest::ClientBuilder::new()
.timeout(TIME_LIMIT)
.default_headers(header_map)
// Unlike `client`, this one follows redirects automatically (up to 5 hops); used for favicons, where redirects aren't tracked.
.redirect(Policy::limited(5))
.build()?;
let mut adblock_engines = Vec::new(); let mut adblock_engines = Vec::new();
for (antifeature, name) in &ADBLOCK_FILTER_PATHS { for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
@ -76,9 +83,14 @@ pub async fn main() -> anyhow::Result<()> {
page_extraction: PageExtractionService::new(adblock_engines)?, page_extraction: PageExtractionService::new(adblock_engines)?,
}; };
let outcome = raker let intent = opts.intent.unwrap_or(RakeIntent::Any);
.rake(&opts.url, opts.intent.unwrap_or(RakeIntent::Any), &client) let client_to_use = if intent == RakeIntent::Icon {
.await?; &redirect_following_client
} else {
&client
};
let outcome = raker.rake(&opts.url, intent, client_to_use).await?;
match outcome { match outcome {
RakeOutcome::RakedPage(page) => { RakeOutcome::RakedPage(page) => {

View File

@ -81,13 +81,22 @@ pub async fn main() -> anyhow::Result<()> {
let mut header_map = HeaderMap::new(); let mut header_map = HeaderMap::new();
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT)); header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
let _client = reqwest::ClientBuilder::new() let client = reqwest::ClientBuilder::new()
.pool_idle_timeout(Duration::from_secs(90))
.pool_max_idle_per_host(1)
.timeout(TIME_LIMIT)
.default_headers(header_map.clone())
// We want to handle redirects ourselves so we can track them...
.redirect(Policy::none())
.build()?;
let redirect_following_client = reqwest::ClientBuilder::new()
.pool_idle_timeout(Duration::from_secs(90)) .pool_idle_timeout(Duration::from_secs(90))
.pool_max_idle_per_host(1) .pool_max_idle_per_host(1)
.timeout(TIME_LIMIT) .timeout(TIME_LIMIT)
.default_headers(header_map) .default_headers(header_map)
// We want to handle redirects ourselves so we can track them... // Unlike `client`, this one follows redirects automatically (up to 5 hops); used for favicons, where redirects aren't tracked.
.redirect(Policy::none()) .redirect(Policy::limited(5))
.build()?; .build()?;
let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?; let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
@ -201,7 +210,8 @@ pub async fn main() -> anyhow::Result<()> {
let graceful_stop = Arc::new(AtomicBool::new(false)); let graceful_stop = Arc::new(AtomicBool::new(false));
let task_context = TaskContext { let task_context = TaskContext {
store: store.clone(), store: store.clone(),
client: Default::default(), client,
redirect_following_client,
raker: Arc::new(raker), raker: Arc::new(raker),
busy_domains: Arc::new(Mutex::new(Default::default())), busy_domains: Arc::new(Mutex::new(Default::default())),
robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))), robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),

View File

@ -236,6 +236,10 @@ pub struct Raker {
} }
impl Raker { impl Raker {
/// Rakes a resource by URL.
///
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
/// most notably when picking up favicons.
pub async fn rake( pub async fn rake(
&self, &self,
url: &Url, url: &Url,
@ -262,6 +266,12 @@ impl Raker {
) )
.context("Failed to resolve Location header target")?; .context("Failed to resolve Location header target")?;
if intent == RakeIntent::Icon {
// Icons have special handling around redirects: we dereference them by using
// a separate client, but don't store the redirect if we hit the limit!
bail!("Ran out of redirects to fetch icon with.");
}
return Ok(RakeOutcome::Redirect { return Ok(RakeOutcome::Redirect {
reason: RedirectReason::Redirected { http_code }, reason: RedirectReason::Redirected { http_code },
new_url, new_url,

View File

@ -54,6 +54,9 @@ pub struct TaskContext {
/// HTTP client /// HTTP client
pub client: Client, pub client: Client,
/// HTTP client that follows redirects automatically. Only used for favicons so far.
pub redirect_following_client: Client,
/// The raker /// The raker
pub raker: Arc<Raker>, pub raker: Arc<Raker>,
@ -220,7 +223,12 @@ impl TaskContext {
// Now acquire a permit to go and fetch the desired URL // Now acquire a permit to go and fetch the desired URL
let permit = self.semaphore.acquire().await?; let permit = self.semaphore.acquire().await?;
let raked = self.raker.rake(&url, url_record.intent, &self.client).await; let client = if url_record.intent == RakeIntent::Icon {
&self.redirect_following_client
} else {
&self.client
};
let raked = self.raker.rake(&url, url_record.intent, client).await;
drop(permit); drop(permit);
// Next time, we need to wait before our request. // Next time, we need to wait before our request.