Allow redirects for favicon rakes

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 19:15:24 +01:00
parent 7bdd7d4fc6
commit 6d6e3c52e3
4 changed files with 48 additions and 8 deletions

View File

@ -43,11 +43,18 @@ pub async fn main() -> anyhow::Result<()> {
let client = reqwest::ClientBuilder::new()
.timeout(TIME_LIMIT)
.default_headers(header_map)
.default_headers(header_map.clone())
// We want to handle redirects ourselves so we can track them...
.redirect(Policy::none())
.build()?;
let redirect_following_client = reqwest::ClientBuilder::new()
.timeout(TIME_LIMIT)
.default_headers(header_map)
// Unlike `client`, this one lets reqwest follow redirects automatically (up to 5 hops).
.redirect(Policy::limited(5))
.build()?;
let mut adblock_engines = Vec::new();
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
@ -76,9 +83,14 @@ pub async fn main() -> anyhow::Result<()> {
page_extraction: PageExtractionService::new(adblock_engines)?,
};
let outcome = raker
.rake(&opts.url, opts.intent.unwrap_or(RakeIntent::Any), &client)
.await?;
let intent = opts.intent.unwrap_or(RakeIntent::Any);
let client_to_use = if intent == RakeIntent::Icon {
&redirect_following_client
} else {
&client
};
let outcome = raker.rake(&opts.url, intent, client_to_use).await?;
match outcome {
RakeOutcome::RakedPage(page) => {

View File

@ -81,13 +81,22 @@ pub async fn main() -> anyhow::Result<()> {
let mut header_map = HeaderMap::new();
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
let _client = reqwest::ClientBuilder::new()
let client = reqwest::ClientBuilder::new()
.pool_idle_timeout(Duration::from_secs(90))
.pool_max_idle_per_host(1)
.timeout(TIME_LIMIT)
.default_headers(header_map.clone())
// We want to handle redirects ourselves so we can track them...
.redirect(Policy::none())
.build()?;
let redirect_following_client = reqwest::ClientBuilder::new()
.pool_idle_timeout(Duration::from_secs(90))
.pool_max_idle_per_host(1)
.timeout(TIME_LIMIT)
.default_headers(header_map)
// Unlike `client`, this one lets reqwest follow redirects automatically (up to 5 hops).
.redirect(Policy::none())
.redirect(Policy::limited(5))
.build()?;
let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
@ -201,7 +210,8 @@ pub async fn main() -> anyhow::Result<()> {
let graceful_stop = Arc::new(AtomicBool::new(false));
let task_context = TaskContext {
store: store.clone(),
client: Default::default(),
client,
redirect_following_client,
raker: Arc::new(raker),
busy_domains: Arc::new(Mutex::new(Default::default())),
robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),

View File

@ -236,6 +236,10 @@ pub struct Raker {
}
impl Raker {
/// Rakes a resource by URL.
///
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
/// most notably when picking up favicons.
pub async fn rake(
&self,
url: &Url,
@ -262,6 +266,12 @@ impl Raker {
)
.context("Failed to resolve Location header target")?;
if intent == RakeIntent::Icon {
// Icons have special handling around redirects: we dereference them by using
// a separate client, but don't store the redirect if we hit the limit!
bail!("Ran out of redirects to fetch icon with.");
}
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::Redirected { http_code },
new_url,

View File

@ -54,6 +54,9 @@ pub struct TaskContext {
/// HTTP client
pub client: Client,
/// HTTP client that follows redirects automatically. Only used for favicons so far.
pub redirect_following_client: Client,
/// The raker
pub raker: Arc<Raker>,
@ -220,7 +223,12 @@ impl TaskContext {
// Now acquire a permit to go and fetch the desired URL
let permit = self.semaphore.acquire().await?;
let raked = self.raker.rake(&url, url_record.intent, &self.client).await;
let client = if url_record.intent == RakeIntent::Icon {
&self.redirect_following_client
} else {
&self.client
};
let raked = self.raker.rake(&url, url_record.intent, client).await;
drop(permit);
// Next time, we need to wait before our request.