Allow redirects for favicon rakes
This commit is contained in:
parent
7bdd7d4fc6
commit
6d6e3c52e3
|
@ -43,11 +43,18 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.timeout(TIME_LIMIT)
|
||||
.default_headers(header_map)
|
||||
.default_headers(header_map.clone())
|
||||
// We want to handle redirects ourselves so we can track them...
|
||||
.redirect(Policy::none())
|
||||
.build()?;
|
||||
|
||||
let redirect_following_client = reqwest::ClientBuilder::new()
|
||||
.timeout(TIME_LIMIT)
|
||||
.default_headers(header_map)
|
||||
// This client follows redirects automatically (up to 5 hops); it is used for favicon rakes.
|
||||
.redirect(Policy::limited(5))
|
||||
.build()?;
|
||||
|
||||
let mut adblock_engines = Vec::new();
|
||||
|
||||
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
|
||||
|
@ -76,9 +83,14 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
page_extraction: PageExtractionService::new(adblock_engines)?,
|
||||
};
|
||||
|
||||
let outcome = raker
|
||||
.rake(&opts.url, opts.intent.unwrap_or(RakeIntent::Any), &client)
|
||||
.await?;
|
||||
let intent = opts.intent.unwrap_or(RakeIntent::Any);
|
||||
let client_to_use = if intent == RakeIntent::Icon {
|
||||
&redirect_following_client
|
||||
} else {
|
||||
&client
|
||||
};
|
||||
|
||||
let outcome = raker.rake(&opts.url, intent, client_to_use).await?;
|
||||
|
||||
match outcome {
|
||||
RakeOutcome::RakedPage(page) => {
|
||||
|
|
|
@ -81,13 +81,22 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
let mut header_map = HeaderMap::new();
|
||||
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
|
||||
|
||||
let _client = reqwest::ClientBuilder::new()
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.pool_idle_timeout(Duration::from_secs(90))
|
||||
.pool_max_idle_per_host(1)
|
||||
.timeout(TIME_LIMIT)
|
||||
.default_headers(header_map.clone())
|
||||
// We want to handle redirects ourselves so we can track them...
|
||||
.redirect(Policy::none())
|
||||
.build()?;
|
||||
|
||||
let redirect_following_client = reqwest::ClientBuilder::new()
|
||||
.pool_idle_timeout(Duration::from_secs(90))
|
||||
.pool_max_idle_per_host(1)
|
||||
.timeout(TIME_LIMIT)
|
||||
.default_headers(header_map)
|
||||
// This client follows redirects automatically (up to 5 hops); it is only used for favicons so far.
|
||||
.redirect(Policy::none())
|
||||
.redirect(Policy::limited(5))
|
||||
.build()?;
|
||||
|
||||
let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
|
||||
|
@ -201,7 +210,8 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
let graceful_stop = Arc::new(AtomicBool::new(false));
|
||||
let task_context = TaskContext {
|
||||
store: store.clone(),
|
||||
client: Default::default(),
|
||||
client,
|
||||
redirect_following_client,
|
||||
raker: Arc::new(raker),
|
||||
busy_domains: Arc::new(Mutex::new(Default::default())),
|
||||
robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),
|
||||
|
|
|
@ -236,6 +236,10 @@ pub struct Raker {
|
|||
}
|
||||
|
||||
impl Raker {
|
||||
/// Rakes a resource by URL.
|
||||
///
|
||||
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
|
||||
/// most notably when picking up favicons.
|
||||
pub async fn rake(
|
||||
&self,
|
||||
url: &Url,
|
||||
|
@ -262,6 +266,12 @@ impl Raker {
|
|||
)
|
||||
.context("Failed to resolve Location header target")?;
|
||||
|
||||
if intent == RakeIntent::Icon {
|
||||
// Icons have special handling around redirects: we dereference them by using
|
||||
// a separate client, but don't store the redirect if we hit the limit!
|
||||
bail!("Ran out of redirects to fetch icon with.");
|
||||
}
|
||||
|
||||
return Ok(RakeOutcome::Redirect {
|
||||
reason: RedirectReason::Redirected { http_code },
|
||||
new_url,
|
||||
|
|
|
@ -54,6 +54,9 @@ pub struct TaskContext {
|
|||
/// HTTP client
|
||||
pub client: Client,
|
||||
|
||||
/// HTTP client that follows redirects automatically. Only used for favicons so far.
|
||||
pub redirect_following_client: Client,
|
||||
|
||||
/// The raker
|
||||
pub raker: Arc<Raker>,
|
||||
|
||||
|
@ -220,7 +223,12 @@ impl TaskContext {
|
|||
|
||||
// Now acquire a permit to go and fetch the desired URL
|
||||
let permit = self.semaphore.acquire().await?;
|
||||
let raked = self.raker.rake(&url, url_record.intent, &self.client).await;
|
||||
let client = if url_record.intent == RakeIntent::Icon {
|
||||
&self.redirect_following_client
|
||||
} else {
|
||||
&self.client
|
||||
};
|
||||
let raked = self.raker.rake(&url, url_record.intent, client).await;
|
||||
drop(permit);
|
||||
|
||||
// Next time, we need to wait before our request.
|
||||
|
|
Loading…
Reference in New Issue