Allow redirects for favicon rakes
This commit is contained in:
parent
7bdd7d4fc6
commit
6d6e3c52e3
|
@ -43,11 +43,18 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
|
|
||||||
let client = reqwest::ClientBuilder::new()
|
let client = reqwest::ClientBuilder::new()
|
||||||
.timeout(TIME_LIMIT)
|
.timeout(TIME_LIMIT)
|
||||||
.default_headers(header_map)
|
.default_headers(header_map.clone())
|
||||||
// We want to handle redirects ourselves so we can track them...
|
// We want to handle redirects ourselves so we can track them...
|
||||||
.redirect(Policy::none())
|
.redirect(Policy::none())
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
|
let redirect_following_client = reqwest::ClientBuilder::new()
|
||||||
|
.timeout(TIME_LIMIT)
|
||||||
|
.default_headers(header_map)
|
||||||
|
// Unlike the main client, this one follows redirects automatically (at most 5 hops); used for favicons.
|
||||||
|
.redirect(Policy::limited(5))
|
||||||
|
.build()?;
|
||||||
|
|
||||||
let mut adblock_engines = Vec::new();
|
let mut adblock_engines = Vec::new();
|
||||||
|
|
||||||
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
|
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
|
||||||
|
@ -76,9 +83,14 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
page_extraction: PageExtractionService::new(adblock_engines)?,
|
page_extraction: PageExtractionService::new(adblock_engines)?,
|
||||||
};
|
};
|
||||||
|
|
||||||
let outcome = raker
|
let intent = opts.intent.unwrap_or(RakeIntent::Any);
|
||||||
.rake(&opts.url, opts.intent.unwrap_or(RakeIntent::Any), &client)
|
let client_to_use = if intent == RakeIntent::Icon {
|
||||||
.await?;
|
&redirect_following_client
|
||||||
|
} else {
|
||||||
|
&client
|
||||||
|
};
|
||||||
|
|
||||||
|
let outcome = raker.rake(&opts.url, intent, client_to_use).await?;
|
||||||
|
|
||||||
match outcome {
|
match outcome {
|
||||||
RakeOutcome::RakedPage(page) => {
|
RakeOutcome::RakedPage(page) => {
|
||||||
|
|
|
@ -81,13 +81,22 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
let mut header_map = HeaderMap::new();
|
let mut header_map = HeaderMap::new();
|
||||||
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
|
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
|
||||||
|
|
||||||
let _client = reqwest::ClientBuilder::new()
|
let client = reqwest::ClientBuilder::new()
|
||||||
|
.pool_idle_timeout(Duration::from_secs(90))
|
||||||
|
.pool_max_idle_per_host(1)
|
||||||
|
.timeout(TIME_LIMIT)
|
||||||
|
.default_headers(header_map.clone())
|
||||||
|
// We want to handle redirects ourselves so we can track them...
|
||||||
|
.redirect(Policy::none())
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
let redirect_following_client = reqwest::ClientBuilder::new()
|
||||||
.pool_idle_timeout(Duration::from_secs(90))
|
.pool_idle_timeout(Duration::from_secs(90))
|
||||||
.pool_max_idle_per_host(1)
|
.pool_max_idle_per_host(1)
|
||||||
.timeout(TIME_LIMIT)
|
.timeout(TIME_LIMIT)
|
||||||
.default_headers(header_map)
|
.default_headers(header_map)
|
||||||
// We want to handle redirects ourselves so we can track them...
|
// Unlike the main client, this one follows redirects automatically (at most 5 hops); used for favicons.
|
||||||
.redirect(Policy::none())
|
.redirect(Policy::limited(5))
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
|
let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
|
||||||
|
@ -201,7 +210,8 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
let graceful_stop = Arc::new(AtomicBool::new(false));
|
let graceful_stop = Arc::new(AtomicBool::new(false));
|
||||||
let task_context = TaskContext {
|
let task_context = TaskContext {
|
||||||
store: store.clone(),
|
store: store.clone(),
|
||||||
client: Default::default(),
|
client,
|
||||||
|
redirect_following_client,
|
||||||
raker: Arc::new(raker),
|
raker: Arc::new(raker),
|
||||||
busy_domains: Arc::new(Mutex::new(Default::default())),
|
busy_domains: Arc::new(Mutex::new(Default::default())),
|
||||||
robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),
|
robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),
|
||||||
|
|
|
@ -236,6 +236,10 @@ pub struct Raker {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Raker {
|
impl Raker {
|
||||||
|
/// Rakes a resource by URL.
|
||||||
|
///
|
||||||
|
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
|
||||||
|
/// most notably when picking up favicons.
|
||||||
pub async fn rake(
|
pub async fn rake(
|
||||||
&self,
|
&self,
|
||||||
url: &Url,
|
url: &Url,
|
||||||
|
@ -262,6 +266,12 @@ impl Raker {
|
||||||
)
|
)
|
||||||
.context("Failed to resolve Location header target")?;
|
.context("Failed to resolve Location header target")?;
|
||||||
|
|
||||||
|
if intent == RakeIntent::Icon {
|
||||||
|
// Icons have special handling around redirects: we dereference them by using
|
||||||
|
// a separate client, but don't store the redirect if we hit the limit!
|
||||||
|
bail!("Ran out of redirects to fetch icon with.");
|
||||||
|
}
|
||||||
|
|
||||||
return Ok(RakeOutcome::Redirect {
|
return Ok(RakeOutcome::Redirect {
|
||||||
reason: RedirectReason::Redirected { http_code },
|
reason: RedirectReason::Redirected { http_code },
|
||||||
new_url,
|
new_url,
|
||||||
|
|
|
@ -54,6 +54,9 @@ pub struct TaskContext {
|
||||||
/// HTTP client
|
/// HTTP client
|
||||||
pub client: Client,
|
pub client: Client,
|
||||||
|
|
||||||
|
/// HTTP client that follows redirects automatically. Only used for favicons so far.
|
||||||
|
pub redirect_following_client: Client,
|
||||||
|
|
||||||
/// The raker
|
/// The raker
|
||||||
pub raker: Arc<Raker>,
|
pub raker: Arc<Raker>,
|
||||||
|
|
||||||
|
@ -220,7 +223,12 @@ impl TaskContext {
|
||||||
|
|
||||||
// Now acquire a permit to go and fetch the desired URL
|
// Now acquire a permit to go and fetch the desired URL
|
||||||
let permit = self.semaphore.acquire().await?;
|
let permit = self.semaphore.acquire().await?;
|
||||||
let raked = self.raker.rake(&url, url_record.intent, &self.client).await;
|
let client = if url_record.intent == RakeIntent::Icon {
|
||||||
|
&self.redirect_following_client
|
||||||
|
} else {
|
||||||
|
&self.client
|
||||||
|
};
|
||||||
|
let raked = self.raker.rake(&url, url_record.intent, client).await;
|
||||||
drop(permit);
|
drop(permit);
|
||||||
|
|
||||||
// Next time, we need to wait before our request.
|
// Next time, we need to wait before our request.
|
||||||
|
|
Loading…
Reference in New Issue