Don't hammer robots.txt

This commit is contained in:
Olivier 'reivilibre' 2022-06-04 23:54:22 +01:00
parent d8f4baf9a3
commit d18d0635d7

View File

@@ -182,7 +182,27 @@ impl TaskContext {
let robot_url = robots_txt_url_for(&url)?; let robot_url = robots_txt_url_for(&url)?;
if Some(&robot_url) != current_robot_rules_url.as_ref() { if Some(&robot_url) != current_robot_rules_url.as_ref() {
// We need to update our robot rules! // We need to update our robot rules!
current_robot_rules = self.get_robot_rules(&url).await?; match self.get_robot_rules(&url).await {
Ok(rules) => {
current_robot_rules = rules;
}
Err(err) => {
self.process_outcome(
&url,
RakeOutcome::TemporaryFailure(TemporaryFailure {
reason: TemporaryFailureReason::UnknownClientError(format!(
"robots.txt failure {:?}: {:?}",
url, err
)),
// Back off for a day: this ought to be enough time for the operator to fix the problem... maybe?
backoff_sec: 86400,
}),
)
.await?;
// Forcefully change domain
return Ok(());
}
}
current_robot_rules_url = Some(robot_url); current_robot_rules_url = Some(robot_url);
} }