From d18d0635d7939b811cdb0f6be1f0085d1b5d98cb Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sat, 4 Jun 2022 23:54:22 +0100 Subject: [PATCH] Don't hammer robots.txt --- quickpeep_raker/src/raking/task.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index 6a36c2e..b819f2a 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -182,7 +182,27 @@ impl TaskContext { let robot_url = robots_txt_url_for(&url)?; if Some(&robot_url) != current_robot_rules_url.as_ref() { // We need to update our robot rules! - current_robot_rules = self.get_robot_rules(&url).await?; + match self.get_robot_rules(&url).await { + Ok(rules) => { + current_robot_rules = rules; + } + Err(err) => { + self.process_outcome( + &url, + RakeOutcome::TemporaryFailure(TemporaryFailure { + reason: TemporaryFailureReason::UnknownClientError(format!( + "robots.txt failure {:?}: {:?}", + url, err + )), + // Back off for a day: this ought to be enough time for the operator to fix the problem... maybe? + backoff_sec: 86400, + }), + ) + .await?; + // Forcefully change domain + return Ok(()); + } + } current_robot_rules_url = Some(robot_url); }