From 08f4b7aeaa0aa345cbb6bd29c0f93a3f25a2f8b9 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sat, 26 Nov 2022 22:45:51 +0000 Subject: [PATCH] Add a lot of debug output --- quickpeep_raker/src/raking/task.rs | 48 ++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index d0ac7d6..45eb46e 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -145,7 +145,8 @@ impl TaskContext { // Get a URL to process let url = { let txn = self.store.ro_txn()?; - txn.choose_url_for_domain(&domain)? + txn.choose_url_for_domain(&domain) + .context("failed to choose URL for domain")? }; let (url_str, url_record) = if let Some(url) = url { @@ -166,12 +167,14 @@ impl TaskContext { } // Delete the active domain from the store - txn.remove_active_domain(&domain)?; + txn.remove_active_domain(&domain) + .context("failed to remove active domain")?; txn.commit()?; Ok(true) }) - .await?; + .await + .context("failed to check if we're out of URLs")?; if out_of_urls { break; } else { @@ -179,10 +182,12 @@ impl TaskContext { } }; - let url = Url::parse(&url_str)?; + let url = Url::parse(&url_str) + .with_context(|| format!("failed to parse as URL: {url_str:?}"))?; // Check our robot rules are valid for that URL. - let robot_url = robots_txt_url_for(&url)?; + let robot_url = robots_txt_url_for(&url) + .with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?; if Some(&robot_url) != current_robot_rules_url.as_ref() { // We need to update our robot rules! match self.get_robot_rules(&url).await { @@ -201,7 +206,8 @@ impl TaskContext { backoff_sec: 86400, }), ) - .await?; + .await + .context("failed to handle TemporaryFailure outcome for robots.txt")?; // Forcefully change domain return Ok(()); } @@ -219,7 +225,8 @@ impl TaskContext { reason: PermanentFailureReason::DeniedToRobots, }), ) - .await?; + .await + .context("failed to process PermanentFailure outcome for robots.txt")?; continue; } } @@ -323,7 +330,8 @@ impl TaskContext { txn.commit()?; Ok(()) }) - .await?; + .await + .context("failure whilst turning long crawl delay into backoff")?; } } @@ -349,10 +357,12 @@ impl TaskContext { self.as_event_processor() .process_page(url.clone(), page.page_entry, today) - .await?; + .await + .context("failure processing page for RakedPage")?; self.as_event_processor() .process_refs(url.clone(), page.referrer_entry, today, false) - .await?; + .await + .context("failure processing refs for RakedPage")?; Ok(NextAction::Continue) } @@ -369,7 +379,8 @@ impl TaskContext { self.as_event_processor() .process_refs(url.clone(), refs, today, true) - .await?; + .await + .context("failure processing refs for RakedFeed")?; Ok(NextAction::Continue) } @@ -386,7 +397,8 @@ impl TaskContext { self.as_event_processor() .process_refs(url.clone(), refs, today, true) - .await?; + .await + .context("failure processing refs for RakedSitemap")?; Ok(NextAction::Continue) } @@ -405,7 +417,8 @@ impl TaskContext { self.as_event_processor() .process_icon(url.clone(), today) - .await?; + .await + .context("failure processing icon for RakedIcon")?; Ok(NextAction::Continue) } @@ -431,7 +444,8 @@ impl TaskContext { self.as_event_processor() .process_refs(url.clone(), refs, today, false) - .await?; + .await + .context("Failure processing refs for Redirect")?; Ok(NextAction::Continue) } @@ -452,7 +466,8 @@ impl TaskContext { txn.commit()?; Ok(()) }) - .await?; + .await + .context("failed to store backoff")?; // Change domain now Ok(NextAction::ChangeDomain) @@ -465,7 +480,8 @@ impl TaskContext { .context("Rejection processor shut down; can't stream rejection!!")?; self.as_event_processor() .process_rejection(url.clone(), today) - .await?; + .await + .context("failed to process rejection for PermanentFailure")?; // Reasons for permanent rejection aren't our fault or a site-wide fault; // so don't worry about carrying on.