Add a lot of debug output
ci/woodpecker/push/check Pipeline was successful Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

rei/rakerstore_postgres_overhaul
Olivier 'reivilibre' 2022-11-26 22:45:51 +00:00
parent 2ce8e2ba8e
commit 08f4b7aeaa
1 changed file with 32 additions and 16 deletions

View File

@@ -145,7 +145,8 @@ impl TaskContext {
// Get a URL to process
let url = {
let txn = self.store.ro_txn()?;
txn.choose_url_for_domain(&domain)?
txn.choose_url_for_domain(&domain)
.context("failed to choose URL for domain")?
};
let (url_str, url_record) = if let Some(url) = url {
@@ -166,12 +167,14 @@ impl TaskContext {
}
// Delete the active domain from the store
txn.remove_active_domain(&domain)?;
txn.remove_active_domain(&domain)
.context("failed to remove active domain")?;
txn.commit()?;
Ok(true)
})
.await?;
.await
.context("failed to check if we're out of URLs")?;
if out_of_urls {
break;
} else {
@@ -179,10 +182,12 @@ impl TaskContext {
}
};
let url = Url::parse(&url_str)?;
let url = Url::parse(&url_str)
.with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
// Check our robot rules are valid for that URL.
let robot_url = robots_txt_url_for(&url)?;
let robot_url = robots_txt_url_for(&url)
.with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
if Some(&robot_url) != current_robot_rules_url.as_ref() {
// We need to update our robot rules!
match self.get_robot_rules(&url).await {
@@ -201,7 +206,8 @@ impl TaskContext {
backoff_sec: 86400,
}),
)
.await?;
.await
.context("failed to handle TemporaryFailure outcome for robots.txt")?;
// Forcefully change domain
return Ok(());
}
@@ -219,7 +225,8 @@ impl TaskContext {
reason: PermanentFailureReason::DeniedToRobots,
}),
)
.await?;
.await
.context("failed to process PermanentFailure outcome for robots.txt")?;
continue;
}
}
@@ -323,7 +330,8 @@ impl TaskContext {
txn.commit()?;
Ok(())
})
.await?;
.await
.context("failure whilst turning long crawl delay into backoff")?;
}
}
@@ -349,10 +357,12 @@ impl TaskContext {
self.as_event_processor()
.process_page(url.clone(), page.page_entry, today)
.await?;
.await
.context("failure processing page for RakedPage")?;
self.as_event_processor()
.process_refs(url.clone(), page.referrer_entry, today, false)
.await?;
.await
.context("failure processing refs for RakedPage")?;
Ok(NextAction::Continue)
}
@@ -369,7 +379,8 @@ impl TaskContext {
self.as_event_processor()
.process_refs(url.clone(), refs, today, true)
.await?;
.await
.context("failure processing refs for RakedFeed")?;
Ok(NextAction::Continue)
}
@@ -386,7 +397,8 @@ impl TaskContext {
self.as_event_processor()
.process_refs(url.clone(), refs, today, true)
.await?;
.await
.context("failure processing refs for RakedSitemap")?;
Ok(NextAction::Continue)
}
@@ -405,7 +417,8 @@ impl TaskContext {
self.as_event_processor()
.process_icon(url.clone(), today)
.await?;
.await
.context("failure processing icon for RakedIcon")?;
Ok(NextAction::Continue)
}
@@ -431,7 +444,8 @@ impl TaskContext {
self.as_event_processor()
.process_refs(url.clone(), refs, today, false)
.await?;
.await
.context("Failure processing refs for Redirect")?;
Ok(NextAction::Continue)
}
@@ -452,7 +466,8 @@ impl TaskContext {
txn.commit()?;
Ok(())
})
.await?;
.await
.context("failed to store backoff")?;
// Change domain now
Ok(NextAction::ChangeDomain)
@@ -465,7 +480,8 @@ impl TaskContext {
.context("Rejection processor shut down; can't stream rejection!!")?;
self.as_event_processor()
.process_rejection(url.clone(), today)
.await?;
.await
.context("failed to process rejection for PermanentFailure")?;
// Reasons for permanent rejection aren't our fault or a site-wide fault;
// so don't worry about carrying on.