Add a lot of debug output
commit 08f4b7aeaa
parent 2ce8e2ba8e
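Despite the "debug output" title, what this commit actually adds is error context: each bare `?` on a fallible call in TaskContext is replaced by a `.context(...)` or `.with_context(...)` call (presumably the `anyhow` Context trait, which the unchanged surrounding code already uses), so a failure reports what the raker was attempting rather than only the root cause.

A minimal sketch of the idiom under that assumption; `load_record` and the file name are hypothetical stand-ins for calls like `choose_url_for_domain`, not part of this codebase:

use anyhow::{Context, Result};

// Hypothetical fallible helper, only here to illustrate the pattern.
fn load_record(path: &str) -> Result<String> {
    // `.with_context` builds its message lazily, only on the error path;
    // that is why the diff uses it for the format!()-based messages.
    std::fs::read_to_string(path)
        .with_context(|| format!("failed to read record file: {path:?}"))
}

fn main() -> Result<()> {
    // Before: `load_record("urls.db")?` would surface only the io::Error.
    // After: the added context is layered on top of the underlying error.
    let record = load_record("urls.db").context("failed to choose URL for domain")?;
    println!("{record}");
    Ok(())
}

When such an error finally escapes (returned from main, or printed with "{:#}" or "{:?}"), anyhow renders the whole context chain down to the root cause, which is what turns these annotations into useful debugging output.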
@@ -145,7 +145,8 @@ impl TaskContext {
             // Get a URL to process
             let url = {
                 let txn = self.store.ro_txn()?;
-                txn.choose_url_for_domain(&domain)?
+                txn.choose_url_for_domain(&domain)
+                    .context("failed to choose URL for domain")?
             };

             let (url_str, url_record) = if let Some(url) = url {
@@ -166,12 +167,14 @@ impl TaskContext {
                         }

                         // Delete the active domain from the store
-                        txn.remove_active_domain(&domain)?;
+                        txn.remove_active_domain(&domain)
+                            .context("failed to remove active domain")?;

                         txn.commit()?;
                         Ok(true)
                     })
-                    .await?;
+                    .await
+                    .context("failed to check if we're out of URLs")?;
                 if out_of_urls {
                     break;
                 } else {
@@ -179,10 +182,12 @@ impl TaskContext {
                 }
             };

-            let url = Url::parse(&url_str)?;
+            let url = Url::parse(&url_str)
+                .with_context(|| format!("failed to parse as URL: {url_str:?}"))?;

             // Check our robot rules are valid for that URL.
-            let robot_url = robots_txt_url_for(&url)?;
+            let robot_url = robots_txt_url_for(&url)
+                .with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
             if Some(&robot_url) != current_robot_rules_url.as_ref() {
                 // We need to update our robot rules!
                 match self.get_robot_rules(&url).await {
@@ -201,7 +206,8 @@ impl TaskContext {
                             backoff_sec: 86400,
                         }),
                     )
-                    .await?;
+                    .await
+                    .context("failed to handle TemporaryFailure outcome for robots.txt")?;
                     // Forcefully change domain
                     return Ok(());
                 }
@@ -219,7 +225,8 @@ impl TaskContext {
                             reason: PermanentFailureReason::DeniedToRobots,
                         }),
                     )
-                    .await?;
+                    .await
+                    .context("failed to process PermanentFailure outcome for robots.txt")?;
                     continue;
                 }
             }
@@ -323,7 +330,8 @@ impl TaskContext {
                     txn.commit()?;
                     Ok(())
                 })
-                .await?;
+                .await
+                .context("failure whilst turning long crawl delay into backoff")?;
             }
         }

@@ -349,10 +357,12 @@ impl TaskContext {

                 self.as_event_processor()
                     .process_page(url.clone(), page.page_entry, today)
-                    .await?;
+                    .await
+                    .context("failure processing page for RakedPage")?;
                 self.as_event_processor()
                     .process_refs(url.clone(), page.referrer_entry, today, false)
-                    .await?;
+                    .await
+                    .context("failure processing refs for RakedPage")?;

                 Ok(NextAction::Continue)
             }
@@ -369,7 +379,8 @@ impl TaskContext {

                 self.as_event_processor()
                     .process_refs(url.clone(), refs, today, true)
-                    .await?;
+                    .await
+                    .context("failure processing refs for RakedFeed")?;

                 Ok(NextAction::Continue)
             }
@@ -386,7 +397,8 @@ impl TaskContext {

                 self.as_event_processor()
                     .process_refs(url.clone(), refs, today, true)
-                    .await?;
+                    .await
+                    .context("failure processing refs for RakedSitemap")?;

                 Ok(NextAction::Continue)
             }
@@ -405,7 +417,8 @@ impl TaskContext {

                 self.as_event_processor()
                     .process_icon(url.clone(), today)
-                    .await?;
+                    .await
+                    .context("failure processing icon for RakedIcon")?;

                 Ok(NextAction::Continue)
             }
@@ -431,7 +444,8 @@ impl TaskContext {

                 self.as_event_processor()
                     .process_refs(url.clone(), refs, today, false)
-                    .await?;
+                    .await
+                    .context("Failure processing refs for Redirect")?;

                 Ok(NextAction::Continue)
             }
@@ -452,7 +466,8 @@ impl TaskContext {
                     txn.commit()?;
                     Ok(())
                 })
-                .await?;
+                .await
+                .context("failed to store backoff")?;

                 // Change domain now
                 Ok(NextAction::ChangeDomain)
@@ -465,7 +480,8 @@ impl TaskContext {
                     .context("Rejection processor shut down; can't stream rejection!!")?;
                 self.as_event_processor()
                     .process_rejection(url.clone(), today)
-                    .await?;
+                    .await
+                    .context("failed to process rejection for PermanentFailure")?;

                 // Reasons for permanent rejection aren't our fault or a site-wide fault;
                 // so don't worry about carrying on.