Pass the bytes through when extracting HTML

This commit is contained in:
Olivier 'reivilibre' 2022-06-12 15:26:44 +01:00
parent c783f89f72
commit c451a12e44
2 changed files with 7 additions and 8 deletions

View File

@@ -396,7 +396,7 @@ impl Raker {
{
// We don't try any fallbacks for an HTML page
return Ok(self
.rake_html_page(&content, url, is_cf, &headers)
.rake_html_page(content, url, is_cf, &headers)
.await
.context("Raking HTML page")?);
}
@@ -445,16 +445,14 @@ impl Raker {
pub async fn rake_html_page(
&self,
content: &[u8],
content: Vec<u8>,
url: &Url,
is_cf: bool,
headers: &HeaderMap,
) -> anyhow::Result<RakeOutcome> {
let content_str = std::str::from_utf8(content)?.to_owned();
match self
.page_extraction
.extract(content_str, url.clone(), headers.clone(), is_cf)
.extract(content, url.clone(), headers.clone(), is_cf)
.await?
{
ExtractedPage::Success {

View File

@@ -29,7 +29,7 @@ pub struct PageExtractionService {
}
pub struct ExtractionTask {
content: String,
content: Vec<u8>,
url: Url,
headers: HeaderMap,
is_cf: bool,
@@ -39,7 +39,7 @@ pub struct ExtractionTask {
impl PageExtractionService {
pub async fn extract(
&self,
content: String,
content: Vec<u8>,
url: Url,
headers: HeaderMap,
is_cf: bool,
@@ -111,11 +111,12 @@ struct PageExtractionServiceInternal {
impl PageExtractionServiceInternal {
fn extract_page(
&self,
content_str: String,
content_bytes: Vec<u8>,
url: Url,
headers: HeaderMap,
is_cf: bool,
) -> anyhow::Result<ExtractedPage> {
let content_str: &str = todo!();
let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
// See whether this page is at the canonical URL for the page.