From c451a12e44c40fcc61d0c5594d5bd5bad9e5143d Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 12 Jun 2022 15:26:44 +0100 Subject: [PATCH] Pass the bytes through when extracting HTML --- quickpeep_raker/src/raking.rs | 8 +++----- quickpeep_raker/src/raking/page_extraction.rs | 7 ++++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs index 6aed87e..b98fc06 100644 --- a/quickpeep_raker/src/raking.rs +++ b/quickpeep_raker/src/raking.rs @@ -396,7 +396,7 @@ impl Raker { { // We don't try any fallbacks for an HTML page return Ok(self - .rake_html_page(&content, url, is_cf, &headers) + .rake_html_page(content, url, is_cf, &headers) .await .context("Raking HTML page")?); } @@ -445,16 +445,14 @@ impl Raker { pub async fn rake_html_page( &self, - content: &[u8], + content: Vec<u8>, url: &Url, is_cf: bool, headers: &HeaderMap, ) -> anyhow::Result { - let content_str = std::str::from_utf8(content)?.to_owned(); - match self .page_extraction - .extract(content_str, url.clone(), headers.clone(), is_cf) + .extract(content, url.clone(), headers.clone(), is_cf) .await?
{ ExtractedPage::Success { diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index 9422726..1f0c213 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -29,7 +29,7 @@ pub struct PageExtractionService { } pub struct ExtractionTask { - content: String, + content: Vec<u8>, url: Url, headers: HeaderMap, is_cf: bool, @@ -39,7 +39,7 @@ pub struct ExtractionTask { impl PageExtractionService { pub async fn extract( &self, - content: String, + content: Vec<u8>, url: Url, headers: HeaderMap, is_cf: bool, @@ -111,11 +111,12 @@ struct PageExtractionServiceInternal { impl PageExtractionServiceInternal { fn extract_page( &self, - content_str: String, + content_bytes: Vec<u8>, url: Url, headers: HeaderMap, is_cf: bool, ) -> anyhow::Result { + let content_str: &str = todo!(); let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref()); // See whether this page is at the canonical URL for the page.