Pass the bytes through when extracting HTML

2022-06-12 15:26:44 +01:00 · 2022-06-12 15:26:44 +01:00 · c451a12e44
commit c451a12e44
parent c783f89f72
2 changed files with 7 additions and 8 deletions
--- a/quickpeep_raker/src/raking.rs
+++ b/quickpeep_raker/src/raking.rs
@@ -396,7 +396,7 @@ impl Raker {
        {
            // We don't try any fallbacks for an HTML page
            return Ok(self
-                .rake_html_page(&content, url, is_cf, &headers)
+                .rake_html_page(content, url, is_cf, &headers)
                .await
                .context("Raking HTML page")?);
        }
@@ -445,16 +445,14 @@ impl Raker {

    pub async fn rake_html_page(
        &self,
-        content: &[u8],
+        content: Vec<u8>,
        url: &Url,
        is_cf: bool,
        headers: &HeaderMap,
    ) -> anyhow::Result<RakeOutcome> {
-        let content_str = std::str::from_utf8(content)?.to_owned();
-
        match self
            .page_extraction
-            .extract(content_str, url.clone(), headers.clone(), is_cf)
+            .extract(content, url.clone(), headers.clone(), is_cf)
            .await?
        {
            ExtractedPage::Success {
--- a/quickpeep_raker/src/raking/page_extraction.rs
+++ b/quickpeep_raker/src/raking/page_extraction.rs
@@ -29,7 +29,7 @@ pub struct PageExtractionService {
 }

 pub struct ExtractionTask {
-    content: String,
+    content: Vec<u8>,
    url: Url,
    headers: HeaderMap,
    is_cf: bool,
@@ -39,7 +39,7 @@ pub struct ExtractionTask {
 impl PageExtractionService {
    pub async fn extract(
        &self,
-        content: String,
+        content: Vec<u8>,
        url: Url,
        headers: HeaderMap,
        is_cf: bool,
@@ -111,11 +111,12 @@ struct PageExtractionServiceInternal {
 impl PageExtractionServiceInternal {
    fn extract_page(
        &self,
-        content_str: String,
+        content_bytes: Vec<u8>,
        url: Url,
        headers: HeaderMap,
        is_cf: bool,
    ) -> anyhow::Result<ExtractedPage> {
+        let content_str: &str = todo!();
        let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());

        // See whether this page is at the canonical URL for the page.