Use the sniffed encoding in page extraction

Author: Olivier 'reivilibre'
Date:   2022-06-12 15:49:02 +01:00
Parent: 5701b1e6d8
Commit: aa4567c623
2 changed files with 16 additions and 6 deletions

@@ -44,14 +44,14 @@ pub fn sniff(
     sniff_window: &[u8],
     is_sniff_window_the_entire_document: bool,
     content_type_header: Option<&[u8]>,
-) -> Option<&'static Encoding> {
+) -> &'static Encoding {
     if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
-        return Some(certain);
+        return certain;
     }
 
     // 5. Optionally prescan the byte stream to determine its encoding
     if let Some(prescan_tentative) = prescan(sniff_window) {
-        return Some(prescan_tentative);
+        return prescan_tentative;
     }
@@ -63,7 +63,7 @@ pub fn sniff(
     detector.feed(sniff_window, is_sniff_window_the_entire_document);
     // 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
     // We do that here, though I'm not sure this is what we actually want outside of a browser...
-    Some(detector.guess(None, false))
+    detector.guess(None, false)
 }
 
 pub fn extract_encoding_from_content_type_header(
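
The fall-through chain in sniff() now always produces an encoding: a certain match (e.g. from the Content-Type header) wins outright, a prescan of the byte stream is the tentative fallback, and frequency-based autodetection is the last resort. The feed()/guess() calls in step 8 match the signatures of chardetng's EncodingDetector, so, assuming the detector is chardetng and the returned type is encoding_rs's Encoding, a minimal sketch of that last step in isolation looks like this:

use chardetng::EncodingDetector;
use encoding_rs::Encoding;

fn autodetect(sniff_window: &[u8], is_entire_document: bool) -> &'static Encoding {
    let mut detector = EncodingDetector::new();
    // feed() takes the bytes seen so far; the flag says whether this is the
    // last chunk, letting the detector commit to a final answer.
    detector.feed(sniff_window, is_entire_document);
    // guess(tld, allow_utf8): no top-level-domain hint, and UTF-8 guessing
    // disabled, matching the non-file: URI behaviour noted in the comment.
    detector.guess(None, false)
}

Because guess() always returns an encoding, the Option wrapper on sniff()'s return type was dead weight; dropping it removes the "no encoding found" case entirely instead of pushing it onto every caller.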

@@ -12,11 +12,13 @@ use itertools::Itertools;
 use kuchiki::NodeRef;
 use log::{debug, error, trace, warn};
 use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
+use quickpeep_html_charset_detection::sniff;
 use quickpeep_structs::rake_entries::AnalysisAntifeatures;
 use quickpeep_utils::lazy::Lazy;
 use reqwest::header::HeaderMap;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use std::borrow::Borrow;
 use tokio::runtime;
 use tokio::sync::mpsc::Sender;
 use tokio::sync::{mpsc, oneshot};
@@ -116,8 +118,16 @@ impl PageExtractionServiceInternal {
         headers: HeaderMap,
         is_cf: bool,
     ) -> anyhow::Result<ExtractedPage> {
-        let content_str: &str = todo!();
-        let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
+        let encoding = sniff(
+            &content_bytes,
+            true,
+            headers.get("content-type").map(|hv| hv.as_bytes()),
+        );
+        let (content_text, _actual_codec_used, replacements_made) = encoding.decode(&content_bytes);
+        if replacements_made {
+            warn!("Character replacements made!");
+        }
+        let root_node: NodeRef = kuchiki::parse_html().one(content_text.borrow());
 
         // See whether this page is at the canonical URL for the page.
         // If it's not, then we redirect the raker to the canonical URL.
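
The three-element tuple destructured above matches encoding_rs's Encoding::decode(), which returns the decoded text as a Cow<str>, the encoding actually used (BOM sniffing can override the argument), and a flag set when malformed sequences were replaced with U+FFFD; that flag is what drives the warning. Assuming the type is indeed encoding_rs's Encoding, a small self-contained illustration of those semantics:

use encoding_rs::{UTF_8, WINDOWS_1252};

fn main() {
    // 0xE9 is 'é' in Windows-1252 but not a valid UTF-8 sequence.
    let bytes = b"caf\xE9";

    let (text, used, had_replacements) = WINDOWS_1252.decode(bytes);
    assert_eq!(text, "café");
    assert_eq!(used, WINDOWS_1252);
    assert!(!had_replacements);

    // Decoding the same bytes as UTF-8 is lossy: the stray byte becomes U+FFFD.
    let (text, _used, had_replacements) = UTF_8.decode(bytes);
    assert_eq!(text, "caf\u{FFFD}");
    assert!(had_replacements);
}

Since decode() applies BOM sniffing first, the codec actually used can differ from the one that was sniffed, which is presumably why the diff binds and ignores _actual_codec_used rather than asserting they agree.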