From 5701b1e6d84697ea873df272e48e4454702c20d6 Mon Sep 17 00:00:00 2001
From: Olivier 'reivilibre <olivier@librepush.net>
Date: Sun, 12 Jun 2022 15:41:17 +0100
Subject: [PATCH] Add full-procedure sniffer

---
 Cargo.lock                                  | 12 +++++++
 quickpeep_html_charset_detection/Cargo.toml |  1 +
 quickpeep_html_charset_detection/src/lib.rs | 40 ++++++++++++++++++---
 3 files changed, 49 insertions(+), 4 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 974e4ee..87838ab 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -523,6 +523,17 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.19"
@@ -3716,6 +3727,7 @@ dependencies = [
 name = "quickpeep_html_charset_detection"
 version = "0.1.0"
 dependencies = [
+ "chardetng",
  "encoding_rs",
  "subslice",
 ]
diff --git a/quickpeep_html_charset_detection/Cargo.toml b/quickpeep_html_charset_detection/Cargo.toml
index b93b6a2..3d662a4 100644
--- a/quickpeep_html_charset_detection/Cargo.toml
+++ b/quickpeep_html_charset_detection/Cargo.toml
@@ -8,3 +8,4 @@ edition = "2021"
 [dependencies]
 encoding_rs = "0.8.31"
 subslice = "0.2.3"
+chardetng = "0.1.17"
diff --git a/quickpeep_html_charset_detection/src/lib.rs b/quickpeep_html_charset_detection/src/lib.rs
index e7aa9cd..62cad4b 100644
--- a/quickpeep_html_charset_detection/src/lib.rs
+++ b/quickpeep_html_charset_detection/src/lib.rs
@@ -1,4 +1,5 @@
-use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
+use crate::steps::{bom_sniff, prescan, BOM_SNIFF_NEEDED_BYTES};
+use chardetng::EncodingDetector;
 use encoding_rs::Encoding;
 
 pub mod steps;
@@ -6,11 +7,10 @@ pub mod steps;
 /// The spec requires document authors to place their <meta> tags in the first 1024 bytes.
 pub const SNIFF_WINDOW_SIZE: usize = 1024;
 
-/// Attempts to implementing the 'certain' stages of the encoding sniffing algorithm described at:
+/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at:
 /// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
 ///
-/// You should pass in the first 1024 bytes as the `sniff_window`.
-/// TODO content-type header
+/// You should pass in at least the first 3 bytes as the `sniff_window` here.
 pub fn sniff_with_certain_confidence(
     sniff_window: &[u8],
     content_type_header: Option<&[u8]>,
@@ -34,6 +34,38 @@ pub fn sniff_with_certain_confidence(
     extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
 }
 
+/// Implements all the stages of the encoding sniffing algorithm described at:
+/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+/// except where those stages rely on other information (e.g. 'prior knowledge', a user default
+/// or a parent window context).
+///
+/// You should pass in at least the first 1024 bytes as the `sniff_window` here.
+pub fn sniff(
+    sniff_window: &[u8],
+    is_sniff_window_the_entire_document: bool,
+    content_type_header: Option<&[u8]>,
+) -> Option<&'static Encoding> {
+    if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
+        return Some(certain);
+    }
+
+    // 5. Optionally prescan the byte stream to determine its encoding
+    if let Some(prescan_tentative) = prescan(sniff_window) {
+        return Some(prescan_tentative);
+    }
+
+    // 8. The user agent may attempt to autodetect the character encoding from applying frequency
+    // analysis or other algorithms to the data stream. Such algorithms may use information about
+    // the resource other than the resource's contents, including the address of the resource.
+    // If autodetection succeeds in determining a character encoding, and that encoding is a
+    // supported encoding, then return that encoding, with the confidence tentative.
+    let mut detector = EncodingDetector::new();
+    detector.feed(sniff_window, is_sniff_window_the_entire_document);
+    // 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
+    // We do that here, though I'm not sure this is what we actually want outside of a browser...
+    Some(detector.guess(None, false))
+}
+
 pub fn extract_encoding_from_content_type_header(
     content_type_header: &[u8],
 ) -> Option<&'static Encoding> {