Add full-procedure sniffer

2022-06-12 15:41:17 +01:00 · 2022-06-12 15:41:17 +01:00 · 5701b1e6d8
commit 5701b1e6d8
parent c451a12e44
3 changed files with 49 additions and 4 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -523,6 +523,17 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.19"
@ -3716,6 +3727,7 @@ dependencies = [
 name = "quickpeep_html_charset_detection"
 version = "0.1.0"
 dependencies = [
+ "chardetng",
 "encoding_rs",
 "subslice",
 ]
--- a/quickpeep_html_charset_detection/Cargo.toml
+++ b/quickpeep_html_charset_detection/Cargo.toml
@ -8,3 +8,4 @@ edition = "2021"
 [dependencies]
 encoding_rs = "0.8.31"
 subslice = "0.2.3"
+chardetng = "0.1.17"
--- a/quickpeep_html_charset_detection/src/lib.rs
+++ b/quickpeep_html_charset_detection/src/lib.rs
@ -1,4 +1,5 @@
-use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
+use crate::steps::{bom_sniff, prescan, BOM_SNIFF_NEEDED_BYTES};
+use chardetng::EncodingDetector;
 use encoding_rs::Encoding;

 pub mod steps;
@ -6,11 +7,10 @@ pub mod steps;
 /// The spec requires document authors to place their <meta> tags in the first 1024 bytes.
 pub const SNIFF_WINDOW_SIZE: usize = 1024;

-/// Attempts to implementing the 'certain' stages of the encoding sniffing algorithm described at:
+/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at:
 /// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
 ///
-/// You should pass in the first 1024 bytes as the `sniff_window`.
-/// TODO content-type header
+/// You should pass in at least the first 3 bytes as the `sniff_window` here.
 pub fn sniff_with_certain_confidence(
    sniff_window: &[u8],
    content_type_header: Option<&[u8]>,
@ -34,6 +34,38 @@ pub fn sniff_with_certain_confidence(
    extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
 }

+/// Implements all the stages of the encoding sniffing algorithm described at:
+/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+/// except where those stages rely on other information (e.g. 'prior knowledge', a user default
+/// or a parent window context).
+///
+/// You should pass in at least the first 1024 bytes as the `sniff_window` here.
+pub fn sniff(
+    sniff_window: &[u8],
+    is_sniff_window_the_entire_document: bool,
+    content_type_header: Option<&[u8]>,
+) -> Option<&'static Encoding> {
+    if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
+        return Some(certain);
+    }
+
+    // 5. Optionally prescan the byte stream to determine its encoding
+    if let Some(prescan_tentative) = prescan(sniff_window) {
+        return Some(prescan_tentative);
+    }
+
+    // 8. The user agent may attempt to autodetect the character encoding from applying frequency
+    // analysis or other algorithms to the data stream. Such algorithms may use information about
+    // the resource other than the resource's contents, including the address of the resource.
+    // If autodetection succeeds in determining a character encoding, and that encoding is a
+    // supported encoding, then return that encoding, with the confidence tentative.
+    let mut detector = EncodingDetector::new();
+    detector.feed(sniff_window, is_sniff_window_the_entire_document);
+    // The `allow_utf8` argument of `guess` should be false for non-file: URIs, apparently.
+    // We do that here, though I'm not sure this is what we actually want outside of a browser...
+    Some(detector.guess(None, false))
+}
+
 pub fn extract_encoding_from_content_type_header(
    content_type_header: &[u8],
 ) -> Option<&'static Encoding> {