From 5701b1e6d84697ea873df272e48e4454702c20d6 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 12 Jun 2022 15:41:17 +0100 Subject: [PATCH] Add full-procedure sniffer --- Cargo.lock | 12 +++++++ quickpeep_html_charset_detection/Cargo.toml | 1 + quickpeep_html_charset_detection/src/lib.rs | 40 ++++++++++++++++++--- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 974e4ee..87838ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -523,6 +523,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "chrono" version = "0.4.19" @@ -3716,6 +3727,7 @@ dependencies = [ name = "quickpeep_html_charset_detection" version = "0.1.0" dependencies = [ + "chardetng", "encoding_rs", "subslice", ] diff --git a/quickpeep_html_charset_detection/Cargo.toml b/quickpeep_html_charset_detection/Cargo.toml index b93b6a2..3d662a4 100644 --- a/quickpeep_html_charset_detection/Cargo.toml +++ b/quickpeep_html_charset_detection/Cargo.toml @@ -8,3 +8,4 @@ edition = "2021" [dependencies] encoding_rs = "0.8.31" subslice = "0.2.3" +chardetng = "0.1.17" diff --git a/quickpeep_html_charset_detection/src/lib.rs b/quickpeep_html_charset_detection/src/lib.rs index e7aa9cd..62cad4b 100644 --- a/quickpeep_html_charset_detection/src/lib.rs +++ b/quickpeep_html_charset_detection/src/lib.rs @@ -1,4 +1,5 @@ -use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES}; +use crate::steps::{bom_sniff, prescan, BOM_SNIFF_NEEDED_BYTES}; +use chardetng::EncodingDetector; use encoding_rs::Encoding; pub mod steps; @@ -6,11 +7,10 @@ pub mod steps; /// The spec requires document authors to 
place their tags in the first 1024 bytes. pub const SNIFF_WINDOW_SIZE: usize = 1024; -/// Attempts to implementing the 'certain' stages of the encoding sniffing algorithm described at: +/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at: /// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding /// -/// You should pass in the first 1024 bytes as the `sniff_window`. -/// TODO content-type header +/// You should pass in at least the first 3 bytes as the `sniff_window` here. pub fn sniff_with_certain_confidence( sniff_window: &[u8], content_type_header: Option<&[u8]>, @@ -34,6 +34,38 @@ pub fn sniff_with_certain_confidence( extract_encoding_from_content_type_header(content_type_header.unwrap_or(b"")) } +/// Implements all the stages of the encoding sniffing algorithm described at: +/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding +/// except where those stages rely on other information (e.g. 'prior knowledge', a user default +/// or a parent window context). +/// +/// You should pass in at least the first 1024 bytes as the `sniff_window` here. +pub fn sniff( + sniff_window: &[u8], + is_sniff_window_the_entire_document: bool, + content_type_header: Option<&[u8]>, +) -> Option<&'static Encoding> { + if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) { + return Some(certain); + } + + // 5. Optionally prescan the byte stream to determine its encoding + if let Some(prescan_tentative) = prescan(sniff_window) { + return Some(prescan_tentative); + } + + // 8. The user agent may attempt to autodetect the character encoding from applying frequency + // analysis or other algorithms to the data stream. Such algorithms may use information about + // the resource other than the resource's contents, including the address of the resource.
+ // If autodetection succeeds in determining a character encoding, and that encoding is a + // supported encoding, then return that encoding, with the confidence tentative. + let mut detector = EncodingDetector::new(); + detector.feed(sniff_window, is_sniff_window_the_entire_document); + // 'Allow UTF-8' should be set to false for non-file: URIs, apparently. + // We do that here, though I'm not sure this is what we actually want outside of a browser... + Some(detector.guess(None, false)) +} + pub fn extract_encoding_from_content_type_header( content_type_header: &[u8], ) -> Option<&'static Encoding> {