From c783f89f72b5f4ffcfebceb39afbb052380810eb Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 12 Jun 2022 14:47:42 +0100 Subject: [PATCH] Create a crate for HTML charset detection --- Cargo.lock | 22 +- Cargo.toml | 3 +- quickpeep_html_charset_detection/Cargo.toml | 10 + quickpeep_html_charset_detection/src/lib.rs | 52 +++ quickpeep_html_charset_detection/src/steps.rs | 345 ++++++++++++++++++ quickpeep_raker/Cargo.toml | 1 + 6 files changed, 430 insertions(+), 3 deletions(-) create mode 100644 quickpeep_html_charset_detection/Cargo.toml create mode 100644 quickpeep_html_charset_detection/src/lib.rs create mode 100644 quickpeep_html_charset_detection/src/steps.rs diff --git a/Cargo.lock b/Cargo.lock index 5305c5a..974e4ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1001,9 +1001,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" [[package]] name = "encoding_rs" -version = "0.8.30" +version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" +checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" dependencies = [ "cfg-if", ] @@ -3712,6 +3712,14 @@ dependencies = [ "serde", ] +[[package]] +name = "quickpeep_html_charset_detection" +version = "0.1.0" +dependencies = [ + "encoding_rs", + "subslice", +] + [[package]] name = "quickpeep_index" version = "0.1.0" @@ -3806,6 +3814,7 @@ dependencies = [ "ouroboros", "publicsuffix", "quickpeep_densedoc", + "quickpeep_html_charset_detection", "quickpeep_moz_readability", "quickpeep_seed_parser", "quickpeep_structs", @@ -4642,6 +4651,15 @@ dependencies = [ "syn", ] +[[package]] +name = "subslice" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a8e4809a3bb02de01f1f7faf1ba01a83af9e8eabcd4d31dd6e413d14d56aae" +dependencies = [ + "memchr", +] + [[package]] name = "subtle" version = "2.4.1" diff --git a/Cargo.toml 
b/Cargo.toml index d2ce3e9..40583e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,5 +8,6 @@ members = [ "quickpeep_moz_readability", "quickpeep_seed_parser", "quickpeep_structs", - "quickpeep_utils" + "quickpeep_utils", + "quickpeep_html_charset_detection", ] diff --git a/quickpeep_html_charset_detection/Cargo.toml b/quickpeep_html_charset_detection/Cargo.toml new file mode 100644 index 0000000..b93b6a2 --- /dev/null +++ b/quickpeep_html_charset_detection/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "quickpeep_html_charset_detection" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +encoding_rs = "0.8.31" +subslice = "0.2.3" diff --git a/quickpeep_html_charset_detection/src/lib.rs b/quickpeep_html_charset_detection/src/lib.rs new file mode 100644 index 0000000..e7aa9cd --- /dev/null +++ b/quickpeep_html_charset_detection/src/lib.rs @@ -0,0 +1,52 @@ +use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES}; +use encoding_rs::Encoding; + +pub mod steps; + +/// The spec requires authors to place their encoding declaration in the first 1024 bytes. +pub const SNIFF_WINDOW_SIZE: usize = 1024; + +/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at: +/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding +/// +/// You should pass in the first 1024 bytes as the `sniff_window`. +/// TODO content-type header +pub fn sniff_with_certain_confidence( + sniff_window: &[u8], + content_type_header: Option<&[u8]>, +) -> Option<&'static Encoding> { + // 1. BOM sniffing. + if sniff_window.len() >= BOM_SNIFF_NEEDED_BYTES { + if let Some(encoding) = bom_sniff( + sniff_window[0..BOM_SNIFF_NEEDED_BYTES] + .try_into() + .expect("checked size cast"), + ) { + return Some(encoding); + } + } + + // 2. User override (Not implemented) + + // 3. 'Wait for bytes' — we already have 1024 + + // 4.
If the transport layer specifies an encoding, return as certain. + extract_encoding_from_content_type_header(content_type_header.unwrap_or(b"")) +} + +pub fn extract_encoding_from_content_type_header( + content_type_header: &[u8], +) -> Option<&'static Encoding> { + for header_part in content_type_header.split(|b| *b == b';') { + // To-UTF-8-lossy is definitely not spec compliant, but trim_ascii() on byte slices is + // unstable, so let's just help ourselves out of a pickle. + let header_part_ascii_ish = String::from_utf8_lossy(header_part); + let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect(); + let key = key_value.get(0).cloned().unwrap_or(""); + if key.to_ascii_lowercase() == "charset" { + let value = key_value.get(1).cloned().unwrap_or(""); + return Encoding::for_label(value.as_bytes()); + } + } + None +} diff --git a/quickpeep_html_charset_detection/src/steps.rs b/quickpeep_html_charset_detection/src/steps.rs new file mode 100644 index 0000000..72c2f1b --- /dev/null +++ b/quickpeep_html_charset_detection/src/steps.rs @@ -0,0 +1,345 @@ +use crate::extract_encoding_from_content_type_header; +use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED}; +use subslice::SubsliceExt; + +pub const BOM_SNIFF_NEEDED_BYTES: usize = 3; + +/// Implements BOM sniffing +/// https://encoding.spec.whatwg.org/#bom-sniff +pub fn bom_sniff(first_3_bytes: [u8; 3]) -> Option<&'static Encoding> { + if first_3_bytes == [0xEF, 0xBB, 0xBF] { + Some(&UTF_8) + } else { + None + } +} + +pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> { + // Let fallback encoding be null. + // TODO let fallback = None; ?? + + // Let position be a pointer to a byte in the input byte stream, initially pointing at the first byte.
+ let mut position = 0; + + // Prescan for UTF-16 XML declarations: If position points to: + // + // A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian '' sequence) and comes + // after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in + // the '