Create a crate for HTML charset detection

2022-06-12 14:47:42 +01:00 · 2022-06-12 14:47:42 +01:00 · c783f89f72
commit c783f89f72
parent b08a883831
6 changed files with 430 additions and 3 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1001,9 +1001,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"

 [[package]]
 name = "encoding_rs"
-version = "0.8.30"
+version = "0.8.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df"
+checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b"
 dependencies = [
 "cfg-if",
 ]
@ -3712,6 +3712,14 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "quickpeep_html_charset_detection"
+version = "0.1.0"
+dependencies = [
+ "encoding_rs",
+ "subslice",
+]
+
 [[package]]
 name = "quickpeep_index"
 version = "0.1.0"
@ -3806,6 +3814,7 @@ dependencies = [
 "ouroboros",
 "publicsuffix",
 "quickpeep_densedoc",
+ "quickpeep_html_charset_detection",
 "quickpeep_moz_readability",
 "quickpeep_seed_parser",
 "quickpeep_structs",
@ -4642,6 +4651,15 @@ dependencies = [
 "syn",
 ]

+[[package]]
+name = "subslice"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e0a8e4809a3bb02de01f1f7faf1ba01a83af9e8eabcd4d31dd6e413d14d56aae"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "subtle"
 version = "2.4.1"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -8,5 +8,6 @@ members = [
    "quickpeep_moz_readability",
    "quickpeep_seed_parser",
    "quickpeep_structs",
-    "quickpeep_utils"
+    "quickpeep_utils",
+    "quickpeep_html_charset_detection",
 ]
--- a/quickpeep_html_charset_detection/Cargo.toml
+++ b/quickpeep_html_charset_detection/Cargo.toml
@ -0,0 +1,10 @@
+[package]
+name = "quickpeep_html_charset_detection"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+encoding_rs = "0.8.31"
+subslice = "0.2.3"
--- a/quickpeep_html_charset_detection/src/lib.rs
+++ b/quickpeep_html_charset_detection/src/lib.rs
@ -0,0 +1,52 @@
+use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
+use encoding_rs::Encoding;
+
+pub mod steps;
+
+/// Intended size, in bytes, of the window handed to the sniffing functions.
+///
+/// The spec requires document authors to place their <meta> tags in the first 1024 bytes.
+pub const SNIFF_WINDOW_SIZE: usize = 1024;
+
+/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at:
+/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+///
+/// * `sniff_window` — the leading bytes of the document; you should pass in the first 1024
+///   bytes. A window shorter than `BOM_SNIFF_NEEDED_BYTES` simply skips the BOM check.
+/// * `content_type_header` — the raw value of the transport layer's `Content-Type` header, if
+///   one was supplied.
+///
+/// Returns the encoding found by BOM sniffing, otherwise whatever
+/// `extract_encoding_from_content_type_header` finds in the header, otherwise `None`.
+pub fn sniff_with_certain_confidence(
+    sniff_window: &[u8],
+    content_type_header: Option<&[u8]>,
+) -> Option<&'static Encoding> {
+    // 1. BOM sniffing.
+    //    `get` yields `None` when the window is too short, while a window of exactly
+    //    `BOM_SNIFF_NEEDED_BYTES` bytes (a document consisting only of a BOM) is still checked.
+    let bom_prefix: Option<[u8; BOM_SNIFF_NEEDED_BYTES]> = sniff_window
+        .get(..BOM_SNIFF_NEEDED_BYTES)
+        .and_then(|prefix| prefix.try_into().ok());
+    if let Some(encoding) = bom_prefix.and_then(bom_sniff) {
+        return Some(encoding);
+    }
+
+    // 2. User override (Not implemented)
+
+    // 3. 'Wait for bytes' — we already have 1024
+
+    // 4. If the transport layer specifies an encoding, return as certain.
+    content_type_header.and_then(extract_encoding_from_content_type_header)
+}
+
+/// Extracts the encoding named by the `charset` parameter of a `Content-Type` value such as
+/// `text/html; charset=utf-8`.
+///
+/// The value is split on `;`, and the first part of the form `charset=<label>` (key compared
+/// ASCII-case-insensitively, surrounding whitespace ignored, optional quotes around the label
+/// removed) is looked up with `Encoding::for_label`.
+///
+/// Returns `None` when there is no `charset` parameter or when its label is not recognised.
+pub fn extract_encoding_from_content_type_header(
+    content_type_header: &[u8],
+) -> Option<&'static Encoding> {
+    for header_part in content_type_header.split(|b| *b == b';') {
+        // To-UTF-8-lossy is definitely not spec compliant, but trim_ascii() on byte slices is
+        // unstable, so let's just help ourselves out of a pickle.
+        let header_part = String::from_utf8_lossy(header_part);
+
+        // Parts without a `=` (such as the media type itself) cannot carry a charset.
+        let (key, value) = match header_part.split_once('=') {
+            Some(pair) => pair,
+            None => continue,
+        };
+
+        if key.trim().eq_ignore_ascii_case("charset") {
+            // The label is the text *after* the `=`; it may be written as a quoted string.
+            let label = value.trim().trim_matches(|c| c == '"' || c == '\'');
+            return Encoding::for_label(label.as_bytes());
+        }
+    }
+    None
+}
--- a/quickpeep_html_charset_detection/src/steps.rs
+++ b/quickpeep_html_charset_detection/src/steps.rs
@ -0,0 +1,345 @@
+use crate::extract_encoding_from_content_type_header;
+use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED};
+use subslice::SubsliceExt;
+
+/// Number of leading bytes that `bom_sniff` inspects.
+pub const BOM_SNIFF_NEEDED_BYTES: usize = 3;
+
+/// Implements BOM sniffing
+/// https://encoding.spec.whatwg.org/#bom-sniff
+///
+/// `first_3_bytes` are the first three bytes of the stream. Returns the encoding announced by
+/// a byte order mark at the very start of the stream, or `None` if there is no BOM.
+pub fn bom_sniff(first_3_bytes: [u8; 3]) -> Option<&'static Encoding> {
+    match first_3_bytes {
+        [0xEF, 0xBB, 0xBF] => Some(UTF_8),
+        // The UTF-16 BOMs are only two bytes long, so the third byte is irrelevant.
+        [0xFE, 0xFF, _] => Some(UTF_16BE),
+        [0xFF, 0xFE, _] => Some(UTF_16LE),
+        _ => None,
+    }
+}
+
+/// Prescans a byte stream to determine its encoding, following the HTML Standard's
+/// "prescan a byte stream to determine its encoding" algorithm.
+///
+/// `bytes` is the window of document bytes to scan. The scan skips comments and ordinary
+/// tags, and inspects the attributes of each `<meta` tag for a `charset` attribute or an
+/// `http-equiv="content-type"` / `content` pair.
+///
+/// Returns the detected encoding, or `None` if the window ends (or an unterminated comment or
+/// declaration is met) before a usable declaration is found. As the spec requires, a declared
+/// UTF-16BE/LE is reported as UTF-8 and x-user-defined as windows-1252.
+///
+/// Never panics, whatever the length of `bytes`.
+pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
+    // TODO: the spec's `fallback encoding` and the `<?xml` declaration step that sets it are
+    // not implemented.
+
+    // Prescan for UTF-16 XML declarations: if the stream starts with...
+    //
+    // 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian '<?x'): UTF-16LE.
+    if bytes.starts_with(&[0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0]) {
+        return Some(UTF_16LE);
+    }
+
+    // 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78 (case-sensitive UTF-16 big-endian '<?x'): UTF-16BE.
+    if bytes.starts_with(&[0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78]) {
+        return Some(UTF_16BE);
+    }
+    // For historical reasons, the prefix is two bytes longer than in Appendix F of XML and the
+    // encoding name is not checked.
+
+    // Let position be a pointer to a byte in the input byte stream, initially pointing at the
+    // first byte.
+    let mut position = 0;
+
+    // Loop: If position points to:
+    while position < bytes.len() {
+        // Everything from `position` onwards; never empty inside the loop.
+        let rest = &bytes[position..];
+
+        // A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
+        if rest.starts_with(b"<!--") {
+            // Advance the position pointer so that it points at the first 0x3E byte which is
+            // preceded by two 0x2D bytes (i.e. at the end of an ASCII '-->' sequence) and comes
+            // after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those
+            // in the '<!--' sequence, which searching from `position` allows for.)
+            match rest.find(b"-->") {
+                Some(location_of_closer) => {
+                    // Skip the three bytes of `-->` as well, i.e. go straight to 'next byte'.
+                    position += location_of_closer + 3;
+                    continue;
+                }
+                None => {
+                    return None;
+                }
+            }
+        }
+
+        // A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74,
+        // 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII
+        // '<meta' followed by a space or slash).
+        // The length is checked first so that a window ending in e.g. `<me` cannot panic.
+        let is_meta_tag = rest.len() > 5
+            && rest[..5].eq_ignore_ascii_case(b"<meta")
+            && matches!(rest[5], 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F);
+        if is_meta_tag {
+            // Step over '<meta' and the space/slash that follows it (getting an attribute
+            // would skip that byte anyway).
+            position += 6;
+
+            // Let attribute list be an empty list of strings.
+            let mut attributes: Vec<Vec<u8>> = Vec::new();
+
+            // Let got pragma be false.
+            let mut got_pragma = false;
+
+            // Let need pragma be null.
+            let mut need_pragma = None;
+
+            // Let charset be the null value (which, for the purposes of this algorithm, is
+            // distinct from an unrecognized encoding or the empty string).
+            // `None` is null and `Some(None)` is an unrecognised label ('failure').
+            let mut charset: Option<Option<&'static Encoding>> = None;
+
+            // Attributes: Get an attribute and its value. If no attribute was sniffed, then
+            // jump to the processing step below.
+            while let Some((key, value)) = prescan_get_attribute(bytes, &mut position) {
+                // If the attribute's name is already in attribute list, then return to the
+                // step labeled attributes.
+                if attributes.contains(&key) {
+                    continue;
+                }
+
+                // Run the appropriate step from the following list, if one applies.
+                // (`prescan_get_attribute` hands back names and values already lowercased.)
+                match key.as_slice() {
+                    // If the attribute's name is "http-equiv"
+                    b"http-equiv" => {
+                        // If the attribute's value is "content-type", then set got pragma to
+                        // true.
+                        if value == b"content-type" {
+                            got_pragma = true;
+                        }
+                    }
+                    // If the attribute's name is "content"
+                    b"content" => {
+                        // Apply the algorithm for extracting a character encoding from a meta
+                        // element, giving the attribute's value as the string to parse.
+                        // If a character encoding is returned, and if charset is still set to
+                        // null, let charset be the encoding returned, and set need pragma to
+                        // true.
+                        if charset.is_none() {
+                            if let Some(content_charset) =
+                                extract_encoding_from_content_type_header(&value)
+                            {
+                                charset = Some(Some(content_charset));
+                                need_pragma = Some(true);
+                            }
+                        }
+                    }
+                    // If the attribute's name is "charset"
+                    b"charset" => {
+                        // Let charset be the result of getting an encoding from the
+                        // attribute's value, and set need pragma to false.
+                        charset = Some(Encoding::for_label(&value));
+                        need_pragma = Some(false);
+                    }
+                    _ => {}
+                }
+
+                // Add the attribute's name to attribute list, then return to the step labeled
+                // attributes.
+                attributes.push(key);
+            }
+
+            // Processing: jump to 'next byte' if need pragma is null, if need pragma is true
+            // but got pragma is false, or if charset is failure. Otherwise return charset.
+            match (need_pragma, charset) {
+                (Some(need_pragma), Some(Some(encoding))) if got_pragma || !need_pragma => {
+                    return Some(if encoding == UTF_16BE || encoding == UTF_16LE {
+                        // If charset is UTF-16BE/LE, then set charset to UTF-8.
+                        UTF_8
+                    } else if encoding == X_USER_DEFINED {
+                        // If charset is x-user-defined, then set charset to windows-1252.
+                        WINDOWS_1252
+                    } else {
+                        encoding
+                    });
+                }
+                _ => {
+                    position += 1;
+                    continue;
+                }
+            }
+        }
+
+        // A sequence of bytes starting with a 0x3C byte (<), optionally a 0x2F byte (/), and
+        // finally a byte in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z).
+        // `get` keeps this in bounds when the tag starts in the last bytes of the window.
+        let name_offset = if rest.starts_with(b"</") { 2 } else { 1 };
+        let is_tag = rest[0] == b'<'
+            && rest
+                .get(name_offset)
+                .map_or(false, |byte| byte.is_ascii_alphabetic());
+        if is_tag {
+            // Advance the position pointer so that it points at the next 0x09 (HT), 0x0A (LF),
+            // 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.
+            while position < bytes.len()
+                && !matches!(bytes[position], 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E)
+            {
+                position += 1;
+            }
+
+            // Repeatedly get an attribute until no further attributes can be found, then jump
+            // to the step below labeled next byte.
+            while prescan_get_attribute(bytes, &mut position).is_some() {}
+            position += 1;
+            continue;
+        }
+
+        // A sequence of bytes starting with: 0x3C 0x21 (`<!`)
+        // A sequence of bytes starting with: 0x3C 0x2F (`</`)
+        // A sequence of bytes starting with: 0x3C 0x3F (`<?`)
+        if rest.starts_with(b"<!") || rest.starts_with(b"</") || rest.starts_with(b"<?") {
+            // Advance the position pointer so that it points at the first 0x3E byte (>) that
+            // comes after the 0x3C byte that was found.
+            match rest.find(b">") {
+                None => {
+                    return None;
+                }
+                Some(at) => {
+                    position += at + 1;
+                    continue;
+                }
+            }
+        }
+
+        // Any other byte
+        //
+        //     Do nothing with that byte.
+
+        // Next byte: Move position so it points at the next byte in the input byte stream, and
+        // return to the step above labeled loop.
+        position += 1;
+    }
+
+    None
+}
+
+/// Implements the HTML Standard's "get an attribute" step of the encoding prescan.
+///
+/// * `bytes` — the window being prescanned.
+/// * `position` — index of the byte to start at; it is advanced past whatever was consumed.
+///
+/// Returns `Some((name, value))` with both parts ASCII-lowercased; `value` is empty when the
+/// attribute has none. Returns `None` when `position` is at the `>` that closes the tag
+/// (`position` is left on that `>`), or when the input ends in the middle of an attribute
+/// name or value.
+pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<u8>, Vec<u8>)> {
+    /// 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP).
+    fn is_space(byte: u8) -> bool {
+        matches!(byte, 0x09 | 0x0A | 0x0C | 0x0D | 0x20)
+    }
+
+    // If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
+    // or 0x2F (/) then advance position to the next byte and redo this step.
+    while matches!(bytes.get(*position), Some(&byte) if is_space(byte) || byte == b'/') {
+        *position += 1;
+    }
+
+    // If the byte at position is 0x3E (>), then abort the get an attribute algorithm. There
+    // isn't one.
+    if bytes.get(*position) == Some(&b'>') {
+        return None;
+    }
+
+    // Otherwise, the byte at position is the start of the attribute name. Let attribute name
+    // and attribute value be the empty string.
+    let mut attribute_name = Vec::new();
+    let mut attribute_value = Vec::new();
+
+    // Name: process the byte at position, one byte per iteration.
+    loop {
+        // Running off the end of the input while reading a name means there is no attribute.
+        let byte = *bytes.get(*position)?;
+        match byte {
+            // If it is 0x3D (=), and the attribute name is longer than the empty string:
+            // advance position to the next byte and jump to the step below labeled value.
+            b'=' if !attribute_name.is_empty() => {
+                *position += 1;
+                break;
+            }
+            // If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP): jump to the
+            // step labeled spaces.
+            space if is_space(space) => {
+                // Spaces: skip the run of whitespace after the name.
+                while matches!(bytes.get(*position), Some(&byte) if is_space(byte)) {
+                    *position += 1;
+                }
+
+                // If the byte at position is not 0x3D (=), abort the get an attribute
+                // algorithm. The attribute's name is the value of attribute name, its value is
+                // the empty string.
+                if bytes.get(*position) != Some(&b'=') {
+                    return Some((attribute_name, attribute_value));
+                }
+
+                // Advance position past the 0x3D (=) byte and continue with the value.
+                *position += 1;
+                break;
+            }
+            // If it is 0x2F (/) or 0x3E (>): abort the get an attribute algorithm. The
+            // attribute's name is the value of attribute name, its value is the empty string.
+            b'/' | b'>' => return Some((attribute_name, attribute_value)),
+            // Anything else (including a leading '='): append it, lowercasing A-Z.
+            other => attribute_name.push(other.to_ascii_lowercase()),
+        }
+
+        // Advance position to the next byte and return to the previous step.
+        *position += 1;
+    }
+
+    // Value: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or
+    // 0x20 (SP) then advance position to the next byte, then, repeat this step.
+    while matches!(bytes.get(*position), Some(&byte) if is_space(byte)) {
+        *position += 1;
+    }
+
+    // Process the byte at position as follows:
+    match *bytes.get(*position)? {
+        // If it is 0x22 (") or 0x27 ('): read up to the matching closing quote.
+        quote if quote == b'"' || quote == b'\'' => loop {
+            // Quote loop: Advance position to the next byte.
+            *position += 1;
+            let byte = *bytes.get(*position)?;
+
+            // If it is the closing quote, advance position past it and abort the algorithm
+            // with the name and value collected so far.
+            if byte == quote {
+                *position += 1;
+                return Some((attribute_name, attribute_value));
+            }
+
+            // Otherwise append the byte to the value, lowercasing A-Z.
+            attribute_value.push(byte.to_ascii_lowercase());
+        },
+        // If it is 0x3E (>): abort the get an attribute algorithm. The attribute's name is the
+        // value of attribute name, its value is the empty string.
+        b'>' => return Some((attribute_name, attribute_value)),
+        // Anything else: append it (lowercasing A-Z) and advance position to the next byte.
+        first => {
+            attribute_value.push(first.to_ascii_lowercase());
+            *position += 1;
+        }
+    }
+
+    // Rest of an unquoted value.
+    loop {
+        let byte = *bytes.get(*position)?;
+
+        // If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>): abort
+        // the get an attribute algorithm. The attribute's name is the value of attribute name
+        // and its value is the value of attribute value.
+        if is_space(byte) || byte == b'>' {
+            return Some((attribute_name, attribute_value));
+        }
+
+        // Anything else: append it, lowercasing A-Z.
+        attribute_value.push(byte.to_ascii_lowercase());
+
+        // Advance position to the next byte and return to the previous step.
+        *position += 1;
+    }
+}
+
+/// Placeholder for the spec's "get an XML encoding" step.
+///
+/// Not implemented yet: it takes no input and always returns `None`.
+fn get_an_xml_encoding() -> Option<&'static Encoding> {
+    // TODO NOT IMPLEMENTED
+    None
+}
--- a/quickpeep_raker/Cargo.toml
+++ b/quickpeep_raker/Cargo.toml
@ -58,6 +58,7 @@ arc-interner = "0.7.0"
 smartstring = "1.0.0"
 signal-hook = "0.3.13"
 nix = "0.23.1"
+quickpeep_html_charset_detection = { version = "0.1.0", path = "../quickpeep_html_charset_detection" }

 ### Raking helpers
 # HTTP Requests