quickpeep/quickpeep_html_charset_dete.../src/lib.rs

use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
use encoding_rs::Encoding;

pub mod steps;

/// Size, in bytes, of the sniff window.
///
/// The spec requires document authors to place their `<meta>` tags in the first 1024 bytes.
pub const SNIFF_WINDOW_SIZE: usize = 1024;

/// Attempts to implementing the 'certain' stages of the encoding sniffing algorithm described at:
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
///
/// You should pass in the first 1024 bytes as the `sniff_window`.
/// TODO content-type header
pub fn sniff_with_certain_confidence(
    sniff_window: &[u8],
    content_type_header: Option<&[u8]>,
) -> Option<&'static Encoding> {
    // 1. BOM sniffing.
    if sniff_window.len() > BOM_SNIFF_NEEDED_BYTES {
        if let Some(encoding) = bom_sniff(
            sniff_window[0..BOM_SNIFF_NEEDED_BYTES]
                .try_into()
                .expect("checked size cast"),
        ) {
            return Some(encoding);
        }
    }

    // 2. User override (Not implemented)

    // 3. 'Wait for bytes' — we already have 1024

    // 4. If the transport layer specifies an encoding, return as certain.
    extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
}

/// Extracts the encoding named by the `charset` parameter of a `Content-Type` header value.
///
/// The header is split on `;` and each part is treated as a `key=value` pair. The first part
/// whose key equals `charset` (ASCII case-insensitively) decides the result: its value, with
/// any surrounding double quotes removed, is looked up as an encoding label.
///
/// # Returns
///
/// `Some(encoding)` if a `charset` parameter is present and names a known encoding label;
/// `None` if there is no `charset` parameter or its value is missing or unrecognised.
pub fn extract_encoding_from_content_type_header(
    content_type_header: &[u8],
) -> Option<&'static Encoding> {
    for header_part in content_type_header.split(|b| *b == b';') {
        // To-UTF-8-lossy is definitely not spec compliant, but it lets us use the `str`
        // trimming/splitting helpers. Invalid bytes become U+FFFD, which can never be part of
        // the key `charset`.
        let header_part_ascii_ish = String::from_utf8_lossy(header_part);
        let mut key_value = header_part_ascii_ish.trim().split('=');
        let key = key_value.next().unwrap_or("");
        if key.eq_ignore_ascii_case("charset") {
            // The value is the field *after* the `=`; a bare `charset` has no value and
            // therefore yields no encoding.
            let value = key_value.next().unwrap_or("");
            // Accept the quoted-string form, e.g. `charset="utf-8"`.
            let label = value.trim().trim_matches('"');
            return Encoding::for_label(label.as_bytes());
        }
    }
    None
}