quickpeep/quickpeep_html_charset_dete.../src/lib.rs

53 lines
1.9 KiB
Rust

use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
use encoding_rs::Encoding;
pub mod steps;
/// Size, in bytes, of the sniff window taken from the start of a document.
///
/// The spec requires document authors to place their `<meta>` tags in the first 1024 bytes.
pub const SNIFF_WINDOW_SIZE: usize = 1024;
/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at:
/// <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
///
/// You should pass in the first 1024 bytes (see [`SNIFF_WINDOW_SIZE`]) of the document as the
/// `sniff_window`. A shorter window is accepted; BOM sniffing is simply skipped when fewer than
/// [`BOM_SNIFF_NEEDED_BYTES`] bytes are available.
///
/// `content_type_header` is the raw value of the transport layer's `Content-Type` header, if
/// one was supplied.
///
/// Returns `Some(encoding)` when a BOM or the `Content-Type` header determines the encoding,
/// and `None` otherwise.
pub fn sniff_with_certain_confidence(
    sniff_window: &[u8],
    content_type_header: Option<&[u8]>,
) -> Option<&'static Encoding> {
    // 1. BOM sniffing.
    // `>=` (not `>`): a window holding exactly the needed number of bytes is enough to sniff.
    if sniff_window.len() >= BOM_SNIFF_NEEDED_BYTES {
        if let Some(encoding) = bom_sniff(
            sniff_window[..BOM_SNIFF_NEEDED_BYTES]
                .try_into()
                .expect("slice length was checked to be BOM_SNIFF_NEEDED_BYTES"),
        ) {
            return Some(encoding);
        }
    }

    // 2. User override (Not implemented)

    // 3. 'Wait for bytes' — the caller is expected to have supplied the sniff window already.

    // 4. If the transport layer specifies an encoding, return as certain.
    //    A missing header is treated like an empty one, which yields `None`.
    extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
}
/// Extracts the encoding named by the `charset` parameter of a `Content-Type` header value.
///
/// The header value is split on `;` and every part is examined as a `key=value` pair. Parts
/// without an `=` (such as the leading media type) are skipped. The key is compared
/// ASCII-case-insensitively after trimming surrounding whitespace.
///
/// The first `charset` parameter found decides the result: its value, trimmed and with one
/// pair of surrounding double quotes removed if present, is looked up with
/// [`Encoding::for_label`].
///
/// Returns `None` if there is no `charset` parameter or its label is not a known encoding.
pub fn extract_encoding_from_content_type_header(
    content_type_header: &[u8],
) -> Option<&'static Encoding> {
    for header_part in content_type_header.split(|b| *b == b';') {
        // To-UTF-8-lossy is definitely not spec compliant, but trim_ascii() on byte slices is
        // unstable, so let's just help ourselves out of a pickle.
        let header_part = String::from_utf8_lossy(header_part);

        // Split at the first `=` only, so the value is kept intact.
        if let Some((key, value)) = header_part.split_once('=') {
            if key.trim().eq_ignore_ascii_case("charset") {
                let value = value.trim();
                // Parameter values may be given as a quoted string: charset="utf-8".
                let label = value
                    .strip_prefix('"')
                    .and_then(|rest| rest.strip_suffix('"'))
                    .unwrap_or(value);
                return Encoding::for_label(label.as_bytes());
            }
        }
    }
    None
}