53 lines
1.9 KiB
Rust
53 lines
1.9 KiB
Rust
use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
|
|
use encoding_rs::Encoding;
|
|
|
|
pub mod steps;
|
|
|
|
/// The spec requires document authors to place their `<meta>` tags in the first 1024 bytes.
|
|
pub const SNIFF_WINDOW_SIZE: usize = 1024;
|
|
|
|
/// Attempts to implementing the 'certain' stages of the encoding sniffing algorithm described at:
|
|
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
|
///
|
|
/// You should pass in the first 1024 bytes as the `sniff_window`.
|
|
/// TODO content-type header
|
|
pub fn sniff_with_certain_confidence(
|
|
sniff_window: &[u8],
|
|
content_type_header: Option<&[u8]>,
|
|
) -> Option<&'static Encoding> {
|
|
// 1. BOM sniffing.
|
|
if sniff_window.len() > BOM_SNIFF_NEEDED_BYTES {
|
|
if let Some(encoding) = bom_sniff(
|
|
sniff_window[0..BOM_SNIFF_NEEDED_BYTES]
|
|
.try_into()
|
|
.expect("checked size cast"),
|
|
) {
|
|
return Some(encoding);
|
|
}
|
|
}
|
|
|
|
// 2. User override (Not implemented)
|
|
|
|
// 3. 'Wait for bytes' — we already have 1024
|
|
|
|
// 4. If the transport layer specifies an encoding, return as certain.
|
|
extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
|
|
}
|
|
|
|
pub fn extract_encoding_from_content_type_header(
|
|
content_type_header: &[u8],
|
|
) -> Option<&'static Encoding> {
|
|
for header_part in content_type_header.split(|b| *b == b';') {
|
|
// To-UTF-8-lossy is definitely not spec compliant, but trim_ascii() on byte slices is
|
|
// unstable, so let's just help ourselves out of a pickle.
|
|
let header_part_ascii_ish = String::from_utf8_lossy(header_part);
|
|
let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect();
|
|
let key = key_value.get(0).cloned().unwrap_or("");
|
|
if key.to_ascii_lowercase() == "charset" {
|
|
let value = key_value.get(0).cloned().unwrap_or("");
|
|
return Encoding::for_label(value.as_bytes());
|
|
}
|
|
}
|
|
None
|
|
}
|