Add full-procedure sniffer

This commit is contained in:
Olivier 'reivilibre' 2022-06-12 15:41:17 +01:00
parent c451a12e44
commit 5701b1e6d8
3 changed files with 49 additions and 4 deletions

12
Cargo.lock generated
View File

@ -523,6 +523,17 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chardetng"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
dependencies = [
"cfg-if",
"encoding_rs",
"memchr",
]
[[package]]
name = "chrono"
version = "0.4.19"
@ -3716,6 +3727,7 @@ dependencies = [
name = "quickpeep_html_charset_detection"
version = "0.1.0"
dependencies = [
"chardetng",
"encoding_rs",
"subslice",
]

View File

@ -8,3 +8,4 @@ edition = "2021"
[dependencies]
encoding_rs = "0.8.31"
subslice = "0.2.3"
chardetng = "0.1.17"

View File

@ -1,4 +1,5 @@
use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
use crate::steps::{bom_sniff, prescan, BOM_SNIFF_NEEDED_BYTES};
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
pub mod steps;
@ -6,11 +7,10 @@ pub mod steps;
/// Size, in bytes, of the sniff window.
///
/// The spec requires document authors to place their <meta> tags in the first 1024 bytes.
pub const SNIFF_WINDOW_SIZE: usize = 1024;
/// Attempts to implementing the 'certain' stages of the encoding sniffing algorithm described at:
/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at:
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
///
/// You should pass in the first 1024 bytes as the `sniff_window`.
/// TODO content-type header
/// You should pass in at least the first 3 bytes as the `sniff_window` here.
pub fn sniff_with_certain_confidence(
sniff_window: &[u8],
content_type_header: Option<&[u8]>,
@ -34,6 +34,38 @@ pub fn sniff_with_certain_confidence(
extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
}
/// Runs all the stages of the encoding sniffing algorithm described at:
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
/// except for those stages that rely on other information (e.g. 'prior knowledge', a user
/// default or a parent window context).
///
/// The stages are tried in order and the first one to yield an encoding wins:
/// 1. the 'certain' stages, via `sniff_with_certain_confidence`;
/// 2. the prescan of the byte stream;
/// 3. autodetection with `chardetng`.
///
/// You should pass in at least the first 1024 bytes as the `sniff_window` here.
/// `is_sniff_window_the_entire_document` is forwarded to the detector alongside the window.
pub fn sniff(
    sniff_window: &[u8],
    is_sniff_window_the_entire_document: bool,
    content_type_header: Option<&[u8]>,
) -> Option<&'static Encoding> {
    sniff_with_certain_confidence(sniff_window, content_type_header)
        // 5. Optionally prescan the byte stream to determine its encoding
        .or_else(|| prescan(sniff_window))
        // 8. The user agent may attempt to autodetect the character encoding from applying
        //    frequency analysis or other algorithms to the data stream. Such algorithms may use
        //    information about the resource other than the resource's contents, including the
        //    address of the resource. If autodetection succeeds in determining a character
        //    encoding, and that encoding is a supported encoding, then return that encoding,
        //    with the confidence tentative.
        .or_else(|| {
            let mut autodetector = EncodingDetector::new();
            autodetector.feed(sniff_window, is_sniff_window_the_entire_document);

            // 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
            // We do that here, though I'm not sure this is what we actually want outside of a
            // browser...
            let allow_utf8 = false;
            Some(autodetector.guess(None, allow_utf8))
        })
}
pub fn extract_encoding_from_content_type_header(
content_type_header: &[u8],
) -> Option<&'static Encoding> {