Add full-procedure sniffer
This commit is contained in:
parent
c451a12e44
commit
5701b1e6d8
|
@ -523,6 +523,17 @@ version = "1.0.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chardetng"
|
||||||
|
version = "0.1.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"encoding_rs",
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "chrono"
|
name = "chrono"
|
||||||
version = "0.4.19"
|
version = "0.4.19"
|
||||||
|
@ -3716,6 +3727,7 @@ dependencies = [
|
||||||
name = "quickpeep_html_charset_detection"
|
name = "quickpeep_html_charset_detection"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"chardetng",
|
||||||
"encoding_rs",
|
"encoding_rs",
|
||||||
"subslice",
|
"subslice",
|
||||||
]
|
]
|
||||||
|
|
|
@ -8,3 +8,4 @@ edition = "2021"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
encoding_rs = "0.8.31"
|
encoding_rs = "0.8.31"
|
||||||
subslice = "0.2.3"
|
subslice = "0.2.3"
|
||||||
|
chardetng = "0.1.17"
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
|
use crate::steps::{bom_sniff, prescan, BOM_SNIFF_NEEDED_BYTES};
|
||||||
|
use chardetng::EncodingDetector;
|
||||||
use encoding_rs::Encoding;
|
use encoding_rs::Encoding;
|
||||||
|
|
||||||
pub mod steps;
|
pub mod steps;
|
||||||
|
@ -6,11 +7,10 @@ pub mod steps;
|
||||||
/// The spec requires document authors to place their <meta> tags in the first 1024 bytes.
|
/// The spec requires document authors to place their <meta> tags in the first 1024 bytes.
|
||||||
pub const SNIFF_WINDOW_SIZE: usize = 1024;
|
pub const SNIFF_WINDOW_SIZE: usize = 1024;
|
||||||
|
|
||||||
/// Attempts to implementing the 'certain' stages of the encoding sniffing algorithm described at:
|
/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at:
|
||||||
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||||
///
|
///
|
||||||
/// You should pass in the first 1024 bytes as the `sniff_window`.
|
/// You should pass in at least the first 3 bytes as the `sniff_window` here.
|
||||||
/// TODO content-type header
|
|
||||||
pub fn sniff_with_certain_confidence(
|
pub fn sniff_with_certain_confidence(
|
||||||
sniff_window: &[u8],
|
sniff_window: &[u8],
|
||||||
content_type_header: Option<&[u8]>,
|
content_type_header: Option<&[u8]>,
|
||||||
|
@ -34,6 +34,38 @@ pub fn sniff_with_certain_confidence(
|
||||||
extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
|
extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Implements all the stages of the encoding sniffing algorithm described at:
|
||||||
|
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||||
|
/// except where those stages rely on other information (e.g. 'prior knowledge', a user default
|
||||||
|
/// or a parent window context).
|
||||||
|
///
|
||||||
|
/// You should pass in at least the first 1024 bytes as the `sniff_window` here.
|
||||||
|
pub fn sniff(
|
||||||
|
sniff_window: &[u8],
|
||||||
|
is_sniff_window_the_entire_document: bool,
|
||||||
|
content_type_header: Option<&[u8]>,
|
||||||
|
) -> Option<&'static Encoding> {
|
||||||
|
if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
|
||||||
|
return Some(certain);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5. Optionally prescan the byte stream to determine its encoding
|
||||||
|
if let Some(prescan_tentative) = prescan(sniff_window) {
|
||||||
|
return Some(prescan_tentative);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 8. The user agent may attempt to autodetect the character encoding from applying frequency
|
||||||
|
// analysis or other algorithms to the data stream. Such algorithms may use information about
|
||||||
|
// the resource other than the resource's contents, including the address of the resource.
|
||||||
|
// If autodetection succeeds in determining a character encoding, and that encoding is a
|
||||||
|
// supported encoding, then return that encoding, with the confidence tentative.
|
||||||
|
let mut detector = EncodingDetector::new();
|
||||||
|
detector.feed(sniff_window, is_sniff_window_the_entire_document);
|
||||||
|
// chardetng's `allow_utf8` argument to `guess` should be `false` for non-`file:` URLs, per its docs.
|
||||||
|
// We do that here, though I'm not sure this is what we actually want outside of a browser...
|
||||||
|
Some(detector.guess(None, false))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn extract_encoding_from_content_type_header(
|
pub fn extract_encoding_from_content_type_header(
|
||||||
content_type_header: &[u8],
|
content_type_header: &[u8],
|
||||||
) -> Option<&'static Encoding> {
|
) -> Option<&'static Encoding> {
|
||||||
|
|
Loading…
Reference in New Issue