Add full-procedure sniffer
This commit is contained in:
parent
c451a12e44
commit
5701b1e6d8
|
@ -523,6 +523,17 @@ version = "1.0.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chardetng"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.19"
|
||||
|
@ -3716,6 +3727,7 @@ dependencies = [
|
|||
name = "quickpeep_html_charset_detection"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"chardetng",
|
||||
"encoding_rs",
|
||||
"subslice",
|
||||
]
|
||||
|
|
|
@ -8,3 +8,4 @@ edition = "2021"
|
|||
[dependencies]
|
||||
encoding_rs = "0.8.31"
|
||||
subslice = "0.2.3"
|
||||
chardetng = "0.1.17"
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use crate::steps::{bom_sniff, BOM_SNIFF_NEEDED_BYTES};
|
||||
use crate::steps::{bom_sniff, prescan, BOM_SNIFF_NEEDED_BYTES};
|
||||
use chardetng::EncodingDetector;
|
||||
use encoding_rs::Encoding;
|
||||
|
||||
pub mod steps;
|
||||
|
@ -6,11 +7,10 @@ pub mod steps;
|
|||
/// The spec requires document authors to place their <meta> tags in the first 1024 bytes.
|
||||
pub const SNIFF_WINDOW_SIZE: usize = 1024;
|
||||
|
||||
/// Attempts to implementing the 'certain' stages of the encoding sniffing algorithm described at:
|
||||
/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at:
|
||||
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
///
|
||||
/// You should pass in the first 1024 bytes as the `sniff_window`.
|
||||
/// TODO content-type header
|
||||
/// You should pass in at least the first 3 bytes as the `sniff_window` here.
|
||||
pub fn sniff_with_certain_confidence(
|
||||
sniff_window: &[u8],
|
||||
content_type_header: Option<&[u8]>,
|
||||
|
@ -34,6 +34,38 @@ pub fn sniff_with_certain_confidence(
|
|||
extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
|
||||
}
|
||||
|
||||
/// Implementing the all the stages of the encoding sniffing algorithm described at:
|
||||
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
/// except where those stages rely on other information (e.g. 'prior knowledge', a user default
|
||||
/// or a parent window context).
|
||||
///
|
||||
/// You should pass in at least the first 1024 bytes as the `sniff_window` here.
|
||||
pub fn sniff(
|
||||
sniff_window: &[u8],
|
||||
is_sniff_window_the_entire_document: bool,
|
||||
content_type_header: Option<&[u8]>,
|
||||
) -> Option<&'static Encoding> {
|
||||
if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
|
||||
return Some(certain);
|
||||
}
|
||||
|
||||
// 5. Optionally prescan the byte stream to determine its encoding
|
||||
if let Some(prescan_tentative) = prescan(sniff_window) {
|
||||
return Some(prescan_tentative);
|
||||
}
|
||||
|
||||
// 8. The user agent may attempt to autodetect the character encoding from applying frequency
|
||||
// analysis or other algorithms to the data stream. Such algorithms may use information about
|
||||
// the resource other than the resource's contents, including the address of the resource.
|
||||
// If autodetection succeeds in determining a character encoding, and that encoding is a
|
||||
// supported encoding, then return that encoding, with the confidence tentative.
|
||||
let mut detector = EncodingDetector::new();
|
||||
detector.feed(sniff_window, is_sniff_window_the_entire_document);
|
||||
// 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
|
||||
// We do that here, though I'm not sure this is what we actually want outside of a browser...
|
||||
Some(detector.guess(None, false))
|
||||
}
|
||||
|
||||
pub fn extract_encoding_from_content_type_header(
|
||||
content_type_header: &[u8],
|
||||
) -> Option<&'static Encoding> {
|
||||
|
|
Loading…
Reference in New Issue