Add tests and fixes
This commit is contained in:
parent
aa4567c623
commit
4896ecd426
|
@ -76,9 +76,39 @@ pub fn extract_encoding_from_content_type_header(
|
|||
let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect();
|
||||
let key = key_value.get(0).cloned().unwrap_or("");
|
||||
if key.to_ascii_lowercase() == "charset" {
|
||||
let value = key_value.get(0).cloned().unwrap_or("");
|
||||
let value = key_value.get(1).cloned().unwrap_or("");
|
||||
return Encoding::for_label(value.as_bytes());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use crate::sniff;
|
||||
|
||||
#[test]
|
||||
fn test_simple_cases() {
|
||||
assert_eq!(sniff(b"<u>hi</u>", true, None).name(), "windows-1252");
|
||||
|
||||
assert_eq!(
|
||||
sniff(b"<meta charset=UTF8><u>hi</u>", true, None).name(),
|
||||
"UTF-8"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
sniff(
|
||||
b"<meta charset=UTF8><u>hi</u>",
|
||||
true,
|
||||
Some("text/html; charset=Shift-JIS".as_bytes())
|
||||
)
|
||||
.name(),
|
||||
"Shift_JIS"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
sniff(b"<!-- haha we wish <meta charset=UTF8> --><meta http-equiv='content-type' content='text/html; charset=Shift-JIS'><u>hi</u>", true, None).name(),
|
||||
"Shift_JIS"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,12 +39,12 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
|||
// Loop: If position points to:
|
||||
while position < bytes.len() {
|
||||
// A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
|
||||
if bytes[position..].starts_with(&[0x3C, 0x21, 0x2D, 0x2D]) {
|
||||
if bytes[position..].starts_with(b"<!--") {
|
||||
// Advance the position pointer so that it points at the first 0x3E byte which is
|
||||
// preceded by two 0x2D bytes (i.e. at the end of an ASCII '-->' sequence) and comes
|
||||
// after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in
|
||||
// the '<!--' sequence.)
|
||||
match bytes[position..].find(&[0x2D, 0x2D, 0x3C]) {
|
||||
match bytes[position..].find(b"-->") {
|
||||
Some(location_of_closer) => {
|
||||
position += location_of_closer + 3;
|
||||
continue;
|
||||
|
@ -56,12 +56,15 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
|||
}
|
||||
|
||||
// A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII '<meta' followed by a space or slash)
|
||||
let check = bytes[position..position + b"<meta ".len()].to_ascii_lowercase();
|
||||
let check = bytes
|
||||
.get(position..position + b"<meta ".len())?
|
||||
.to_ascii_lowercase();
|
||||
let ends_in_whitespace = bytes
|
||||
.get(position + b"<meta".len())
|
||||
.map(|c| matches!(c, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F))
|
||||
.unwrap_or(false);
|
||||
if check.starts_with(b"<meta") && ends_in_whitespace {
|
||||
println!("c");
|
||||
// Advance the position pointer so that it points at the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F byte (the one in sequence of characters matched above).
|
||||
position += 6;
|
||||
|
||||
|
@ -80,6 +83,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
|||
|
||||
// Attributes: Get an attribute and its value. If no attribute was sniffed, then jump to the processing step below.
|
||||
while let Some((key, value)) = prescan_get_attribute(&bytes, &mut position) {
|
||||
println!("att {key:?} {value:?}");
|
||||
// If the attribute's name is already in attribute list, then return to the step labeled attributes.
|
||||
if attributes.contains(&key) {
|
||||
continue;
|
||||
|
@ -218,6 +222,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
|||
pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<u8>, Vec<u8>)> {
|
||||
// If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
|
||||
// or 0x2F (/) then advance position to the next byte and redo this step.
|
||||
println!("A{position}");
|
||||
while matches!(
|
||||
bytes.get(*position),
|
||||
Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
|
||||
|
@ -260,6 +265,8 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
|
|||
attribute_name.push(new_byte.to_ascii_lowercase());
|
||||
|
||||
// Advance position to the next byte and return to the previous step.
|
||||
*position += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) then advance position to the next byte, then, repeat this step.
|
||||
|
@ -323,7 +330,10 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
|
|||
// Process the byte at position as follows:
|
||||
//
|
||||
// If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
|
||||
if matches!(bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20)) {
|
||||
if matches!(
|
||||
bytes.get(*position),
|
||||
Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E)
|
||||
) {
|
||||
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
|
||||
return Some((attribute_name, attribute_value));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue