Add tests and fixes
This commit is contained in:
parent
aa4567c623
commit
4896ecd426
@ -76,9 +76,39 @@ pub fn extract_encoding_from_content_type_header(
|
|||||||
let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect();
|
let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect();
|
||||||
let key = key_value.get(0).cloned().unwrap_or("");
|
let key = key_value.get(0).cloned().unwrap_or("");
|
||||||
if key.to_ascii_lowercase() == "charset" {
|
if key.to_ascii_lowercase() == "charset" {
|
||||||
let value = key_value.get(0).cloned().unwrap_or("");
|
let value = key_value.get(1).cloned().unwrap_or("");
|
||||||
return Encoding::for_label(value.as_bytes());
|
return Encoding::for_label(value.as_bytes());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use crate::sniff;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_simple_cases() {
|
||||||
|
assert_eq!(sniff(b"<u>hi</u>", true, None).name(), "windows-1252");
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
sniff(b"<meta charset=UTF8><u>hi</u>", true, None).name(),
|
||||||
|
"UTF-8"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
sniff(
|
||||||
|
b"<meta charset=UTF8><u>hi</u>",
|
||||||
|
true,
|
||||||
|
Some("text/html; charset=Shift-JIS".as_bytes())
|
||||||
|
)
|
||||||
|
.name(),
|
||||||
|
"Shift_JIS"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
sniff(b"<!-- haha we wish <meta charset=UTF8> --><meta http-equiv='content-type' content='text/html; charset=Shift-JIS'><u>hi</u>", true, None).name(),
|
||||||
|
"Shift_JIS"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -39,12 +39,12 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
|||||||
// Loop: If position points to:
|
// Loop: If position points to:
|
||||||
while position < bytes.len() {
|
while position < bytes.len() {
|
||||||
// A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
|
// A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
|
||||||
if bytes[position..].starts_with(&[0x3C, 0x21, 0x2D, 0x2D]) {
|
if bytes[position..].starts_with(b"<!--") {
|
||||||
// Advance the position pointer so that it points at the first 0x3E byte which is
|
// Advance the position pointer so that it points at the first 0x3E byte which is
|
||||||
// preceded by two 0x2D bytes (i.e. at the end of an ASCII '-->' sequence) and comes
|
// preceded by two 0x2D bytes (i.e. at the end of an ASCII '-->' sequence) and comes
|
||||||
// after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in
|
// after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in
|
||||||
// the '<!--' sequence.)
|
// the '<!--' sequence.)
|
||||||
match bytes[position..].find(&[0x2D, 0x2D, 0x3C]) {
|
match bytes[position..].find(b"-->") {
|
||||||
Some(location_of_closer) => {
|
Some(location_of_closer) => {
|
||||||
position += location_of_closer + 3;
|
position += location_of_closer + 3;
|
||||||
continue;
|
continue;
|
||||||
@ -56,12 +56,15 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII '<meta' followed by a space or slash)
|
// A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII '<meta' followed by a space or slash)
|
||||||
let check = bytes[position..position + b"<meta ".len()].to_ascii_lowercase();
|
let check = bytes
|
||||||
|
.get(position..position + b"<meta ".len())?
|
||||||
|
.to_ascii_lowercase();
|
||||||
let ends_in_whitespace = bytes
|
let ends_in_whitespace = bytes
|
||||||
.get(position + b"<meta".len())
|
.get(position + b"<meta".len())
|
||||||
.map(|c| matches!(c, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F))
|
.map(|c| matches!(c, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F))
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
if check.starts_with(b"<meta") && ends_in_whitespace {
|
if check.starts_with(b"<meta") && ends_in_whitespace {
|
||||||
|
println!("c");
|
||||||
// Advance the position pointer so that it points at the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F byte (the one in sequence of characters matched above).
|
// Advance the position pointer so that it points at the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F byte (the one in sequence of characters matched above).
|
||||||
position += 6;
|
position += 6;
|
||||||
|
|
||||||
@ -80,6 +83,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
|||||||
|
|
||||||
// Attributes: Get an attribute and its value. If no attribute was sniffed, then jump to the processing step below.
|
// Attributes: Get an attribute and its value. If no attribute was sniffed, then jump to the processing step below.
|
||||||
while let Some((key, value)) = prescan_get_attribute(&bytes, &mut position) {
|
while let Some((key, value)) = prescan_get_attribute(&bytes, &mut position) {
|
||||||
|
println!("att {key:?} {value:?}");
|
||||||
// If the attribute's name is already in attribute list, then return to the step labeled attributes.
|
// If the attribute's name is already in attribute list, then return to the step labeled attributes.
|
||||||
if attributes.contains(&key) {
|
if attributes.contains(&key) {
|
||||||
continue;
|
continue;
|
||||||
@ -218,6 +222,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
|||||||
pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<u8>, Vec<u8>)> {
|
pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<u8>, Vec<u8>)> {
|
||||||
// If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
|
// If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
|
||||||
// or 0x2F (/) then advance position to the next byte and redo this step.
|
// or 0x2F (/) then advance position to the next byte and redo this step.
|
||||||
|
println!("A{position}");
|
||||||
while matches!(
|
while matches!(
|
||||||
bytes.get(*position),
|
bytes.get(*position),
|
||||||
Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
|
Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
|
||||||
@ -260,6 +265,8 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
|
|||||||
attribute_name.push(new_byte.to_ascii_lowercase());
|
attribute_name.push(new_byte.to_ascii_lowercase());
|
||||||
|
|
||||||
// Advance position to the next byte and return to the previous step.
|
// Advance position to the next byte and return to the previous step.
|
||||||
|
*position += 1;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) then advance position to the next byte, then, repeat this step.
|
// Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) then advance position to the next byte, then, repeat this step.
|
||||||
@ -323,7 +330,10 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
|
|||||||
// Process the byte at position as follows:
|
// Process the byte at position as follows:
|
||||||
//
|
//
|
||||||
// If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
|
// If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
|
||||||
if matches!(bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20)) {
|
if matches!(
|
||||||
|
bytes.get(*position),
|
||||||
|
Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E)
|
||||||
|
) {
|
||||||
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
|
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
|
||||||
return Some((attribute_name, attribute_value));
|
return Some((attribute_name, attribute_value));
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user