Use the sniffed encoding in page extraction
This commit is contained in:
parent
5701b1e6d8
commit
aa4567c623
|
@ -44,14 +44,14 @@ pub fn sniff(
|
||||||
sniff_window: &[u8],
|
sniff_window: &[u8],
|
||||||
is_sniff_window_the_entire_document: bool,
|
is_sniff_window_the_entire_document: bool,
|
||||||
content_type_header: Option<&[u8]>,
|
content_type_header: Option<&[u8]>,
|
||||||
) -> Option<&'static Encoding> {
|
) -> &'static Encoding {
|
||||||
if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
|
if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
|
||||||
return Some(certain);
|
return certain;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 5. Optionally prescan the byte stream to determine its encoding
|
// 5. Optionally prescan the byte stream to determine its encoding
|
||||||
if let Some(prescan_tentative) = prescan(sniff_window) {
|
if let Some(prescan_tentative) = prescan(sniff_window) {
|
||||||
return Some(prescan_tentative);
|
return prescan_tentative;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 8. The user agent may attempt to autodetect the character encoding from applying frequency
|
// 8. The user agent may attempt to autodetect the character encoding from applying frequency
|
||||||
|
@ -63,7 +63,7 @@ pub fn sniff(
|
||||||
detector.feed(sniff_window, is_sniff_window_the_entire_document);
|
detector.feed(sniff_window, is_sniff_window_the_entire_document);
|
||||||
// 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
|
// 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
|
||||||
// We do that here, though I'm not sure this is what we actually want outside of a browser...
|
// We do that here, though I'm not sure this is what we actually want outside of a browser...
|
||||||
Some(detector.guess(None, false))
|
detector.guess(None, false)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn extract_encoding_from_content_type_header(
|
pub fn extract_encoding_from_content_type_header(
|
||||||
|
|
|
@ -12,11 +12,13 @@ use itertools::Itertools;
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
use log::{debug, error, trace, warn};
|
use log::{debug, error, trace, warn};
|
||||||
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||||
|
use quickpeep_html_charset_detection::sniff;
|
||||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||||
use quickpeep_utils::lazy::Lazy;
|
use quickpeep_utils::lazy::Lazy;
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::borrow::Borrow;
|
||||||
use tokio::runtime;
|
use tokio::runtime;
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
use tokio::sync::{mpsc, oneshot};
|
use tokio::sync::{mpsc, oneshot};
|
||||||
|
@ -116,8 +118,16 @@ impl PageExtractionServiceInternal {
|
||||||
headers: HeaderMap,
|
headers: HeaderMap,
|
||||||
is_cf: bool,
|
is_cf: bool,
|
||||||
) -> anyhow::Result<ExtractedPage> {
|
) -> anyhow::Result<ExtractedPage> {
|
||||||
let content_str: &str = todo!();
|
let encoding = sniff(
|
||||||
let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
|
&content_bytes,
|
||||||
|
true,
|
||||||
|
headers.get("content-type").map(|hv| hv.as_bytes()),
|
||||||
|
);
|
||||||
|
let (content_text, _actual_codec_used, replacements_made) = encoding.decode(&content_bytes);
|
||||||
|
if replacements_made {
|
||||||
|
warn!("Character replacements made!");
|
||||||
|
}
|
||||||
|
let root_node: NodeRef = kuchiki::parse_html().one(content_text.borrow());
|
||||||
|
|
||||||
// See whether this page is at the canonical URL for the page.
|
// See whether this page is at the canonical URL for the page.
|
||||||
// If it's not, then we redirect the raker to the canonical URL.
|
// If it's not, then we redirect the raker to the canonical URL.
|
||||||
|
|
Loading…
Reference in New Issue