Use the sniffed encoding in page extraction
This commit is contained in:
parent
5701b1e6d8
commit
aa4567c623
@ -44,14 +44,14 @@ pub fn sniff(
|
||||
sniff_window: &[u8],
|
||||
is_sniff_window_the_entire_document: bool,
|
||||
content_type_header: Option<&[u8]>,
|
||||
) -> Option<&'static Encoding> {
|
||||
) -> &'static Encoding {
|
||||
if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
|
||||
return Some(certain);
|
||||
return certain;
|
||||
}
|
||||
|
||||
// 5. Optionally prescan the byte stream to determine its encoding
|
||||
if let Some(prescan_tentative) = prescan(sniff_window) {
|
||||
return Some(prescan_tentative);
|
||||
return prescan_tentative;
|
||||
}
|
||||
|
||||
// 8. The user agent may attempt to autodetect the character encoding from applying frequency
|
||||
@ -63,7 +63,7 @@ pub fn sniff(
|
||||
detector.feed(sniff_window, is_sniff_window_the_entire_document);
|
||||
// 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
|
||||
// We do that here, though I'm not sure this is what we actually want outside of a browser...
|
||||
Some(detector.guess(None, false))
|
||||
detector.guess(None, false)
|
||||
}
|
||||
|
||||
pub fn extract_encoding_from_content_type_header(
|
||||
|
@ -12,11 +12,13 @@ use itertools::Itertools;
|
||||
use kuchiki::NodeRef;
|
||||
use log::{debug, error, trace, warn};
|
||||
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||
use quickpeep_html_charset_detection::sniff;
|
||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||
use quickpeep_utils::lazy::Lazy;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::borrow::Borrow;
|
||||
use tokio::runtime;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
@ -116,8 +118,16 @@ impl PageExtractionServiceInternal {
|
||||
headers: HeaderMap,
|
||||
is_cf: bool,
|
||||
) -> anyhow::Result<ExtractedPage> {
|
||||
let content_str: &str = todo!();
|
||||
let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
|
||||
let encoding = sniff(
|
||||
&content_bytes,
|
||||
true,
|
||||
headers.get("content-type").map(|hv| hv.as_bytes()),
|
||||
);
|
||||
let (content_text, _actual_codec_used, replacements_made) = encoding.decode(&content_bytes);
|
||||
if replacements_made {
|
||||
warn!("Character replacements made!");
|
||||
}
|
||||
let root_node: NodeRef = kuchiki::parse_html().one(content_text.borrow());
|
||||
|
||||
// See whether this page is at the canonical URL for the page.
|
||||
// If it's not, then we redirect the raker to the canonical URL.
|
||||
|
Loading…
Reference in New Issue
Block a user