Pass the bytes through when extracting HTML
This commit is contained in:
parent
c783f89f72
commit
c451a12e44
@ -396,7 +396,7 @@ impl Raker {
|
||||
{
|
||||
// We don't try any fallbacks for an HTML page
|
||||
return Ok(self
|
||||
.rake_html_page(&content, url, is_cf, &headers)
|
||||
.rake_html_page(content, url, is_cf, &headers)
|
||||
.await
|
||||
.context("Raking HTML page")?);
|
||||
}
|
||||
@ -445,16 +445,14 @@ impl Raker {
|
||||
|
||||
pub async fn rake_html_page(
|
||||
&self,
|
||||
content: &[u8],
|
||||
content: Vec<u8>,
|
||||
url: &Url,
|
||||
is_cf: bool,
|
||||
headers: &HeaderMap,
|
||||
) -> anyhow::Result<RakeOutcome> {
|
||||
let content_str = std::str::from_utf8(content)?.to_owned();
|
||||
|
||||
match self
|
||||
.page_extraction
|
||||
.extract(content_str, url.clone(), headers.clone(), is_cf)
|
||||
.extract(content, url.clone(), headers.clone(), is_cf)
|
||||
.await?
|
||||
{
|
||||
ExtractedPage::Success {
|
||||
|
@ -29,7 +29,7 @@ pub struct PageExtractionService {
|
||||
}
|
||||
|
||||
pub struct ExtractionTask {
|
||||
content: String,
|
||||
content: Vec<u8>,
|
||||
url: Url,
|
||||
headers: HeaderMap,
|
||||
is_cf: bool,
|
||||
@ -39,7 +39,7 @@ pub struct ExtractionTask {
|
||||
impl PageExtractionService {
|
||||
pub async fn extract(
|
||||
&self,
|
||||
content: String,
|
||||
content: Vec<u8>,
|
||||
url: Url,
|
||||
headers: HeaderMap,
|
||||
is_cf: bool,
|
||||
@ -111,11 +111,12 @@ struct PageExtractionServiceInternal {
|
||||
impl PageExtractionServiceInternal {
|
||||
fn extract_page(
|
||||
&self,
|
||||
content_str: String,
|
||||
content_bytes: Vec<u8>,
|
||||
url: Url,
|
||||
headers: HeaderMap,
|
||||
is_cf: bool,
|
||||
) -> anyhow::Result<ExtractedPage> {
|
||||
let content_str: &str = todo!();
|
||||
let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
|
||||
|
||||
// See whether this page is at the canonical URL for the page.
|
||||
|
Loading…
Reference in New Issue
Block a user