Pass the bytes through when extracting HTML
This commit is contained in:
parent
c783f89f72
commit
c451a12e44
@ -396,7 +396,7 @@ impl Raker {
|
|||||||
{
|
{
|
||||||
// We don't try any fallbacks for an HTML page
|
// We don't try any fallbacks for an HTML page
|
||||||
return Ok(self
|
return Ok(self
|
||||||
.rake_html_page(&content, url, is_cf, &headers)
|
.rake_html_page(content, url, is_cf, &headers)
|
||||||
.await
|
.await
|
||||||
.context("Raking HTML page")?);
|
.context("Raking HTML page")?);
|
||||||
}
|
}
|
||||||
@ -445,16 +445,14 @@ impl Raker {
|
|||||||
|
|
||||||
pub async fn rake_html_page(
|
pub async fn rake_html_page(
|
||||||
&self,
|
&self,
|
||||||
content: &[u8],
|
content: Vec<u8>,
|
||||||
url: &Url,
|
url: &Url,
|
||||||
is_cf: bool,
|
is_cf: bool,
|
||||||
headers: &HeaderMap,
|
headers: &HeaderMap,
|
||||||
) -> anyhow::Result<RakeOutcome> {
|
) -> anyhow::Result<RakeOutcome> {
|
||||||
let content_str = std::str::from_utf8(content)?.to_owned();
|
|
||||||
|
|
||||||
match self
|
match self
|
||||||
.page_extraction
|
.page_extraction
|
||||||
.extract(content_str, url.clone(), headers.clone(), is_cf)
|
.extract(content, url.clone(), headers.clone(), is_cf)
|
||||||
.await?
|
.await?
|
||||||
{
|
{
|
||||||
ExtractedPage::Success {
|
ExtractedPage::Success {
|
||||||
|
@ -29,7 +29,7 @@ pub struct PageExtractionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct ExtractionTask {
|
pub struct ExtractionTask {
|
||||||
content: String,
|
content: Vec<u8>,
|
||||||
url: Url,
|
url: Url,
|
||||||
headers: HeaderMap,
|
headers: HeaderMap,
|
||||||
is_cf: bool,
|
is_cf: bool,
|
||||||
@ -39,7 +39,7 @@ pub struct ExtractionTask {
|
|||||||
impl PageExtractionService {
|
impl PageExtractionService {
|
||||||
pub async fn extract(
|
pub async fn extract(
|
||||||
&self,
|
&self,
|
||||||
content: String,
|
content: Vec<u8>,
|
||||||
url: Url,
|
url: Url,
|
||||||
headers: HeaderMap,
|
headers: HeaderMap,
|
||||||
is_cf: bool,
|
is_cf: bool,
|
||||||
@ -111,11 +111,12 @@ struct PageExtractionServiceInternal {
|
|||||||
impl PageExtractionServiceInternal {
|
impl PageExtractionServiceInternal {
|
||||||
fn extract_page(
|
fn extract_page(
|
||||||
&self,
|
&self,
|
||||||
content_str: String,
|
content_bytes: Vec<u8>,
|
||||||
url: Url,
|
url: Url,
|
||||||
headers: HeaderMap,
|
headers: HeaderMap,
|
||||||
is_cf: bool,
|
is_cf: bool,
|
||||||
) -> anyhow::Result<ExtractedPage> {
|
) -> anyhow::Result<ExtractedPage> {
|
||||||
|
let content_str: &str = todo!();
|
||||||
let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
|
let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
|
||||||
|
|
||||||
// See whether this page is at the canonical URL for the page.
|
// See whether this page is at the canonical URL for the page.
|
||||||
|
Loading…
Reference in New Issue
Block a user