Add more work towards raking all the different formats
This commit is contained in:
parent
db5524eb52
commit
210e8ef10a
63
Cargo.lock
generated
63
Cargo.lock
generated
@ -196,6 +196,16 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cylon"
|
||||
version = "0.2.0"
|
||||
source = "git+https://github.com/reivilibre/cylon.git?branch=rei/fix_import#12cb6861d6fbd28151bf7befede910b82436034a"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.17"
|
||||
@ -347,6 +357,23 @@ version = "0.3.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3"
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b"
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.21"
|
||||
@ -366,9 +393,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -422,9 +453,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e"
|
||||
checksum = "62eeb471aa3e3c9197aa4bfeabfe02982f6dc96f750486c0bb0009ac58b26d2b"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
@ -709,14 +740,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2"
|
||||
checksum = "7ba42135c6a5917b9db9cd7b293e5409e1c6b041e6f9825e92e55a894c63b6f8"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"miow",
|
||||
"ntapi",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
@ -1029,11 +1061,15 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"adblock",
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"cylon",
|
||||
"env_logger",
|
||||
"feed-rs",
|
||||
"gemini-fetch",
|
||||
"html5ever",
|
||||
"kuchiki",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"quickpeep_moz_readability",
|
||||
"reqwest",
|
||||
@ -1126,9 +1162,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.5.4"
|
||||
version = "1.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
|
||||
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
@ -1179,6 +1215,7 @@ dependencies = [
|
||||
"serde_urlencoded",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-util",
|
||||
"url",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
@ -1652,9 +1689,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.31"
|
||||
version = "0.1.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6c650a8ef0cd2dd93736f033d21cbd1224c5a967aa0c258d00fcf7dafef9b9f"
|
||||
checksum = "4a1bdf54a7c28a2bbf701e1d2233f6c77f473486b94bee4f9678da5a148dca7f"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"pin-project-lite",
|
||||
@ -1663,9 +1700,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.22"
|
||||
version = "0.1.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23"
|
||||
checksum = "aa31669fa42c09c34d94d8165dd2012e8ff3c66aca50f3bb226b68f216f2706c"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
@ -1774,6 +1811,12 @@ version = "0.10.2+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.79"
|
||||
|
@ -4,3 +4,7 @@ members = [
|
||||
"quickpeep_moz_readability"
|
||||
]
|
||||
|
||||
|
||||
[patch.crates-io]
|
||||
cylon = { git = "https://github.com/reivilibre/cylon.git", branch = "rei/fix_import" }
|
||||
|
||||
|
@ -18,6 +18,11 @@ html5ever = "0.25.1"
|
||||
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
|
||||
chrono = "0.4.19"
|
||||
|
||||
lazy_static = "1.4.0"
|
||||
|
||||
bytes = "1.1.0"
|
||||
|
||||
# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
|
||||
# rkyv = "0.7.35"
|
||||
@ -25,12 +30,12 @@ serde = { version = "1.0.136", features = ["derive"] }
|
||||
|
||||
### Raking helpers
|
||||
# HTTP Requests
|
||||
reqwest = { version = "0.11.9", features = [] }
|
||||
reqwest = { version = "0.11.9", features = ["stream"] }
|
||||
# Gemini Requests
|
||||
# N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those.
|
||||
gemini-fetch = "0.2.1"
|
||||
# Robots.txt
|
||||
# TODO cylon = { version = "0.2.0", features = [] }
|
||||
cylon = { version = "0.2.0", features = ["crawl-delay"] }
|
||||
# RSS/Atom/JSON feeds
|
||||
feed-rs = "1.0.0"
|
||||
# Sitemaps
|
||||
|
@ -1,12 +1,4 @@
|
||||
pub mod raking;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
pub fn test_sitemap() {
|
||||
let mut curs = std::io::Cursor::new("<url><loc>https://lol</loc></url>");
|
||||
let reader = sitemap::reader::SiteMapReader::new(curs);
|
||||
for entry in reader {
|
||||
eprintln!("{:?}", entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
mod test;
|
||||
|
@ -1,28 +1,58 @@
|
||||
use anyhow::{bail, Context};
|
||||
use reqwest::header::HeaderValue;
|
||||
use chrono::{DateTime, FixedOffset, Utc};
|
||||
use cylon::Cylon;
|
||||
use html5ever::tendril::fmt::Slice;
|
||||
use lazy_static::lazy_static;
|
||||
use log::debug;
|
||||
use reqwest::{Client, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sitemap::reader::SiteMapEntity;
|
||||
use std::collections::HashSet;
|
||||
|
||||
mod analysis;
|
||||
|
||||
pub const USER_AGENT: &'static str = "QuickPeepBot";
|
||||
|
||||
pub enum RakeOutcome {
|
||||
RakedPage(RakedPage),
|
||||
RakedFeed(RakedFeed),
|
||||
RakedSitemap(RakedSitemap),
|
||||
RakedFeed(Vec<UrlRaked>),
|
||||
RakedSitemap(Vec<UrlRaked>),
|
||||
/// The page was not canonical, and should not be indexed.
|
||||
/// However here is the URL of the canonical page.
|
||||
NotCanonical {
|
||||
new_url: Url,
|
||||
},
|
||||
TemporaryFailure(TemporaryFailure),
|
||||
PermanentFailure(PermanentFailure),
|
||||
}
|
||||
|
||||
/// A URL discovered while raking a feed, sitemap or robots.txt file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UrlRaked {
    /// The discovered URL.
    pub url: Url,
    /// When the resource was last changed, if the source that listed it said so.
    pub last_changed: Option<DateTime<Utc>>,
    /// What the URL is expected to be (e.g. a page or another sitemap).
    pub intent: RakeIntent,
}
|
||||
|
||||
pub struct RakedPage {}
|
||||
|
||||
pub struct RakedFeed {}
|
||||
/// The decoded contents of a robots.txt file.
pub struct RobotsTxt {
    /// Sitemaps advertised by `Sitemap:` lines.
    pub sitemaps: Vec<UrlRaked>,
    /// Rules compiled by cylon for [`USER_AGENT`].
    pub rules: Cylon,
}
|
||||
|
||||
pub struct RakedSitemap {}
|
||||
|
||||
pub struct TemporaryFailure {}
|
||||
/// Details of a rake that failed for a reason classed as temporary.
pub struct TemporaryFailure {
    /// Why the rake failed.
    pub reason: TemporaryFailureReason,
    /// NOTE(review): presumably the number of seconds to wait before retrying — confirm.
    pub backoff_sec: u32,
}
|
||||
|
||||
/// Details of a rake that failed for a reason classed as permanent.
pub struct PermanentFailure {
    /// Why the rake failed.
    pub reason: PermanentFailureReason,
}
|
||||
|
||||
/// Reasons for a [`TemporaryFailure`].
pub enum TemporaryFailureReason {
    /// A piece of information needed to rake was missing; the string names it
    /// (e.g. `"content-type"` when the response has no such header).
    MissingInformation(String),
}
|
||||
|
||||
pub enum PermanentFailureReason {
|
||||
ResourceDenied(u32),
|
||||
WrongLanguage(String),
|
||||
@ -37,6 +67,20 @@ pub enum RakeIntent {
|
||||
SiteMap,
|
||||
}
|
||||
|
||||
lazy_static! {
    /// Content types for which raking as a sitemap is attempted.
    static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
        HashSet::from_iter(vec!["text/xml", "application/xml",]);
    /// Content types for which raking as a feed is attempted.
    ///
    /// Note that the generic XML types also appear in `SITEMAP_MIME_TYPES`, so a generic XML
    /// response can qualify for both.
    static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
        "text/xml",
        "application/xml",
        "application/atom+xml",
        "application/rss+xml",
        "application/rdf+xml",
        "application/json",
        "application/feed+json"
    ]);
}
|
||||
|
||||
pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> {
|
||||
let response = client.get(url.clone()).send().await?;
|
||||
|
||||
@ -49,31 +93,59 @@ pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Res
|
||||
.to_str()
|
||||
.context("Can't convert content-type to str")?;
|
||||
eprintln!("CT {:?}", content_type);
|
||||
content_type.to_owned()
|
||||
content_type.split(";").next().unwrap().trim().to_owned()
|
||||
} else {
|
||||
// TODO ???
|
||||
"text/html".to_owned()
|
||||
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
|
||||
reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
|
||||
backoff_sec: 86400 * 7,
|
||||
}));
|
||||
};
|
||||
|
||||
let content = response.bytes().await?;
|
||||
|
||||
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {}
|
||||
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {
|
||||
match rake_html_page(&content, url) {
|
||||
Ok(page_rake) => {
|
||||
return Ok(RakeOutcome::RakedPage(page_rake));
|
||||
}
|
||||
Err(error) => {
|
||||
debug!("Failed to rake HTML page: {:?}", error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO JSON Feeds.
|
||||
if content_type == "application/xml"
|
||||
if FEED_MIME_TYPES.contains(content_type.as_str())
|
||||
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
|
||||
{}
|
||||
{
|
||||
match rake_feed(&content, url) {
|
||||
Ok(feed) => {
|
||||
return Ok(RakeOutcome::RakedFeed(feed));
|
||||
}
|
||||
Err(error) => {
|
||||
debug!("Failed to rake as feed: {:?}", error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if content_type == "application/xml"
|
||||
if SITEMAP_MIME_TYPES.contains(content_type.as_str())
|
||||
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
|
||||
{}
|
||||
{
|
||||
match rake_sitemap(&content) {
|
||||
Ok(sitemap) => {
|
||||
return Ok(RakeOutcome::RakedSitemap(sitemap));
|
||||
}
|
||||
Err(error) => {
|
||||
debug!("Failed to rake as sitemap: {:?}", error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
||||
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
|
||||
}));
|
||||
}
|
||||
|
||||
pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
|
||||
pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> {
|
||||
let content_str = std::str::from_utf8(content)?;
|
||||
|
||||
let mut readability = quickpeep_moz_readability::Readability::new(content_str);
|
||||
@ -87,15 +159,136 @@ pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
|
||||
eprintln!("{}", node.to_string());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(todo!())
|
||||
}
|
||||
|
||||
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> {
|
||||
let x = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
|
||||
todo!()
|
||||
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
|
||||
let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
|
||||
|
||||
let mut urls = Vec::new();
|
||||
|
||||
for entry in feed.entries {
|
||||
let link = if let Some(link) = entry.links.get(0) {
|
||||
link
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
|
||||
|
||||
let last_changed = entry.updated.or(entry.published);
|
||||
|
||||
urls.push(UrlRaked {
|
||||
url,
|
||||
last_changed,
|
||||
intent: RakeIntent::Page,
|
||||
});
|
||||
}
|
||||
|
||||
// TODO paginated feeds (e.g. JSON Feed next_url)
|
||||
|
||||
Ok(urls)
|
||||
}
|
||||
|
||||
pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<()> {
|
||||
//let x = sitemap::
|
||||
todo!()
|
||||
pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
|
||||
let curs = std::io::Cursor::new(content);
|
||||
let reader = sitemap::reader::SiteMapReader::new(curs);
|
||||
|
||||
let mut urls = Vec::new();
|
||||
|
||||
for entry in reader {
|
||||
match &entry {
|
||||
SiteMapEntity::Url(url) => {
|
||||
let loc = if let Some(loc) = url.loc.get_url() {
|
||||
loc
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
urls.push(UrlRaked {
|
||||
url: loc,
|
||||
last_changed: url
|
||||
.lastmod
|
||||
.get_time()
|
||||
.map(|dt: DateTime<FixedOffset>| dt.into()),
|
||||
intent: RakeIntent::Page,
|
||||
});
|
||||
}
|
||||
SiteMapEntity::SiteMap(sitemap) => {
|
||||
let loc = if let Some(loc) = sitemap.loc.get_url() {
|
||||
loc
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
urls.push(UrlRaked {
|
||||
url: loc,
|
||||
last_changed: sitemap
|
||||
.lastmod
|
||||
.get_time()
|
||||
.map(|dt: DateTime<FixedOffset>| dt.into()),
|
||||
intent: RakeIntent::SiteMap,
|
||||
});
|
||||
}
|
||||
SiteMapEntity::Err(error) => {
|
||||
debug!("Sitemap error {:?}", error);
|
||||
}
|
||||
}
|
||||
eprintln!("{:?}", entry);
|
||||
}
|
||||
|
||||
if urls.is_empty() {
|
||||
bail!("No URLs or Sitemaps picked up from sitemap; is it bad?");
|
||||
}
|
||||
|
||||
Ok(urls)
|
||||
}
|
||||
|
||||
pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> {
|
||||
let robots_url = url
|
||||
.join("/robots.txt")
|
||||
.context("Whilst resolving /robots.txt on URL")?;
|
||||
let resp = client.get(robots_url.clone()).send().await?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let code = resp.status().as_u16();
|
||||
if code == 404 || code == 410 {
|
||||
// not found or gone? Assume there is intentionally no robots.txt file.
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
bail!("Failed to get {:?}: {:?}", robots_url, resp.status());
|
||||
}
|
||||
|
||||
let bytes = resp.bytes().await?;
|
||||
|
||||
Ok(decode_robots_txt(&bytes).await?)
|
||||
}
|
||||
|
||||
pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> {
|
||||
let mut sitemaps = Vec::new();
|
||||
|
||||
for line in bytes.split(|b| *b == b'\n') {
|
||||
let line = line.to_ascii_lowercase();
|
||||
if line.starts_with(b"sitemap:") {
|
||||
if let Ok(value) = std::str::from_utf8(&line[8..]) {
|
||||
if let Ok(url) = Url::parse(value.trim()) {
|
||||
sitemaps.push(UrlRaked {
|
||||
url,
|
||||
last_changed: None,
|
||||
intent: RakeIntent::SiteMap,
|
||||
});
|
||||
} else {
|
||||
debug!("Failed to parse sitemap value as a URL")
|
||||
}
|
||||
} else {
|
||||
debug!("Failed to parse sitemap value as UTF-8")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let rules = cylon::Compiler::new(USER_AGENT)
|
||||
.compile(bytes.as_bytes())
|
||||
.await?;
|
||||
|
||||
Ok(Some(RobotsTxt { sitemaps, rules }))
|
||||
}
|
||||
|
43
quickpeep/src/raking/analysis.rs
Normal file
43
quickpeep/src/raking/analysis.rs
Normal file
@ -0,0 +1,43 @@
|
||||
use adblock::filters::cosmetic::CosmeticFilter;
|
||||
use anyhow::anyhow;
|
||||
use kuchiki::NodeRef;
|
||||
use log::debug;
|
||||
use std::path::Path;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
||||
|
||||
pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>(
|
||||
reader: R,
|
||||
) -> anyhow::Result<Vec<CosmeticFilter>> {
|
||||
let mut br = BufReader::new(reader);
|
||||
let mut rules = Vec::new();
|
||||
let mut buf = String::new();
|
||||
loop {
|
||||
buf.clear();
|
||||
if br.read_line(&mut buf).await? == 0 {
|
||||
break;
|
||||
}
|
||||
if let Ok(rule) = CosmeticFilter::parse(&buf, false) {
|
||||
rules.push(rule);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(rules)
|
||||
}
|
||||
|
||||
pub fn analyse_with_ad_block_cosmetic_filter(
|
||||
root: NodeRef,
|
||||
filters: &Vec<CosmeticFilter>,
|
||||
) -> anyhow::Result<bool> {
|
||||
let mut matches = 0;
|
||||
for rule in filters {
|
||||
for ele in root
|
||||
.select(&rule.selector)
|
||||
.map_err(|_| anyhow!("Failed to select(..)"))?
|
||||
{
|
||||
debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele);
|
||||
matches += 1;
|
||||
}
|
||||
}
|
||||
Ok(matches > 0)
|
||||
}
|
88
quickpeep/src/test.rs
Normal file
88
quickpeep/src/test.rs
Normal file
@ -0,0 +1,88 @@
|
||||
use crate::raking::{decode_robots_txt, rake_feed, rake_sitemap, RakeIntent, UrlRaked};
|
||||
use reqwest::Url;
|
||||
|
||||
/// Checks that `rake_sitemap` handles both plain sitemaps and sitemap indexes.
#[test]
pub fn test_sitemap() {
    // A <urlset> entry is raked as a page.
    let sm = rake_sitemap(
        br#"
<urlset><url><loc>https://example.org/index.html</loc></url></urlset>
"#,
    )
    .unwrap();
    assert_eq!(
        sm,
        vec![UrlRaked {
            url: Url::parse("https://example.org/index.html").unwrap(),
            last_changed: None,
            intent: RakeIntent::Page
        },]
    );

    // A <sitemapindex> entry is raked as another sitemap.
    let sm = rake_sitemap(
        br#"
<sitemapindex><sitemap><loc>https://example.org/sitemap.xml</loc></sitemap></sitemapindex>
"#,
    )
    .unwrap();
    assert_eq!(
        sm,
        vec![UrlRaked {
            url: Url::parse("https://example.org/sitemap.xml").unwrap(),
            last_changed: None,
            intent: RakeIntent::SiteMap
        }]
    );
}
|
||||
|
||||
/// Checks that `rake_feed` returns the link of a feed entry as a page to rake.
#[test]
pub fn test_feed() {
    let feed = rake_feed(
        br#"
<feed>
<entry>
<link href="https://example.org/index.html" />
</entry>
</feed>
"#,
        &Url::parse("https://example.org/atom.xml").unwrap(),
    )
    .unwrap();

    // The entry has neither an updated nor a published date, so last_changed is None.
    assert_eq!(
        feed,
        vec![UrlRaked {
            url: Url::parse("https://example.org/index.html").unwrap(),
            last_changed: None,
            intent: RakeIntent::Page
        },]
    );
}
|
||||
|
||||
/// Checks that `decode_robots_txt` extracts sitemap lines and compiles the access rules.
#[tokio::test]
pub async fn test_robots_txt() {
    let rtxt = decode_robots_txt(
        br#"
User-Agent: *
Disallow: /bad
Allow: /bad/abc

SiteMap: https://example.org/sitemap.xml
"#,
    )
    .await
    .unwrap()
    .unwrap();
    // The mixed-case `SiteMap:` key must still be recognised.
    assert_eq!(
        rtxt.sitemaps,
        vec![UrlRaked {
            url: Url::parse("https://example.org/sitemap.xml").unwrap(),
            last_changed: None,
            intent: RakeIntent::SiteMap
        }]
    );

    // `/bad` is disallowed, except for the explicitly allowed `/bad/abc`.
    assert!(!rtxt.rules.allow("/bad"));
    assert!(!rtxt.rules.allow("/bad/def"));
    assert!(rtxt.rules.allow("/bad/abc"));
    assert!(rtxt.rules.allow("/good"));
}
|
Loading…
Reference in New Issue
Block a user