Add more work towards raking all the different formats

Olivier 'reivilibre' 2022-03-13 12:40:04 +00:00
parent db5524eb52
commit 210e8ef10a
7 changed files with 413 additions and 45 deletions

Cargo.lock (generated): 63 lines changed

@ -196,6 +196,16 @@ dependencies = [
"syn",
]
[[package]]
name = "cylon"
version = "0.2.0"
source = "git+https://github.com/reivilibre/cylon.git?branch=rei/fix_import#12cb6861d6fbd28151bf7befede910b82436034a"
dependencies = [
"futures-util",
"serde",
"serde_derive",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -347,6 +357,23 @@ version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3"
[[package]]
name = "futures-io"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b"
[[package]]
name = "futures-macro"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.21"
@ -366,9 +393,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a"
dependencies = [
"futures-core",
"futures-io",
"futures-macro",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
@ -422,9 +453,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.11"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e"
checksum = "62eeb471aa3e3c9197aa4bfeabfe02982f6dc96f750486c0bb0009ac58b26d2b"
dependencies = [
"bytes",
"fnv",
@ -709,14 +740,15 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.0"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2"
checksum = "7ba42135c6a5917b9db9cd7b293e5409e1c6b041e6f9825e92e55a894c63b6f8"
dependencies = [
"libc",
"log",
"miow",
"ntapi",
"wasi 0.11.0+wasi-snapshot-preview1",
"winapi",
]
@ -1029,11 +1061,15 @@ version = "0.1.0"
dependencies = [
"adblock",
"anyhow",
"bytes",
"chrono",
"cylon",
"env_logger",
"feed-rs",
"gemini-fetch",
"html5ever",
"kuchiki",
"lazy_static",
"log",
"quickpeep_moz_readability",
"reqwest",
@ -1126,9 +1162,9 @@ dependencies = [
[[package]]
name = "regex"
version = "1.5.4"
version = "1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
dependencies = [
"aho-corasick",
"memchr",
@ -1179,6 +1215,7 @@ dependencies = [
"serde_urlencoded",
"tokio",
"tokio-native-tls",
"tokio-util",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
@ -1652,9 +1689,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
[[package]]
name = "tracing"
version = "0.1.31"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6c650a8ef0cd2dd93736f033d21cbd1224c5a967aa0c258d00fcf7dafef9b9f"
checksum = "4a1bdf54a7c28a2bbf701e1d2233f6c77f473486b94bee4f9678da5a148dca7f"
dependencies = [
"cfg-if",
"pin-project-lite",
@ -1663,9 +1700,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.22"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23"
checksum = "aa31669fa42c09c34d94d8165dd2012e8ff3c66aca50f3bb226b68f216f2706c"
dependencies = [
"lazy_static",
]
@ -1774,6 +1811,12 @@ version = "0.10.2+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.79"

Cargo.toml (workspace root)

@ -4,3 +4,7 @@ members = [
"quickpeep_moz_readability"
]
[patch.crates-io]
cylon = { git = "https://github.com/reivilibre/cylon.git", branch = "rei/fix_import" }

quickpeep/Cargo.toml

@ -18,6 +18,11 @@ html5ever = "0.25.1"
serde = { version = "1.0.136", features = ["derive"] }
chrono = "0.4.19"
lazy_static = "1.4.0"
bytes = "1.1.0"
# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
# rkyv = "0.7.35"
@ -25,12 +30,12 @@ serde = { version = "1.0.136", features = ["derive"] }
### Raking helpers
# HTTP Requests
reqwest = { version = "0.11.9", features = [] }
reqwest = { version = "0.11.9", features = ["stream"] }
# Gemini Requests
# N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those.
gemini-fetch = "0.2.1"
# Robots.txt
-# TODO cylon = { version = "0.2.0", features = [] }
+cylon = { version = "0.2.0", features = ["crawl-delay"] }
# RSS/Atom/JSON feeds
feed-rs = "1.0.0"
# Sitemaps
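The newly enabled `stream` feature on reqwest (consistent with reqwest gaining a `tokio-util` dependency in the Cargo.lock diff above) exposes `Response::bytes_stream()`, which is the usual way to read a body chunk by chunk rather than buffering it whole. The commit does not show a call site yet; a minimal sketch, with an invented `cap` parameter:

use futures_util::StreamExt;

// Sketch only: pull the body in chunks, refusing anything over `cap` bytes.
async fn fetch_capped(client: &reqwest::Client, url: &str, cap: usize) -> anyhow::Result<Vec<u8>> {
    let mut stream = client.get(url).send().await?.bytes_stream();
    let mut body = Vec::new();
    while let Some(chunk) = stream.next().await {
        let chunk = chunk?;
        if body.len() + chunk.len() > cap {
            anyhow::bail!("response body exceeded {} bytes", cap);
        }
        body.extend_from_slice(&chunk);
    }
    Ok(body)
}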

quickpeep/src/main.rs

@ -1,12 +1,4 @@
pub mod raking;
#[cfg(test)]
-mod test {
-pub fn test_sitemap() {
-let mut curs = std::io::Cursor::new("<url><loc>https://lol</loc></url>");
-let reader = sitemap::reader::SiteMapReader::new(curs);
-for entry in reader {
-eprintln!("{:?}", entry);
-}
-}
-}
+mod test;

quickpeep/src/raking.rs

@ -1,28 +1,58 @@
use anyhow::{bail, Context};
-use reqwest::header::HeaderValue;
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use html5ever::tendril::fmt::Slice;
use lazy_static::lazy_static;
use log::debug;
use reqwest::{Client, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use std::collections::HashSet;
mod analysis;
pub const USER_AGENT: &'static str = "QuickPeepBot";
pub enum RakeOutcome {
RakedPage(RakedPage),
-RakedFeed(RakedFeed),
-RakedSitemap(RakedSitemap),
+RakedFeed(Vec<UrlRaked>),
+RakedSitemap(Vec<UrlRaked>),
/// The page was not canonical, and should not be indexed.
/// However here is the URL of the canonical page.
NotCanonical {
new_url: Url,
},
TemporaryFailure(TemporaryFailure),
PermanentFailure(PermanentFailure),
}
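A sketch of how a raker loop might consume this enum; every helper called here (`index_page`, `enqueue`, `enqueue_url`, `requeue`, `drop_url`) is invented for illustration and does not exist in this commit:

// Illustrative only; the handlers are hypothetical.
async fn handle_outcome(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<()> {
    match rake(url, intent, client).await? {
        RakeOutcome::RakedPage(page) => index_page(page),
        RakeOutcome::RakedFeed(urls) | RakeOutcome::RakedSitemap(urls) => {
            // Feeds and sitemaps both boil down to more URLs to queue.
            for found in urls {
                enqueue(found);
            }
        }
        RakeOutcome::NotCanonical { new_url } => enqueue_url(new_url),
        RakeOutcome::TemporaryFailure(fail) => requeue(url, fail.backoff_sec),
        RakeOutcome::PermanentFailure(fail) => drop_url(url, fail.reason),
    }
    Ok(())
}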
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UrlRaked {
pub url: Url,
pub last_changed: Option<DateTime<Utc>>,
pub intent: RakeIntent,
}
pub struct RakedPage {}
-pub struct RakedFeed {}
pub struct RobotsTxt {
pub sitemaps: Vec<UrlRaked>,
pub rules: Cylon,
}
-pub struct RakedSitemap {}
-pub struct TemporaryFailure {}
+pub struct TemporaryFailure {
+pub reason: TemporaryFailureReason,
+pub backoff_sec: u32,
+}
pub struct PermanentFailure {
pub reason: PermanentFailureReason,
}
pub enum TemporaryFailureReason {
MissingInformation(String),
}
pub enum PermanentFailureReason {
ResourceDenied(u32),
WrongLanguage(String),
@ -37,6 +67,20 @@ pub enum RakeIntent {
SiteMap,
}
lazy_static! {
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
HashSet::from_iter(vec!["text/xml", "application/xml",]);
static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
"text/xml",
"application/xml",
"application/atom+xml",
"application/rss+xml",
"application/rdf+xml",
"application/json",
"application/feed+json"
]);
}
pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> {
let response = client.get(url.clone()).send().await?;
@ -49,31 +93,59 @@ pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Res
.to_str()
.context("Can't convert content-type to str")?;
eprintln!("CT {:?}", content_type);
-content_type.to_owned()
+content_type.split(";").next().unwrap().trim().to_owned()
} else {
-// TODO ???
-"text/html".to_owned()
+return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
+reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
+backoff_sec: 86400 * 7,
+}));
};
let content = response.bytes().await?;
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {}
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {
match rake_html_page(&content, url) {
Ok(page_rake) => {
return Ok(RakeOutcome::RakedPage(page_rake));
}
Err(error) => {
debug!("Failed to rake HTML page: {:?}", error);
}
}
}
// TODO JSON Feeds.
if content_type == "application/xml"
if FEED_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
-{}
+{
match rake_feed(&content, url) {
Ok(feed) => {
return Ok(RakeOutcome::RakedFeed(feed));
}
Err(error) => {
debug!("Failed to rake as feed: {:?}", error);
}
}
}
if content_type == "application/xml"
if SITEMAP_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
-{}
+{
match rake_sitemap(&content) {
Ok(sitemap) => {
return Ok(RakeOutcome::RakedSitemap(sitemap));
}
Err(error) => {
debug!("Failed to rake as sitemap: {:?}", error);
}
}
}
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
}));
}
-pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
+pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> {
let content_str = std::str::from_utf8(content)?;
let mut readability = quickpeep_moz_readability::Readability::new(content_str);
@ -87,15 +159,136 @@ pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
eprintln!("{}", node.to_string());
}
-Ok(())
+Ok(todo!())
}
-pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> {
-let x = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
-todo!()
+pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
+let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
let mut urls = Vec::new();
for entry in feed.entries {
let link = if let Some(link) = entry.links.get(0) {
link
} else {
continue;
};
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
let last_changed = entry.updated.or(entry.published);
urls.push(UrlRaked {
url,
last_changed,
intent: RakeIntent::Page,
});
}
// TODO paginated feeds (e.g. JSON Feed next_url)
Ok(urls)
}
-pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<()> {
-//let x = sitemap::
-todo!()
+pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
let curs = std::io::Cursor::new(content);
let reader = sitemap::reader::SiteMapReader::new(curs);
let mut urls = Vec::new();
for entry in reader {
match &entry {
SiteMapEntity::Url(url) => {
let loc = if let Some(loc) = url.loc.get_url() {
loc
} else {
continue;
};
urls.push(UrlRaked {
url: loc,
last_changed: url
.lastmod
.get_time()
.map(|dt: DateTime<FixedOffset>| dt.into()),
intent: RakeIntent::Page,
});
}
SiteMapEntity::SiteMap(sitemap) => {
let loc = if let Some(loc) = sitemap.loc.get_url() {
loc
} else {
continue;
};
urls.push(UrlRaked {
url: loc,
last_changed: sitemap
.lastmod
.get_time()
.map(|dt: DateTime<FixedOffset>| dt.into()),
intent: RakeIntent::SiteMap,
});
}
SiteMapEntity::Err(error) => {
debug!("Sitemap error {:?}", error);
}
}
eprintln!("{:?}", entry);
}
if urls.is_empty() {
bail!("No URLs or Sitemaps picked up from sitemap; is it bad?");
}
Ok(urls)
}
pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> {
let robots_url = url
.join("/robots.txt")
.context("Whilst resolving /robots.txt on URL")?;
let resp = client.get(robots_url.clone()).send().await?;
if !resp.status().is_success() {
let code = resp.status().as_u16();
if code == 404 || code == 410 {
// not found or gone? Assume there is intentionally no robots.txt file.
return Ok(None);
}
bail!("Failed to get {:?}: {:?}", robots_url, resp.status());
}
let bytes = resp.bytes().await?;
Ok(decode_robots_txt(&bytes).await?)
}
pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> {
let mut sitemaps = Vec::new();
for line in bytes.split(|b| *b == b'\n') {
// Match the directive case-insensitively, but slice the original bytes so the URL's case is preserved.
let lower = line.to_ascii_lowercase();
if lower.starts_with(b"sitemap:") {
if let Ok(value) = std::str::from_utf8(&line[8..]) {
if let Ok(url) = Url::parse(value.trim()) {
sitemaps.push(UrlRaked {
url,
last_changed: None,
intent: RakeIntent::SiteMap,
});
} else {
debug!("Failed to parse sitemap value as a URL")
}
} else {
debug!("Failed to parse sitemap value as UTF-8")
}
}
}
let rules = cylon::Compiler::new(USER_AGENT)
.compile(bytes.as_bytes())
.await?;
Ok(Some(RobotsTxt { sitemaps, rules }))
}
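Taken together, the intended flow for a polite raker looks roughly like this; a sketch only, where `enqueue` is an imagined queue operation and `Cylon::allow` is the same call the tests below exercise:

// Sketch of the flow this file implies; `enqueue` is hypothetical.
async fn rake_politely(url: &Url, client: &Client) -> anyhow::Result<Option<RakeOutcome>> {
    if let Some(robots) = get_robots_txt_for(url, client).await? {
        if !robots.rules.allow(url.path()) {
            // Denied by robots.txt: don't fetch at all.
            return Ok(None);
        }
        for sitemap in robots.sitemaps {
            enqueue(sitemap); // each UrlRaked carries RakeIntent::SiteMap
        }
    }
    Ok(Some(rake(url, RakeIntent::Any, client).await?))
}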

quickpeep/src/raking/analysis.rs (new file): 43 lines added

@ -0,0 +1,43 @@
use adblock::filters::cosmetic::CosmeticFilter;
use anyhow::anyhow;
use kuchiki::NodeRef;
use log::debug;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>(
reader: R,
) -> anyhow::Result<Vec<CosmeticFilter>> {
let mut br = BufReader::new(reader);
let mut rules = Vec::new();
let mut buf = String::new();
loop {
buf.clear();
if br.read_line(&mut buf).await? == 0 {
break;
}
// read_line() keeps the trailing newline; trim it so selectors parse cleanly.
if let Ok(rule) = CosmeticFilter::parse(buf.trim_end(), false) {
rules.push(rule);
}
}
Ok(rules)
}
pub fn analyse_with_ad_block_cosmetic_filter(
root: NodeRef,
filters: &[CosmeticFilter],
) -> anyhow::Result<bool> {
let mut matches = 0;
for rule in filters {
for ele in root
.select(&rule.selector)
.map_err(|_| anyhow!("Failed to select(..)"))?
{
debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele);
matches += 1;
}
}
Ok(matches > 0)
}
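A sketch of how these two helpers compose: load an EasyList-style cosmetic filter list once, parse a page with kuchiki, and flag it if any selector matches. The filter file path here is an assumption, not part of this commit:

use kuchiki::traits::TendrilSink;

// Sketch; "easylist_cosmetic.txt" is an assumed path.
async fn has_ad_markers(html: &str) -> anyhow::Result<bool> {
    let file = tokio::fs::File::open("easylist_cosmetic.txt").await?;
    let filters = load_cosmetic_filters(file).await?;
    let root = kuchiki::parse_html().one(html);
    analyse_with_ad_block_cosmetic_filter(root, &filters)
}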

quickpeep/src/test.rs (new file): 88 lines added

@ -0,0 +1,88 @@
use crate::raking::{decode_robots_txt, rake_feed, rake_sitemap, RakeIntent, UrlRaked};
use reqwest::Url;
#[test]
pub fn test_sitemap() {
let sm = rake_sitemap(
br#"
<urlset><url><loc>https://example.org/index.html</loc></url></urlset>
"#,
)
.unwrap();
assert_eq!(
sm,
vec![UrlRaked {
url: Url::parse("https://example.org/index.html").unwrap(),
last_changed: None,
intent: RakeIntent::Page
},]
);
let sm = rake_sitemap(
br#"
<sitemapindex><sitemap><loc>https://example.org/sitemap.xml</loc></sitemap></sitemapindex>
"#,
)
.unwrap();
assert_eq!(
sm,
vec![UrlRaked {
url: Url::parse("https://example.org/sitemap.xml").unwrap(),
last_changed: None,
intent: RakeIntent::SiteMap
}]
);
}
#[test]
pub fn test_feed() {
let feed = rake_feed(
br#"
<feed>
<entry>
<link href="https://example.org/index.html" />
</entry>
</feed>
"#,
&Url::parse("https://example.org/atom.xml").unwrap(),
)
.unwrap();
assert_eq!(
feed,
vec![UrlRaked {
url: Url::parse("https://example.org/index.html").unwrap(),
last_changed: None,
intent: RakeIntent::Page
},]
);
}
#[tokio::test]
pub async fn test_robots_txt() {
let rtxt = decode_robots_txt(
br#"
User-Agent: *
Disallow: /bad
Allow: /bad/abc
SiteMap: https://example.org/sitemap.xml
"#,
)
.await
.unwrap()
.unwrap();
assert_eq!(
rtxt.sitemaps,
vec![UrlRaked {
url: Url::parse("https://example.org/sitemap.xml").unwrap(),
last_changed: None,
intent: RakeIntent::SiteMap
}]
);
assert!(!rtxt.rules.allow("/bad"));
assert!(!rtxt.rules.allow("/bad/def"));
assert!(rtxt.rules.allow("/bad/abc"));
assert!(rtxt.rules.allow("/good"));
}