Add more work towards raking all the different formats

Olivier 'reivilibre' 2022-03-13 12:40:04 +00:00
parent db5524eb52
commit 210e8ef10a
7 changed files with 413 additions and 45 deletions

Cargo.lock (generated): 63 lines changed

@ -196,6 +196,16 @@ dependencies = [
"syn",
]
[[package]]
name = "cylon"
version = "0.2.0"
source = "git+https://github.com/reivilibre/cylon.git?branch=rei/fix_import#12cb6861d6fbd28151bf7befede910b82436034a"
dependencies = [
"futures-util",
"serde",
"serde_derive",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -347,6 +357,23 @@ version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3"
[[package]]
name = "futures-io"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b"
[[package]]
name = "futures-macro"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.21"
@ -366,9 +393,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a"
dependencies = [
"futures-core",
"futures-io",
"futures-macro",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
@ -422,9 +453,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.11"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e"
checksum = "62eeb471aa3e3c9197aa4bfeabfe02982f6dc96f750486c0bb0009ac58b26d2b"
dependencies = [
"bytes",
"fnv",
@ -709,14 +740,15 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.0"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2"
checksum = "7ba42135c6a5917b9db9cd7b293e5409e1c6b041e6f9825e92e55a894c63b6f8"
dependencies = [
"libc",
"log",
"miow",
"ntapi",
"wasi 0.11.0+wasi-snapshot-preview1",
"winapi",
]
@ -1029,11 +1061,15 @@ version = "0.1.0"
dependencies = [
"adblock",
"anyhow",
"bytes",
"chrono",
"cylon",
"env_logger",
"feed-rs",
"gemini-fetch",
"html5ever",
"kuchiki",
"lazy_static",
"log",
"quickpeep_moz_readability",
"reqwest",
@ -1126,9 +1162,9 @@ dependencies = [
[[package]]
name = "regex"
version = "1.5.4"
version = "1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
dependencies = [
"aho-corasick",
"memchr",
@ -1179,6 +1215,7 @@ dependencies = [
"serde_urlencoded",
"tokio",
"tokio-native-tls",
"tokio-util",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
@ -1652,9 +1689,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
[[package]]
name = "tracing"
version = "0.1.31"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6c650a8ef0cd2dd93736f033d21cbd1224c5a967aa0c258d00fcf7dafef9b9f"
checksum = "4a1bdf54a7c28a2bbf701e1d2233f6c77f473486b94bee4f9678da5a148dca7f"
dependencies = [
"cfg-if",
"pin-project-lite",
@ -1663,9 +1700,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.22"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23"
checksum = "aa31669fa42c09c34d94d8165dd2012e8ff3c66aca50f3bb226b68f216f2706c"
dependencies = [
"lazy_static",
]
@ -1774,6 +1811,12 @@ version = "0.10.2+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.79"

Cargo.toml (workspace root)

@ -4,3 +4,7 @@ members = [
"quickpeep_moz_readability"
]
[patch.crates-io]
cylon = { git = "https://github.com/reivilibre/cylon.git", branch = "rei/fix_import" }

quickpeep/Cargo.toml

@ -18,6 +18,11 @@ html5ever = "0.25.1"
serde = { version = "1.0.136", features = ["derive"] }
chrono = "0.4.19"
lazy_static = "1.4.0"
bytes = "1.1.0"
# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
# rkyv = "0.7.35"
@ -25,12 +30,12 @@ serde = { version = "1.0.136", features = ["derive"] }
### Raking helpers
# HTTP Requests
reqwest = { version = "0.11.9", features = [] }
reqwest = { version = "0.11.9", features = ["stream"] }
# Gemini Requests
# N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those.
gemini-fetch = "0.2.1"
# Robots.txt
-# TODO cylon = { version = "0.2.0", features = [] }
+cylon = { version = "0.2.0", features = ["crawl-delay"] }
# RSS/Atom/JSON feeds
feed-rs = "1.0.0"
# Sitemaps
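The newly enabled `stream` feature on reqwest (consistent with reqwest gaining a `tokio-util` dependency in the Cargo.lock diff above) exposes `Response::bytes_stream()`, which is the usual way to read a body chunk by chunk rather than buffering it whole. The commit does not show a call site yet; a minimal sketch, with an invented `cap` parameter:

use futures_util::StreamExt;

// Sketch only: pull the body in chunks, refusing anything over `cap` bytes.
async fn fetch_capped(client: &reqwest::Client, url: &str, cap: usize) -> anyhow::Result<Vec<u8>> {
    let mut stream = client.get(url).send().await?.bytes_stream();
    let mut body = Vec::new();
    while let Some(chunk) = stream.next().await {
        let chunk = chunk?;
        if body.len() + chunk.len() > cap {
            anyhow::bail!("response body exceeded {} bytes", cap);
        }
        body.extend_from_slice(&chunk);
    }
    Ok(body)
}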

quickpeep/src/main.rs

@ -1,12 +1,4 @@
pub mod raking;
#[cfg(test)]
-mod test {
-pub fn test_sitemap() {
-let mut curs = std::io::Cursor::new("<url><loc>https://lol</loc></url>");
-let reader = sitemap::reader::SiteMapReader::new(curs);
-for entry in reader {
-eprintln!("{:?}", entry);
-}
-}
-}
+mod test;

quickpeep/src/raking.rs

@ -1,28 +1,58 @@
use anyhow::{bail, Context};
-use reqwest::header::HeaderValue;
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use html5ever::tendril::fmt::Slice;
use lazy_static::lazy_static;
use log::debug;
use reqwest::{Client, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use std::collections::HashSet;
mod analysis;
pub const USER_AGENT: &'static str = "QuickPeepBot";
pub enum RakeOutcome {
RakedPage(RakedPage),
-RakedFeed(RakedFeed),
-RakedSitemap(RakedSitemap),
+RakedFeed(Vec<UrlRaked>),
+RakedSitemap(Vec<UrlRaked>),
/// The page was not canonical, and should not be indexed.
/// However here is the URL of the canonical page.
NotCanonical {
new_url: Url,
},
TemporaryFailure(TemporaryFailure),
PermanentFailure(PermanentFailure),
}
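A sketch of how a raker loop might consume this enum; every helper called here (`index_page`, `enqueue`, `enqueue_url`, `requeue`, `drop_url`) is invented for illustration and does not exist in this commit:

// Illustrative only; the handlers are hypothetical.
async fn handle_outcome(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<()> {
    match rake(url, intent, client).await? {
        RakeOutcome::RakedPage(page) => index_page(page),
        RakeOutcome::RakedFeed(urls) | RakeOutcome::RakedSitemap(urls) => {
            // Feeds and sitemaps both boil down to more URLs to queue.
            for found in urls {
                enqueue(found);
            }
        }
        RakeOutcome::NotCanonical { new_url } => enqueue_url(new_url),
        RakeOutcome::TemporaryFailure(fail) => requeue(url, fail.backoff_sec),
        RakeOutcome::PermanentFailure(fail) => drop_url(url, fail.reason),
    }
    Ok(())
}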
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UrlRaked {
pub url: Url,
pub last_changed: Option<DateTime<Utc>>,
pub intent: RakeIntent,
}
pub struct RakedPage {}
-pub struct RakedFeed {}
pub struct RobotsTxt {
pub sitemaps: Vec<UrlRaked>,
pub rules: Cylon,
}
-pub struct RakedSitemap {}
-pub struct TemporaryFailure {}
+pub struct TemporaryFailure {
+pub reason: TemporaryFailureReason,
+pub backoff_sec: u32,
+}
pub struct PermanentFailure {
pub reason: PermanentFailureReason,
}
pub enum TemporaryFailureReason {
MissingInformation(String),
}
pub enum PermanentFailureReason {
ResourceDenied(u32),
WrongLanguage(String),
@ -37,6 +67,20 @@ pub enum RakeIntent {
SiteMap,
}
lazy_static! {
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
HashSet::from_iter(vec!["text/xml", "application/xml",]);
static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
"text/xml",
"application/xml",
"application/atom+xml",
"application/rss+xml",
"application/rdf+xml",
"application/json",
"application/feed+json"
]);
}
pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> {
let response = client.get(url.clone()).send().await?;
@ -49,31 +93,59 @@ pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Res
.to_str()
.context("Can't convert content-type to str")?;
eprintln!("CT {:?}", content_type);
-content_type.to_owned()
+content_type.split(";").next().unwrap().trim().to_owned()
} else {
-// TODO ???
-"text/html".to_owned()
+return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
+reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
+backoff_sec: 86400 * 7,
+}));
};
let content = response.bytes().await?;
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {}
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {
match rake_html_page(&content, url) {
Ok(page_rake) => {
return Ok(RakeOutcome::RakedPage(page_rake));
}
Err(error) => {
debug!("Failed to rake HTML page: {:?}", error);
}
}
}
// TODO JSON Feeds.
if content_type == "application/xml"
if FEED_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
-{}
+{
match rake_feed(&content, url) {
Ok(feed) => {
return Ok(RakeOutcome::RakedFeed(feed));
}
Err(error) => {
debug!("Failed to rake as feed: {:?}", error);
}
}
}
if content_type == "application/xml"
if SITEMAP_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
-{}
+{
match rake_sitemap(&content) {
Ok(sitemap) => {
return Ok(RakeOutcome::RakedSitemap(sitemap));
}
Err(error) => {
debug!("Failed to rake as sitemap: {:?}", error);
}
}
}
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
}));
}
-pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
+pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> {
let content_str = std::str::from_utf8(content)?;
let mut readability = quickpeep_moz_readability::Readability::new(content_str);
@ -87,15 +159,136 @@ pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
eprintln!("{}", node.to_string());
}
-Ok(())
+Ok(todo!())
}
-pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> {
-let x = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
-todo!()
+pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
+let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
let mut urls = Vec::new();
for entry in feed.entries {
let link = if let Some(link) = entry.links.get(0) {
link
} else {
continue;
};
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
let last_changed = entry.updated.or(entry.published);
urls.push(UrlRaked {
url,
last_changed,
intent: RakeIntent::Page,
});
}
// TODO paginated feeds (e.g. JSON Feed next_url)
Ok(urls)
}
-pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<()> {
-//let x = sitemap::
-todo!()
+pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
let curs = std::io::Cursor::new(content);
let reader = sitemap::reader::SiteMapReader::new(curs);
let mut urls = Vec::new();
for entry in reader {
match &entry {
SiteMapEntity::Url(url) => {
let loc = if let Some(loc) = url.loc.get_url() {
loc
} else {
continue;
};
urls.push(UrlRaked {
url: loc,
last_changed: url
.lastmod
.get_time()
.map(|dt: DateTime<FixedOffset>| dt.into()),
intent: RakeIntent::Page,
});
}
SiteMapEntity::SiteMap(sitemap) => {
let loc = if let Some(loc) = sitemap.loc.get_url() {
loc
} else {
continue;
};
urls.push(UrlRaked {
url: loc,
last_changed: sitemap
.lastmod
.get_time()
.map(|dt: DateTime<FixedOffset>| dt.into()),
intent: RakeIntent::SiteMap,
});
}
SiteMapEntity::Err(error) => {
debug!("Sitemap error {:?}", error);
}
}
eprintln!("{:?}", entry);
}
if urls.is_empty() {
bail!("No URLs or Sitemaps picked up from sitemap; is it bad?");
}
Ok(urls)
}
pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> {
let robots_url = url
.join("/robots.txt")
.context("Whilst resolving /robots.txt on URL")?;
let resp = client.get(robots_url.clone()).send().await?;
if !resp.status().is_success() {
let code = resp.status().as_u16();
if code == 404 || code == 410 {
// not found or gone? Assume there is intentionally no robots.txt file.
return Ok(None);
}
bail!("Failed to get {:?}: {:?}", robots_url, resp.status());
}
let bytes = resp.bytes().await?;
Ok(decode_robots_txt(&bytes).await?)
}
pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> {
let mut sitemaps = Vec::new();
for line in bytes.split(|b| *b == b'\n') {
// Match the directive case-insensitively, but slice the original bytes so the URL's case is preserved.
let lower = line.to_ascii_lowercase();
if lower.starts_with(b"sitemap:") {
if let Ok(value) = std::str::from_utf8(&line[8..]) {
if let Ok(url) = Url::parse(value.trim()) {
sitemaps.push(UrlRaked {
url,
last_changed: None,
intent: RakeIntent::SiteMap,
});
} else {
debug!("Failed to parse sitemap value as a URL")
}
} else {
debug!("Failed to parse sitemap value as UTF-8")
}
}
}
let rules = cylon::Compiler::new(USER_AGENT)
.compile(bytes.as_bytes())
.await?;
Ok(Some(RobotsTxt { sitemaps, rules }))
}
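Taken together, the intended flow for a polite raker looks roughly like this; a sketch only, where `enqueue` is an imagined queue operation and `Cylon::allow` is the same call the tests below exercise:

// Sketch of the flow this file implies; `enqueue` is hypothetical.
async fn rake_politely(url: &Url, client: &Client) -> anyhow::Result<Option<RakeOutcome>> {
    if let Some(robots) = get_robots_txt_for(url, client).await? {
        if !robots.rules.allow(url.path()) {
            // Denied by robots.txt: don't fetch at all.
            return Ok(None);
        }
        for sitemap in robots.sitemaps {
            enqueue(sitemap); // each UrlRaked carries RakeIntent::SiteMap
        }
    }
    Ok(Some(rake(url, RakeIntent::Any, client).await?))
}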

quickpeep/src/raking/analysis.rs (new file): 43 lines added

@ -0,0 +1,43 @@
use adblock::filters::cosmetic::CosmeticFilter;
use anyhow::anyhow;
use kuchiki::NodeRef;
use log::debug;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>(
reader: R,
) -> anyhow::Result<Vec<CosmeticFilter>> {
let mut br = BufReader::new(reader);
let mut rules = Vec::new();
let mut buf = String::new();
loop {
buf.clear();
if br.read_line(&mut buf).await? == 0 {
break;
}
// read_line() keeps the trailing newline; trim it so selectors parse cleanly.
if let Ok(rule) = CosmeticFilter::parse(buf.trim_end(), false) {
rules.push(rule);
}
}
Ok(rules)
}
pub fn analyse_with_ad_block_cosmetic_filter(
root: NodeRef,
filters: &[CosmeticFilter],
) -> anyhow::Result<bool> {
let mut matches = 0;
for rule in filters {
for ele in root
.select(&rule.selector)
.map_err(|_| anyhow!("Failed to select(..)"))?
{
debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele);
matches += 1;
}
}
Ok(matches > 0)
}
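A sketch of how these two helpers compose: load an EasyList-style cosmetic filter list once, parse a page with kuchiki, and flag it if any selector matches. The filter file path here is an assumption, not part of this commit:

use kuchiki::traits::TendrilSink;

// Sketch; "easylist_cosmetic.txt" is an assumed path.
async fn has_ad_markers(html: &str) -> anyhow::Result<bool> {
    let file = tokio::fs::File::open("easylist_cosmetic.txt").await?;
    let filters = load_cosmetic_filters(file).await?;
    let root = kuchiki::parse_html().one(html);
    analyse_with_ad_block_cosmetic_filter(root, &filters)
}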

quickpeep/src/test.rs (new file): 88 lines added

@ -0,0 +1,88 @@
use crate::raking::{decode_robots_txt, rake_feed, rake_sitemap, RakeIntent, UrlRaked};
use reqwest::Url;
#[test]
pub fn test_sitemap() {
let sm = rake_sitemap(
br#"
<urlset><url><loc>https://example.org/index.html</loc></url></urlset>
"#,
)
.unwrap();
assert_eq!(
sm,
vec![UrlRaked {
url: Url::parse("https://example.org/index.html").unwrap(),
last_changed: None,
intent: RakeIntent::Page
},]
);
let sm = rake_sitemap(
br#"
<sitemapindex><sitemap><loc>https://example.org/sitemap.xml</loc></sitemap></sitemapindex>
"#,
)
.unwrap();
assert_eq!(
sm,
vec![UrlRaked {
url: Url::parse("https://example.org/sitemap.xml").unwrap(),
last_changed: None,
intent: RakeIntent::SiteMap
}]
);
}
#[test]
pub fn test_feed() {
let feed = rake_feed(
br#"
<feed>
<entry>
<link href="https://example.org/index.html" />
</entry>
</feed>
"#,
&Url::parse("https://example.org/atom.xml").unwrap(),
)
.unwrap();
assert_eq!(
feed,
vec![UrlRaked {
url: Url::parse("https://example.org/index.html").unwrap(),
last_changed: None,
intent: RakeIntent::Page
},]
);
}
#[tokio::test]
pub async fn test_robots_txt() {
let rtxt = decode_robots_txt(
br#"
User-Agent: *
Disallow: /bad
Allow: /bad/abc
SiteMap: https://example.org/sitemap.xml
"#,
)
.await
.unwrap()
.unwrap();
assert_eq!(
rtxt.sitemaps,
vec![UrlRaked {
url: Url::parse("https://example.org/sitemap.xml").unwrap(),
last_changed: None,
intent: RakeIntent::SiteMap
}]
);
assert!(!rtxt.rules.allow("/bad"));
assert!(!rtxt.rules.allow("/bad/def"));
assert!(rtxt.rules.allow("/bad/abc"));
assert!(rtxt.rules.allow("/good"));
}