Add more work towards raking all the different formats
This commit is contained in:
		
							parent
							
								
									db5524eb52
								
							
						
					
					
						commit
						210e8ef10a
					
				
							
								
								
									
										63
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										63
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @ -196,6 +196,16 @@ dependencies = [ | |||||||
|  "syn", |  "syn", | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "cylon" | ||||||
|  | version = "0.2.0" | ||||||
|  | source = "git+https://github.com/reivilibre/cylon.git?branch=rei/fix_import#12cb6861d6fbd28151bf7befede910b82436034a" | ||||||
|  | dependencies = [ | ||||||
|  |  "futures-util", | ||||||
|  |  "serde", | ||||||
|  |  "serde_derive", | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "derive_more" | name = "derive_more" | ||||||
| version = "0.99.17" | version = "0.99.17" | ||||||
| @ -347,6 +357,23 @@ version = "0.3.21" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" | checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "futures-io" | ||||||
|  | version = "0.3.21" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" | ||||||
|  | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "futures-macro" | ||||||
|  | version = "0.3.21" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" | ||||||
|  | dependencies = [ | ||||||
|  |  "proc-macro2", | ||||||
|  |  "quote", | ||||||
|  |  "syn", | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "futures-sink" | name = "futures-sink" | ||||||
| version = "0.3.21" | version = "0.3.21" | ||||||
| @ -366,9 +393,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | |||||||
| checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" | checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "futures-core", |  "futures-core", | ||||||
|  |  "futures-io", | ||||||
|  |  "futures-macro", | ||||||
|  "futures-task", |  "futures-task", | ||||||
|  |  "memchr", | ||||||
|  "pin-project-lite", |  "pin-project-lite", | ||||||
|  "pin-utils", |  "pin-utils", | ||||||
|  |  "slab", | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| [[package]] | [[package]] | ||||||
| @ -422,9 +453,9 @@ dependencies = [ | |||||||
| 
 | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "h2" | name = "h2" | ||||||
| version = "0.3.11" | version = "0.3.12" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e" | checksum = "62eeb471aa3e3c9197aa4bfeabfe02982f6dc96f750486c0bb0009ac58b26d2b" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bytes", |  "bytes", | ||||||
|  "fnv", |  "fnv", | ||||||
| @ -709,14 +740,15 @@ dependencies = [ | |||||||
| 
 | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "mio" | name = "mio" | ||||||
| version = "0.8.0" | version = "0.8.1" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2" | checksum = "7ba42135c6a5917b9db9cd7b293e5409e1c6b041e6f9825e92e55a894c63b6f8" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "libc", |  "libc", | ||||||
|  "log", |  "log", | ||||||
|  "miow", |  "miow", | ||||||
|  "ntapi", |  "ntapi", | ||||||
|  |  "wasi 0.11.0+wasi-snapshot-preview1", | ||||||
|  "winapi", |  "winapi", | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| @ -1029,11 +1061,15 @@ version = "0.1.0" | |||||||
| dependencies = [ | dependencies = [ | ||||||
|  "adblock", |  "adblock", | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  |  "bytes", | ||||||
|  |  "chrono", | ||||||
|  |  "cylon", | ||||||
|  "env_logger", |  "env_logger", | ||||||
|  "feed-rs", |  "feed-rs", | ||||||
|  "gemini-fetch", |  "gemini-fetch", | ||||||
|  "html5ever", |  "html5ever", | ||||||
|  "kuchiki", |  "kuchiki", | ||||||
|  |  "lazy_static", | ||||||
|  "log", |  "log", | ||||||
|  "quickpeep_moz_readability", |  "quickpeep_moz_readability", | ||||||
|  "reqwest", |  "reqwest", | ||||||
| @ -1126,9 +1162,9 @@ dependencies = [ | |||||||
| 
 | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "regex" | name = "regex" | ||||||
| version = "1.5.4" | version = "1.5.5" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" | checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "aho-corasick", |  "aho-corasick", | ||||||
|  "memchr", |  "memchr", | ||||||
| @ -1179,6 +1215,7 @@ dependencies = [ | |||||||
|  "serde_urlencoded", |  "serde_urlencoded", | ||||||
|  "tokio", |  "tokio", | ||||||
|  "tokio-native-tls", |  "tokio-native-tls", | ||||||
|  |  "tokio-util", | ||||||
|  "url", |  "url", | ||||||
|  "wasm-bindgen", |  "wasm-bindgen", | ||||||
|  "wasm-bindgen-futures", |  "wasm-bindgen-futures", | ||||||
| @ -1652,9 +1689,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" | |||||||
| 
 | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "tracing" | name = "tracing" | ||||||
| version = "0.1.31" | version = "0.1.32" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "f6c650a8ef0cd2dd93736f033d21cbd1224c5a967aa0c258d00fcf7dafef9b9f" | checksum = "4a1bdf54a7c28a2bbf701e1d2233f6c77f473486b94bee4f9678da5a148dca7f" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "cfg-if", |  "cfg-if", | ||||||
|  "pin-project-lite", |  "pin-project-lite", | ||||||
| @ -1663,9 +1700,9 @@ dependencies = [ | |||||||
| 
 | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "tracing-core" | name = "tracing-core" | ||||||
| version = "0.1.22" | version = "0.1.23" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23" | checksum = "aa31669fa42c09c34d94d8165dd2012e8ff3c66aca50f3bb226b68f216f2706c" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "lazy_static", |  "lazy_static", | ||||||
| ] | ] | ||||||
| @ -1774,6 +1811,12 @@ version = "0.10.2+wasi-snapshot-preview1" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" | checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "wasi" | ||||||
|  | version = "0.11.0+wasi-snapshot-preview1" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "wasm-bindgen" | name = "wasm-bindgen" | ||||||
| version = "0.2.79" | version = "0.2.79" | ||||||
|  | |||||||
| @ -4,3 +4,7 @@ members = [ | |||||||
|     "quickpeep_moz_readability" |     "quickpeep_moz_readability" | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | [patch.crates-io] | ||||||
|  | cylon = { git = "https://github.com/reivilibre/cylon.git", branch = "rei/fix_import" } | ||||||
|  | 
 | ||||||
|  | |||||||
| @ -18,6 +18,11 @@ html5ever = "0.25.1" | |||||||
| 
 | 
 | ||||||
| serde = { version = "1.0.136", features = ["derive"] } | serde = { version = "1.0.136", features = ["derive"] } | ||||||
| 
 | 
 | ||||||
|  | chrono = "0.4.19" | ||||||
|  | 
 | ||||||
|  | lazy_static = "1.4.0" | ||||||
|  | 
 | ||||||
|  | bytes = "1.1.0" | ||||||
| 
 | 
 | ||||||
| # TODO: rkyv and memmap2 should be an efficient way to load index packs into processes. | # TODO: rkyv and memmap2 should be an efficient way to load index packs into processes. | ||||||
| # rkyv = "0.7.35" | # rkyv = "0.7.35" | ||||||
| @ -25,12 +30,12 @@ serde = { version = "1.0.136", features = ["derive"] } | |||||||
| 
 | 
 | ||||||
| ### Raking helpers | ### Raking helpers | ||||||
| # HTTP Requests | # HTTP Requests | ||||||
| reqwest = { version = "0.11.9", features = [] } | reqwest = { version = "0.11.9", features = ["stream"] } | ||||||
| # Gemini Requests | # Gemini Requests | ||||||
| # N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those. | # N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those. | ||||||
| gemini-fetch = "0.2.1" | gemini-fetch = "0.2.1" | ||||||
| # Robots.txt | # Robots.txt | ||||||
| # TODO cylon = { version = "0.2.0", features = [] } | cylon = { version = "0.2.0", features = ["crawl-delay"] } | ||||||
| # RSS/Atom/JSON feeds | # RSS/Atom/JSON feeds | ||||||
| feed-rs = "1.0.0" | feed-rs = "1.0.0" | ||||||
| # Sitemaps | # Sitemaps | ||||||
|  | |||||||
| @ -1,12 +1,4 @@ | |||||||
| pub mod raking; | pub mod raking; | ||||||
| 
 | 
 | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod test { | mod test; | ||||||
|     pub fn test_sitemap() { |  | ||||||
|         let mut curs = std::io::Cursor::new("<url><loc>https://lol</loc></url>"); |  | ||||||
|         let reader = sitemap::reader::SiteMapReader::new(curs); |  | ||||||
|         for entry in reader { |  | ||||||
|             eprintln!("{:?}", entry); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  | |||||||
| @ -1,28 +1,58 @@ | |||||||
| use anyhow::{bail, Context}; | use anyhow::{bail, Context}; | ||||||
| use reqwest::header::HeaderValue; | use chrono::{DateTime, FixedOffset, Utc}; | ||||||
|  | use cylon::Cylon; | ||||||
|  | use html5ever::tendril::fmt::Slice; | ||||||
|  | use lazy_static::lazy_static; | ||||||
|  | use log::debug; | ||||||
| use reqwest::{Client, Url}; | use reqwest::{Client, Url}; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  | use sitemap::reader::SiteMapEntity; | ||||||
|  | use std::collections::HashSet; | ||||||
|  | 
 | ||||||
|  | mod analysis; | ||||||
|  | 
 | ||||||
|  | pub const USER_AGENT: &'static str = "QuickPeepBot"; | ||||||
| 
 | 
 | ||||||
| pub enum RakeOutcome { | pub enum RakeOutcome { | ||||||
|     RakedPage(RakedPage), |     RakedPage(RakedPage), | ||||||
|     RakedFeed(RakedFeed), |     RakedFeed(Vec<UrlRaked>), | ||||||
|     RakedSitemap(RakedSitemap), |     RakedSitemap(Vec<UrlRaked>), | ||||||
|  |     /// The page was not canonical, and should not be indexed.
 | ||||||
|  |     /// However here is the URL of the canonical page.
 | ||||||
|  |     NotCanonical { | ||||||
|  |         new_url: Url, | ||||||
|  |     }, | ||||||
|     TemporaryFailure(TemporaryFailure), |     TemporaryFailure(TemporaryFailure), | ||||||
|     PermanentFailure(PermanentFailure), |     PermanentFailure(PermanentFailure), | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | #[derive(Clone, Debug, PartialEq, Eq)] | ||||||
|  | pub struct UrlRaked { | ||||||
|  |     pub url: Url, | ||||||
|  |     pub last_changed: Option<DateTime<Utc>>, | ||||||
|  |     pub intent: RakeIntent, | ||||||
|  | } | ||||||
|  | 
 | ||||||
| pub struct RakedPage {} | pub struct RakedPage {} | ||||||
| 
 | 
 | ||||||
| pub struct RakedFeed {} | pub struct RobotsTxt { | ||||||
|  |     pub sitemaps: Vec<UrlRaked>, | ||||||
|  |     pub rules: Cylon, | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| pub struct RakedSitemap {} | pub struct TemporaryFailure { | ||||||
| 
 |     pub reason: TemporaryFailureReason, | ||||||
| pub struct TemporaryFailure {} |     pub backoff_sec: u32, | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| pub struct PermanentFailure { | pub struct PermanentFailure { | ||||||
|     pub reason: PermanentFailureReason, |     pub reason: PermanentFailureReason, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | pub enum TemporaryFailureReason { | ||||||
|  |     MissingInformation(String), | ||||||
|  | } | ||||||
|  | 
 | ||||||
| pub enum PermanentFailureReason { | pub enum PermanentFailureReason { | ||||||
|     ResourceDenied(u32), |     ResourceDenied(u32), | ||||||
|     WrongLanguage(String), |     WrongLanguage(String), | ||||||
| @ -37,6 +67,20 @@ pub enum RakeIntent { | |||||||
|     SiteMap, |     SiteMap, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | lazy_static! { | ||||||
|  |     static ref SITEMAP_MIME_TYPES: HashSet<&'static str> = | ||||||
|  |         HashSet::from_iter(vec!["text/xml", "application/xml",]); | ||||||
|  |     static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![ | ||||||
|  |         "text/xml", | ||||||
|  |         "application/xml", | ||||||
|  |         "application/atom+xml", | ||||||
|  |         "application/rss+xml", | ||||||
|  |         "application/rdf+xml", | ||||||
|  |         "application/json", | ||||||
|  |         "application/feed+json" | ||||||
|  |     ]); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> { | pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> { | ||||||
|     let response = client.get(url.clone()).send().await?; |     let response = client.get(url.clone()).send().await?; | ||||||
| 
 | 
 | ||||||
| @ -49,31 +93,59 @@ pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Res | |||||||
|             .to_str() |             .to_str() | ||||||
|             .context("Can't convert content-type to str")?; |             .context("Can't convert content-type to str")?; | ||||||
|         eprintln!("CT {:?}", content_type); |         eprintln!("CT {:?}", content_type); | ||||||
|         content_type.to_owned() |         content_type.split(";").next().unwrap().trim().to_owned() | ||||||
|     } else { |     } else { | ||||||
|         // TODO ???
 |         return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure { | ||||||
|         "text/html".to_owned() |             reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()), | ||||||
|  |             backoff_sec: 86400 * 7, | ||||||
|  |         })); | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     let content = response.bytes().await?; |     let content = response.bytes().await?; | ||||||
| 
 | 
 | ||||||
|     if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {} |     if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) { | ||||||
|  |         match rake_html_page(&content, url) { | ||||||
|  |             Ok(page_rake) => { | ||||||
|  |                 return Ok(RakeOutcome::RakedPage(page_rake)); | ||||||
|  |             } | ||||||
|  |             Err(error) => { | ||||||
|  |                 debug!("Failed to rake HTML page: {:?}", error); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     // TODO JSON Feeds.
 |     if FEED_MIME_TYPES.contains(content_type.as_str()) | ||||||
|     if content_type == "application/xml" |  | ||||||
|         && (intent == RakeIntent::Any || intent == RakeIntent::Feed) |         && (intent == RakeIntent::Any || intent == RakeIntent::Feed) | ||||||
|     {} |     { | ||||||
|  |         match rake_feed(&content, url) { | ||||||
|  |             Ok(feed) => { | ||||||
|  |                 return Ok(RakeOutcome::RakedFeed(feed)); | ||||||
|  |             } | ||||||
|  |             Err(error) => { | ||||||
|  |                 debug!("Failed to rake as feed: {:?}", error); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     if content_type == "application/xml" |     if SITEMAP_MIME_TYPES.contains(content_type.as_str()) | ||||||
|         && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap) |         && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap) | ||||||
|     {} |     { | ||||||
|  |         match rake_sitemap(&content) { | ||||||
|  |             Ok(sitemap) => { | ||||||
|  |                 return Ok(RakeOutcome::RakedSitemap(sitemap)); | ||||||
|  |             } | ||||||
|  |             Err(error) => { | ||||||
|  |                 debug!("Failed to rake as sitemap: {:?}", error); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     return Ok(RakeOutcome::PermanentFailure(PermanentFailure { |     return Ok(RakeOutcome::PermanentFailure(PermanentFailure { | ||||||
|         reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()), |         reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()), | ||||||
|     })); |     })); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> { | pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> { | ||||||
|     let content_str = std::str::from_utf8(content)?; |     let content_str = std::str::from_utf8(content)?; | ||||||
| 
 | 
 | ||||||
|     let mut readability = quickpeep_moz_readability::Readability::new(content_str); |     let mut readability = quickpeep_moz_readability::Readability::new(content_str); | ||||||
| @ -87,15 +159,136 @@ pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> { | |||||||
|         eprintln!("{}", node.to_string()); |         eprintln!("{}", node.to_string()); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     Ok(()) |     Ok(todo!()) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> { | pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> { | ||||||
|     let x = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?; |     let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?; | ||||||
|     todo!() | 
 | ||||||
|  |     let mut urls = Vec::new(); | ||||||
|  | 
 | ||||||
|  |     for entry in feed.entries { | ||||||
|  |         let link = if let Some(link) = entry.links.get(0) { | ||||||
|  |             link | ||||||
|  |         } else { | ||||||
|  |             continue; | ||||||
|  |         }; | ||||||
|  |         let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
 | ||||||
|  | 
 | ||||||
|  |         let last_changed = entry.updated.or(entry.published); | ||||||
|  | 
 | ||||||
|  |         urls.push(UrlRaked { | ||||||
|  |             url, | ||||||
|  |             last_changed, | ||||||
|  |             intent: RakeIntent::Page, | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // TODO paginated feeds (e.g. JSON Feed next_url)
 | ||||||
|  | 
 | ||||||
|  |     Ok(urls) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<()> { | pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> { | ||||||
|     //let x = sitemap::
 |     let curs = std::io::Cursor::new(content); | ||||||
|     todo!() |     let reader = sitemap::reader::SiteMapReader::new(curs); | ||||||
|  | 
 | ||||||
|  |     let mut urls = Vec::new(); | ||||||
|  | 
 | ||||||
|  |     for entry in reader { | ||||||
|  |         match &entry { | ||||||
|  |             SiteMapEntity::Url(url) => { | ||||||
|  |                 let loc = if let Some(loc) = url.loc.get_url() { | ||||||
|  |                     loc | ||||||
|  |                 } else { | ||||||
|  |                     continue; | ||||||
|  |                 }; | ||||||
|  | 
 | ||||||
|  |                 urls.push(UrlRaked { | ||||||
|  |                     url: loc, | ||||||
|  |                     last_changed: url | ||||||
|  |                         .lastmod | ||||||
|  |                         .get_time() | ||||||
|  |                         .map(|dt: DateTime<FixedOffset>| dt.into()), | ||||||
|  |                     intent: RakeIntent::Page, | ||||||
|  |                 }); | ||||||
|  |             } | ||||||
|  |             SiteMapEntity::SiteMap(sitemap) => { | ||||||
|  |                 let loc = if let Some(loc) = sitemap.loc.get_url() { | ||||||
|  |                     loc | ||||||
|  |                 } else { | ||||||
|  |                     continue; | ||||||
|  |                 }; | ||||||
|  | 
 | ||||||
|  |                 urls.push(UrlRaked { | ||||||
|  |                     url: loc, | ||||||
|  |                     last_changed: sitemap | ||||||
|  |                         .lastmod | ||||||
|  |                         .get_time() | ||||||
|  |                         .map(|dt: DateTime<FixedOffset>| dt.into()), | ||||||
|  |                     intent: RakeIntent::SiteMap, | ||||||
|  |                 }); | ||||||
|  |             } | ||||||
|  |             SiteMapEntity::Err(error) => { | ||||||
|  |                 debug!("Sitemap error {:?}", error); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         eprintln!("{:?}", entry); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if urls.is_empty() { | ||||||
|  |         bail!("No URLs or Sitemaps picked up from sitemap; is it bad?"); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     Ok(urls) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> { | ||||||
|  |     let robots_url = url | ||||||
|  |         .join("/robots.txt") | ||||||
|  |         .context("Whilst resolving /robots.txt on URL")?; | ||||||
|  |     let resp = client.get(robots_url.clone()).send().await?; | ||||||
|  | 
 | ||||||
|  |     if !resp.status().is_success() { | ||||||
|  |         let code = resp.status().as_u16(); | ||||||
|  |         if code == 404 || code == 410 { | ||||||
|  |             // not found or gone? Assume there is intentionally no robots.txt file.
 | ||||||
|  |             return Ok(None); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         bail!("Failed to get {:?}: {:?}", robots_url, resp.status()); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     let bytes = resp.bytes().await?; | ||||||
|  | 
 | ||||||
|  |     Ok(decode_robots_txt(&bytes).await?) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> { | ||||||
|  |     let mut sitemaps = Vec::new(); | ||||||
|  | 
 | ||||||
|  |     for line in bytes.split(|b| *b == b'\n') { | ||||||
|  |         let line = line.to_ascii_lowercase(); | ||||||
|  |         if line.starts_with(b"sitemap:") { | ||||||
|  |             if let Ok(value) = std::str::from_utf8(&line[8..]) { | ||||||
|  |                 if let Ok(url) = Url::parse(value.trim()) { | ||||||
|  |                     sitemaps.push(UrlRaked { | ||||||
|  |                         url, | ||||||
|  |                         last_changed: None, | ||||||
|  |                         intent: RakeIntent::SiteMap, | ||||||
|  |                     }); | ||||||
|  |                 } else { | ||||||
|  |                     debug!("Failed to parse sitemap value as a URL") | ||||||
|  |                 } | ||||||
|  |             } else { | ||||||
|  |                 debug!("Failed to parse sitemap value as UTF-8") | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     let rules = cylon::Compiler::new(USER_AGENT) | ||||||
|  |         .compile(bytes.as_bytes()) | ||||||
|  |         .await?; | ||||||
|  | 
 | ||||||
|  |     Ok(Some(RobotsTxt { sitemaps, rules })) | ||||||
| } | } | ||||||
|  | |||||||
							
								
								
									
										43
									
								
								quickpeep/src/raking/analysis.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								quickpeep/src/raking/analysis.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,43 @@ | |||||||
|  | use adblock::filters::cosmetic::CosmeticFilter; | ||||||
|  | use anyhow::anyhow; | ||||||
|  | use kuchiki::NodeRef; | ||||||
|  | use log::debug; | ||||||
|  | use std::path::Path; | ||||||
|  | use tokio::fs::File; | ||||||
|  | use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader}; | ||||||
|  | 
 | ||||||
|  | pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>( | ||||||
|  |     reader: R, | ||||||
|  | ) -> anyhow::Result<Vec<CosmeticFilter>> { | ||||||
|  |     let mut br = BufReader::new(reader); | ||||||
|  |     let mut rules = Vec::new(); | ||||||
|  |     let mut buf = String::new(); | ||||||
|  |     loop { | ||||||
|  |         buf.clear(); | ||||||
|  |         if br.read_line(&mut buf).await? == 0 { | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|  |         if let Ok(rule) = CosmeticFilter::parse(&buf, false) { | ||||||
|  |             rules.push(rule); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     Ok(rules) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | pub fn analyse_with_ad_block_cosmetic_filter( | ||||||
|  |     root: NodeRef, | ||||||
|  |     filters: &Vec<CosmeticFilter>, | ||||||
|  | ) -> anyhow::Result<bool> { | ||||||
|  |     let mut matches = 0; | ||||||
|  |     for rule in filters { | ||||||
|  |         for ele in root | ||||||
|  |             .select(&rule.selector) | ||||||
|  |             .map_err(|_| anyhow!("Failed to select(..)"))? | ||||||
|  |         { | ||||||
|  |             debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele); | ||||||
|  |             matches += 1; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     Ok(matches > 0) | ||||||
|  | } | ||||||
							
								
								
									
										88
									
								
								quickpeep/src/test.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										88
									
								
								quickpeep/src/test.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,88 @@ | |||||||
|  | use crate::raking::{decode_robots_txt, rake_feed, rake_sitemap, RakeIntent, UrlRaked}; | ||||||
|  | use reqwest::Url; | ||||||
|  | 
 | ||||||
|  | #[test] | ||||||
|  | pub fn test_sitemap() { | ||||||
|  |     let sm = rake_sitemap( | ||||||
|  |         br#" | ||||||
|  |     <urlset><url><loc>https://example.org/index.html</loc></url></urlset>
 | ||||||
|  |     "#,
 | ||||||
|  |     ) | ||||||
|  |     .unwrap(); | ||||||
|  |     assert_eq!( | ||||||
|  |         sm, | ||||||
|  |         vec![UrlRaked { | ||||||
|  |             url: Url::parse("https://example.org/index.html").unwrap(), | ||||||
|  |             last_changed: None, | ||||||
|  |             intent: RakeIntent::Page | ||||||
|  |         },] | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|  |     let sm = rake_sitemap( | ||||||
|  |         br#" | ||||||
|  |     <sitemapindex><sitemap><loc>https://example.org/sitemap.xml</loc></sitemap></sitemapindex>
 | ||||||
|  |     "#,
 | ||||||
|  |     ) | ||||||
|  |     .unwrap(); | ||||||
|  |     assert_eq!( | ||||||
|  |         sm, | ||||||
|  |         vec![UrlRaked { | ||||||
|  |             url: Url::parse("https://example.org/sitemap.xml").unwrap(), | ||||||
|  |             last_changed: None, | ||||||
|  |             intent: RakeIntent::SiteMap | ||||||
|  |         }] | ||||||
|  |     ); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #[test] | ||||||
|  | pub fn test_feed() { | ||||||
|  |     let feed = rake_feed( | ||||||
|  |         br#" | ||||||
|  |     <feed> | ||||||
|  |         <entry> | ||||||
|  |             <link href="https://example.org/index.html" /> | ||||||
|  |         </entry> | ||||||
|  |     </feed> | ||||||
|  |     "#,
 | ||||||
|  |         &Url::parse("https://example.org/atom.xml").unwrap(), | ||||||
|  |     ) | ||||||
|  |     .unwrap(); | ||||||
|  | 
 | ||||||
|  |     assert_eq!( | ||||||
|  |         feed, | ||||||
|  |         vec![UrlRaked { | ||||||
|  |             url: Url::parse("https://example.org/index.html").unwrap(), | ||||||
|  |             last_changed: None, | ||||||
|  |             intent: RakeIntent::Page | ||||||
|  |         },] | ||||||
|  |     ); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #[tokio::test] | ||||||
|  | pub async fn test_robots_txt() { | ||||||
|  |     let rtxt = decode_robots_txt( | ||||||
|  |         br#" | ||||||
|  | User-Agent: * | ||||||
|  | Disallow: /bad | ||||||
|  | Allow: /bad/abc | ||||||
|  | 
 | ||||||
|  | SiteMap: https://example.org/sitemap.xml
 | ||||||
|  |     "#,
 | ||||||
|  |     ) | ||||||
|  |     .await | ||||||
|  |     .unwrap() | ||||||
|  |     .unwrap(); | ||||||
|  |     assert_eq!( | ||||||
|  |         rtxt.sitemaps, | ||||||
|  |         vec![UrlRaked { | ||||||
|  |             url: Url::parse("https://example.org/sitemap.xml").unwrap(), | ||||||
|  |             last_changed: None, | ||||||
|  |             intent: RakeIntent::SiteMap | ||||||
|  |         }] | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|  |     assert!(!rtxt.rules.allow("/bad")); | ||||||
|  |     assert!(!rtxt.rules.allow("/bad/def")); | ||||||
|  |     assert!(rtxt.rules.allow("/bad/abc")); | ||||||
|  |     assert!(rtxt.rules.allow("/good")); | ||||||
|  | } | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user