Add more work towards raking all the different formats
This commit is contained in:
		
							parent
							
								
									db5524eb52
								
							
						
					
					
						commit
						210e8ef10a
					
				
							
								
								
									
										63
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										63
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @ -196,6 +196,16 @@ dependencies = [ | ||||
|  "syn", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "cylon" | ||||
| version = "0.2.0" | ||||
| source = "git+https://github.com/reivilibre/cylon.git?branch=rei/fix_import#12cb6861d6fbd28151bf7befede910b82436034a" | ||||
| dependencies = [ | ||||
|  "futures-util", | ||||
|  "serde", | ||||
|  "serde_derive", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "derive_more" | ||||
| version = "0.99.17" | ||||
| @ -347,6 +357,23 @@ version = "0.3.21" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "futures-io" | ||||
| version = "0.3.21" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "futures-macro" | ||||
| version = "0.3.21" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" | ||||
| dependencies = [ | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "syn", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "futures-sink" | ||||
| version = "0.3.21" | ||||
| @ -366,9 +393,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" | ||||
| dependencies = [ | ||||
|  "futures-core", | ||||
|  "futures-io", | ||||
|  "futures-macro", | ||||
|  "futures-task", | ||||
|  "memchr", | ||||
|  "pin-project-lite", | ||||
|  "pin-utils", | ||||
|  "slab", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -422,9 +453,9 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "h2" | ||||
| version = "0.3.11" | ||||
| version = "0.3.12" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e" | ||||
| checksum = "62eeb471aa3e3c9197aa4bfeabfe02982f6dc96f750486c0bb0009ac58b26d2b" | ||||
| dependencies = [ | ||||
|  "bytes", | ||||
|  "fnv", | ||||
| @ -709,14 +740,15 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "mio" | ||||
| version = "0.8.0" | ||||
| version = "0.8.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2" | ||||
| checksum = "7ba42135c6a5917b9db9cd7b293e5409e1c6b041e6f9825e92e55a894c63b6f8" | ||||
| dependencies = [ | ||||
|  "libc", | ||||
|  "log", | ||||
|  "miow", | ||||
|  "ntapi", | ||||
|  "wasi 0.11.0+wasi-snapshot-preview1", | ||||
|  "winapi", | ||||
| ] | ||||
| 
 | ||||
| @ -1029,11 +1061,15 @@ version = "0.1.0" | ||||
| dependencies = [ | ||||
|  "adblock", | ||||
|  "anyhow", | ||||
|  "bytes", | ||||
|  "chrono", | ||||
|  "cylon", | ||||
|  "env_logger", | ||||
|  "feed-rs", | ||||
|  "gemini-fetch", | ||||
|  "html5ever", | ||||
|  "kuchiki", | ||||
|  "lazy_static", | ||||
|  "log", | ||||
|  "quickpeep_moz_readability", | ||||
|  "reqwest", | ||||
| @ -1126,9 +1162,9 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "regex" | ||||
| version = "1.5.4" | ||||
| version = "1.5.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" | ||||
| checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" | ||||
| dependencies = [ | ||||
|  "aho-corasick", | ||||
|  "memchr", | ||||
| @ -1179,6 +1215,7 @@ dependencies = [ | ||||
|  "serde_urlencoded", | ||||
|  "tokio", | ||||
|  "tokio-native-tls", | ||||
|  "tokio-util", | ||||
|  "url", | ||||
|  "wasm-bindgen", | ||||
|  "wasm-bindgen-futures", | ||||
| @ -1652,9 +1689,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "tracing" | ||||
| version = "0.1.31" | ||||
| version = "0.1.32" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "f6c650a8ef0cd2dd93736f033d21cbd1224c5a967aa0c258d00fcf7dafef9b9f" | ||||
| checksum = "4a1bdf54a7c28a2bbf701e1d2233f6c77f473486b94bee4f9678da5a148dca7f" | ||||
| dependencies = [ | ||||
|  "cfg-if", | ||||
|  "pin-project-lite", | ||||
| @ -1663,9 +1700,9 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "tracing-core" | ||||
| version = "0.1.22" | ||||
| version = "0.1.23" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23" | ||||
| checksum = "aa31669fa42c09c34d94d8165dd2012e8ff3c66aca50f3bb226b68f216f2706c" | ||||
| dependencies = [ | ||||
|  "lazy_static", | ||||
| ] | ||||
| @ -1774,6 +1811,12 @@ version = "0.10.2+wasi-snapshot-preview1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "wasi" | ||||
| version = "0.11.0+wasi-snapshot-preview1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "wasm-bindgen" | ||||
| version = "0.2.79" | ||||
|  | ||||
| @ -4,3 +4,7 @@ members = [ | ||||
|     "quickpeep_moz_readability" | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
| [patch.crates-io] | ||||
| cylon = { git = "https://github.com/reivilibre/cylon.git", branch = "rei/fix_import" } | ||||
| 
 | ||||
|  | ||||
| @ -18,6 +18,11 @@ html5ever = "0.25.1" | ||||
| 
 | ||||
| serde = { version = "1.0.136", features = ["derive"] } | ||||
| 
 | ||||
| chrono = "0.4.19" | ||||
| 
 | ||||
| lazy_static = "1.4.0" | ||||
| 
 | ||||
| bytes = "1.1.0" | ||||
| 
 | ||||
| # TODO: rkyv and memmap2 should be an efficient way to load index packs into processes. | ||||
| # rkyv = "0.7.35" | ||||
| @ -25,12 +30,12 @@ serde = { version = "1.0.136", features = ["derive"] } | ||||
| 
 | ||||
| ### Raking helpers | ||||
| # HTTP Requests | ||||
| reqwest = { version = "0.11.9", features = [] } | ||||
| reqwest = { version = "0.11.9", features = ["stream"] } | ||||
| # Gemini Requests | ||||
| # N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those. | ||||
| gemini-fetch = "0.2.1" | ||||
| # Robots.txt | ||||
| # TODO cylon = { version = "0.2.0", features = [] } | ||||
| cylon = { version = "0.2.0", features = ["crawl-delay"] } | ||||
| # RSS/Atom/JSON feeds | ||||
| feed-rs = "1.0.0" | ||||
| # Sitemaps | ||||
|  | ||||
| @ -1,12 +1,4 @@ | ||||
| pub mod raking; | ||||
| 
 | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|     pub fn test_sitemap() { | ||||
|         let mut curs = std::io::Cursor::new("<url><loc>https://lol</loc></url>"); | ||||
|         let reader = sitemap::reader::SiteMapReader::new(curs); | ||||
|         for entry in reader { | ||||
|             eprintln!("{:?}", entry); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| mod test; | ||||
|  | ||||
| @ -1,28 +1,58 @@ | ||||
| use anyhow::{bail, Context}; | ||||
| use reqwest::header::HeaderValue; | ||||
| use chrono::{DateTime, FixedOffset, Utc}; | ||||
| use cylon::Cylon; | ||||
| use html5ever::tendril::fmt::Slice; | ||||
| use lazy_static::lazy_static; | ||||
| use log::debug; | ||||
| use reqwest::{Client, Url}; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use sitemap::reader::SiteMapEntity; | ||||
| use std::collections::HashSet; | ||||
| 
 | ||||
| mod analysis; | ||||
| 
 | ||||
| pub const USER_AGENT: &'static str = "QuickPeepBot"; | ||||
| 
 | ||||
| pub enum RakeOutcome { | ||||
|     RakedPage(RakedPage), | ||||
|     RakedFeed(RakedFeed), | ||||
|     RakedSitemap(RakedSitemap), | ||||
|     RakedFeed(Vec<UrlRaked>), | ||||
|     RakedSitemap(Vec<UrlRaked>), | ||||
|     /// The page was not canonical, and should not be indexed.
 | ||||
|     /// However here is the URL of the canonical page.
 | ||||
|     NotCanonical { | ||||
|         new_url: Url, | ||||
|     }, | ||||
|     TemporaryFailure(TemporaryFailure), | ||||
|     PermanentFailure(PermanentFailure), | ||||
| } | ||||
| 
 | ||||
/// A URL discovered whilst raking, e.g. from a feed, a sitemap or a
/// robots.txt `Sitemap:` directive.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UrlRaked {
    // The discovered URL itself.
    pub url: Url,
    // When the resource last changed, if the source declared it
    // (feed `updated`/`published`, sitemap `lastmod`); otherwise None.
    pub last_changed: Option<DateTime<Utc>>,
    // How the URL should be raked when it is visited.
    pub intent: RakeIntent,
}
| 
 | ||||
| pub struct RakedPage {} | ||||
| 
 | ||||
| pub struct RakedFeed {} | ||||
| pub struct RobotsTxt { | ||||
|     pub sitemaps: Vec<UrlRaked>, | ||||
|     pub rules: Cylon, | ||||
| } | ||||
| 
 | ||||
| pub struct RakedSitemap {} | ||||
| 
 | ||||
| pub struct TemporaryFailure {} | ||||
| pub struct TemporaryFailure { | ||||
|     pub reason: TemporaryFailureReason, | ||||
|     pub backoff_sec: u32, | ||||
| } | ||||
| 
 | ||||
| pub struct PermanentFailure { | ||||
|     pub reason: PermanentFailureReason, | ||||
| } | ||||
| 
 | ||||
| pub enum TemporaryFailureReason { | ||||
|     MissingInformation(String), | ||||
| } | ||||
| 
 | ||||
| pub enum PermanentFailureReason { | ||||
|     ResourceDenied(u32), | ||||
|     WrongLanguage(String), | ||||
| @ -37,6 +67,20 @@ pub enum RakeIntent { | ||||
|     SiteMap, | ||||
| } | ||||
| 
 | ||||
// Content-Type allowlists used by `rake` to decide which parser to attempt.
// Note that "text/xml" and "application/xml" legitimately appear in both sets:
// a generic XML Content-Type could be either a sitemap or a feed, so both
// parsers may be tried for it (subject to the RakeIntent).
lazy_static! {
    // Content-Types that may contain a sitemap.
    static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
        HashSet::from_iter(vec!["text/xml", "application/xml",]);
    // Content-Types that may contain a feed (RSS / Atom / RDF / JSON Feed).
    static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
        "text/xml",
        "application/xml",
        "application/atom+xml",
        "application/rss+xml",
        "application/rdf+xml",
        "application/json",
        "application/feed+json"
    ]);
}
| 
 | ||||
| pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> { | ||||
|     let response = client.get(url.clone()).send().await?; | ||||
| 
 | ||||
| @ -49,31 +93,59 @@ pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Res | ||||
|             .to_str() | ||||
|             .context("Can't convert content-type to str")?; | ||||
|         eprintln!("CT {:?}", content_type); | ||||
|         content_type.to_owned() | ||||
|         content_type.split(";").next().unwrap().trim().to_owned() | ||||
|     } else { | ||||
|         // TODO ???
 | ||||
|         "text/html".to_owned() | ||||
|         return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure { | ||||
|             reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()), | ||||
|             backoff_sec: 86400 * 7, | ||||
|         })); | ||||
|     }; | ||||
| 
 | ||||
|     let content = response.bytes().await?; | ||||
| 
 | ||||
|     if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {} | ||||
|     if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) { | ||||
|         match rake_html_page(&content, url) { | ||||
|             Ok(page_rake) => { | ||||
|                 return Ok(RakeOutcome::RakedPage(page_rake)); | ||||
|             } | ||||
|             Err(error) => { | ||||
|                 debug!("Failed to rake HTML page: {:?}", error); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     // TODO JSON Feeds.
 | ||||
|     if content_type == "application/xml" | ||||
|     if FEED_MIME_TYPES.contains(content_type.as_str()) | ||||
|         && (intent == RakeIntent::Any || intent == RakeIntent::Feed) | ||||
|     {} | ||||
|     { | ||||
|         match rake_feed(&content, url) { | ||||
|             Ok(feed) => { | ||||
|                 return Ok(RakeOutcome::RakedFeed(feed)); | ||||
|             } | ||||
|             Err(error) => { | ||||
|                 debug!("Failed to rake as feed: {:?}", error); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if content_type == "application/xml" | ||||
|     if SITEMAP_MIME_TYPES.contains(content_type.as_str()) | ||||
|         && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap) | ||||
|     {} | ||||
|     { | ||||
|         match rake_sitemap(&content) { | ||||
|             Ok(sitemap) => { | ||||
|                 return Ok(RakeOutcome::RakedSitemap(sitemap)); | ||||
|             } | ||||
|             Err(error) => { | ||||
|                 debug!("Failed to rake as sitemap: {:?}", error); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     return Ok(RakeOutcome::PermanentFailure(PermanentFailure { | ||||
|         reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()), | ||||
|     })); | ||||
| } | ||||
| 
 | ||||
| pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> { | ||||
| pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> { | ||||
|     let content_str = std::str::from_utf8(content)?; | ||||
| 
 | ||||
|     let mut readability = quickpeep_moz_readability::Readability::new(content_str); | ||||
| @ -87,15 +159,136 @@ pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> { | ||||
|         eprintln!("{}", node.to_string()); | ||||
|     } | ||||
| 
 | ||||
|     Ok(()) | ||||
|     Ok(todo!()) | ||||
| } | ||||
| 
 | ||||
| pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> { | ||||
|     let x = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?; | ||||
|     todo!() | ||||
| pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> { | ||||
|     let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?; | ||||
| 
 | ||||
|     let mut urls = Vec::new(); | ||||
| 
 | ||||
|     for entry in feed.entries { | ||||
|         let link = if let Some(link) = entry.links.get(0) { | ||||
|             link | ||||
|         } else { | ||||
|             continue; | ||||
|         }; | ||||
|         let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
 | ||||
| 
 | ||||
|         let last_changed = entry.updated.or(entry.published); | ||||
| 
 | ||||
|         urls.push(UrlRaked { | ||||
|             url, | ||||
|             last_changed, | ||||
|             intent: RakeIntent::Page, | ||||
|         }); | ||||
|     } | ||||
| 
 | ||||
|     // TODO paginated feeds (e.g. JSON Feed next_url)
 | ||||
| 
 | ||||
|     Ok(urls) | ||||
| } | ||||
| 
 | ||||
| pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<()> { | ||||
|     //let x = sitemap::
 | ||||
|     todo!() | ||||
| pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> { | ||||
|     let curs = std::io::Cursor::new(content); | ||||
|     let reader = sitemap::reader::SiteMapReader::new(curs); | ||||
| 
 | ||||
|     let mut urls = Vec::new(); | ||||
| 
 | ||||
|     for entry in reader { | ||||
|         match &entry { | ||||
|             SiteMapEntity::Url(url) => { | ||||
|                 let loc = if let Some(loc) = url.loc.get_url() { | ||||
|                     loc | ||||
|                 } else { | ||||
|                     continue; | ||||
|                 }; | ||||
| 
 | ||||
|                 urls.push(UrlRaked { | ||||
|                     url: loc, | ||||
|                     last_changed: url | ||||
|                         .lastmod | ||||
|                         .get_time() | ||||
|                         .map(|dt: DateTime<FixedOffset>| dt.into()), | ||||
|                     intent: RakeIntent::Page, | ||||
|                 }); | ||||
|             } | ||||
|             SiteMapEntity::SiteMap(sitemap) => { | ||||
|                 let loc = if let Some(loc) = sitemap.loc.get_url() { | ||||
|                     loc | ||||
|                 } else { | ||||
|                     continue; | ||||
|                 }; | ||||
| 
 | ||||
|                 urls.push(UrlRaked { | ||||
|                     url: loc, | ||||
|                     last_changed: sitemap | ||||
|                         .lastmod | ||||
|                         .get_time() | ||||
|                         .map(|dt: DateTime<FixedOffset>| dt.into()), | ||||
|                     intent: RakeIntent::SiteMap, | ||||
|                 }); | ||||
|             } | ||||
|             SiteMapEntity::Err(error) => { | ||||
|                 debug!("Sitemap error {:?}", error); | ||||
|             } | ||||
|         } | ||||
|         eprintln!("{:?}", entry); | ||||
|     } | ||||
| 
 | ||||
|     if urls.is_empty() { | ||||
|         bail!("No URLs or Sitemaps picked up from sitemap; is it bad?"); | ||||
|     } | ||||
| 
 | ||||
|     Ok(urls) | ||||
| } | ||||
| 
 | ||||
| pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> { | ||||
|     let robots_url = url | ||||
|         .join("/robots.txt") | ||||
|         .context("Whilst resolving /robots.txt on URL")?; | ||||
|     let resp = client.get(robots_url.clone()).send().await?; | ||||
| 
 | ||||
|     if !resp.status().is_success() { | ||||
|         let code = resp.status().as_u16(); | ||||
|         if code == 404 || code == 410 { | ||||
|             // not found or gone? Assume there is intentionally no robots.txt file.
 | ||||
|             return Ok(None); | ||||
|         } | ||||
| 
 | ||||
|         bail!("Failed to get {:?}: {:?}", robots_url, resp.status()); | ||||
|     } | ||||
| 
 | ||||
|     let bytes = resp.bytes().await?; | ||||
| 
 | ||||
|     Ok(decode_robots_txt(&bytes).await?) | ||||
| } | ||||
| 
 | ||||
| pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> { | ||||
|     let mut sitemaps = Vec::new(); | ||||
| 
 | ||||
|     for line in bytes.split(|b| *b == b'\n') { | ||||
|         let line = line.to_ascii_lowercase(); | ||||
|         if line.starts_with(b"sitemap:") { | ||||
|             if let Ok(value) = std::str::from_utf8(&line[8..]) { | ||||
|                 if let Ok(url) = Url::parse(value.trim()) { | ||||
|                     sitemaps.push(UrlRaked { | ||||
|                         url, | ||||
|                         last_changed: None, | ||||
|                         intent: RakeIntent::SiteMap, | ||||
|                     }); | ||||
|                 } else { | ||||
|                     debug!("Failed to parse sitemap value as a URL") | ||||
|                 } | ||||
|             } else { | ||||
|                 debug!("Failed to parse sitemap value as UTF-8") | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     let rules = cylon::Compiler::new(USER_AGENT) | ||||
|         .compile(bytes.as_bytes()) | ||||
|         .await?; | ||||
| 
 | ||||
|     Ok(Some(RobotsTxt { sitemaps, rules })) | ||||
| } | ||||
|  | ||||
							
								
								
									
										43
									
								
								quickpeep/src/raking/analysis.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								quickpeep/src/raking/analysis.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,43 @@ | ||||
| use adblock::filters::cosmetic::CosmeticFilter; | ||||
| use anyhow::anyhow; | ||||
| use kuchiki::NodeRef; | ||||
| use log::debug; | ||||
| use std::path::Path; | ||||
| use tokio::fs::File; | ||||
| use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader}; | ||||
| 
 | ||||
| pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>( | ||||
|     reader: R, | ||||
| ) -> anyhow::Result<Vec<CosmeticFilter>> { | ||||
|     let mut br = BufReader::new(reader); | ||||
|     let mut rules = Vec::new(); | ||||
|     let mut buf = String::new(); | ||||
|     loop { | ||||
|         buf.clear(); | ||||
|         if br.read_line(&mut buf).await? == 0 { | ||||
|             break; | ||||
|         } | ||||
|         if let Ok(rule) = CosmeticFilter::parse(&buf, false) { | ||||
|             rules.push(rule); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     Ok(rules) | ||||
| } | ||||
| 
 | ||||
| pub fn analyse_with_ad_block_cosmetic_filter( | ||||
|     root: NodeRef, | ||||
|     filters: &Vec<CosmeticFilter>, | ||||
| ) -> anyhow::Result<bool> { | ||||
|     let mut matches = 0; | ||||
|     for rule in filters { | ||||
|         for ele in root | ||||
|             .select(&rule.selector) | ||||
|             .map_err(|_| anyhow!("Failed to select(..)"))? | ||||
|         { | ||||
|             debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele); | ||||
|             matches += 1; | ||||
|         } | ||||
|     } | ||||
|     Ok(matches > 0) | ||||
| } | ||||
							
								
								
									
										88
									
								
								quickpeep/src/test.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										88
									
								
								quickpeep/src/test.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,88 @@ | ||||
| use crate::raking::{decode_robots_txt, rake_feed, rake_sitemap, RakeIntent, UrlRaked}; | ||||
| use reqwest::Url; | ||||
| 
 | ||||
/// `rake_sitemap` should handle both plain sitemaps and sitemap indexes.
#[test]
pub fn test_sitemap() {
    // A <urlset> sitemap: its <url> entries come back with Page intent.
    let sm = rake_sitemap(
        br#"
    <urlset><url><loc>https://example.org/index.html</loc></url></urlset>
    "#,
    )
    .unwrap();
    assert_eq!(
        sm,
        vec![UrlRaked {
            url: Url::parse("https://example.org/index.html").unwrap(),
            last_changed: None,
            intent: RakeIntent::Page
        },]
    );

    // A <sitemapindex>: nested <sitemap> entries come back with SiteMap
    // intent, so they will themselves be raked as sitemaps.
    let sm = rake_sitemap(
        br#"
    <sitemapindex><sitemap><loc>https://example.org/sitemap.xml</loc></sitemap></sitemapindex>
    "#,
    )
    .unwrap();
    assert_eq!(
        sm,
        vec![UrlRaked {
            url: Url::parse("https://example.org/sitemap.xml").unwrap(),
            last_changed: None,
            intent: RakeIntent::SiteMap
        }]
    );
}
| 
 | ||||
/// `rake_feed` should extract entry links from an Atom feed as Page URLs.
#[test]
pub fn test_feed() {
    // Minimal Atom-style feed with a single entry link; the second argument
    // is the feed's own URL, used for resolving relative links.
    let feed = rake_feed(
        br#"
    <feed>
        <entry>
            <link href="https://example.org/index.html" />
        </entry>
    </feed>
    "#,
        &Url::parse("https://example.org/atom.xml").unwrap(),
    )
    .unwrap();

    // No <updated>/<published> in the entry, so last_changed is None.
    assert_eq!(
        feed,
        vec![UrlRaked {
            url: Url::parse("https://example.org/index.html").unwrap(),
            last_changed: None,
            intent: RakeIntent::Page
        },]
    );
}
| 
 | ||||
/// `decode_robots_txt` should pick up Sitemap directives and compile
/// allow/deny rules for our user agent.
#[tokio::test]
pub async fn test_robots_txt() {
    let rtxt = decode_robots_txt(
        br#"
User-Agent: *
Disallow: /bad
Allow: /bad/abc

SiteMap: https://example.org/sitemap.xml
    "#,
    )
    .await
    .unwrap()
    .unwrap();
    // The (mixed-case) SiteMap directive should be recognised and its URL
    // queued with SiteMap intent.
    assert_eq!(
        rtxt.sitemaps,
        vec![UrlRaked {
            url: Url::parse("https://example.org/sitemap.xml").unwrap(),
            last_changed: None,
            intent: RakeIntent::SiteMap
        }]
    );

    // /bad is disallowed, with /bad/abc carved back out by the Allow rule;
    // anything else is permitted.
    assert!(!rtxt.rules.allow("/bad"));
    assert!(!rtxt.rules.allow("/bad/def"));
    assert!(rtxt.rules.allow("/bad/abc"));
    assert!(rtxt.rules.allow("/good"));
}
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user