From db5524eb529c5125cfc0cd56d3dcba58dc41b733 Mon Sep 17 00:00:00 2001 From: Olivier Date: Sat, 12 Mar 2022 17:52:01 +0000 Subject: [PATCH] First step towards minimum usability --- Cargo.lock | 26 +------------- docs/concepts.md | 66 ++++++++++++++++++++++++++++++++++++ quickpeep/Cargo.toml | 4 ++- quickpeep/src/bin/qp-rake.rs | 3 ++ quickpeep/src/lib.rs | 11 ++++++ quickpeep/src/raking.rs | 50 ++++++++++++++++++++++++--- 6 files changed, 130 insertions(+), 30 deletions(-) create mode 100644 docs/concepts.md diff --git a/Cargo.lock b/Cargo.lock index cc4bf5c..b9c4607 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -196,17 +196,6 @@ dependencies = [ "syn", ] -[[package]] -name = "cylon" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d4e899a624b708589dea1b6396de3ef38bf3843c824a89190b0fa82ae6e7fd3" -dependencies = [ - "futures-util", - "serde", - "serde_derive", -] - [[package]] name = "derive_more" version = "0.99.17" @@ -358,17 +347,6 @@ version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" -[[package]] -name = "futures-macro" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "futures-sink" version = "0.3.21" @@ -388,11 +366,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" dependencies = [ "futures-core", - "futures-macro", "futures-task", "pin-project-lite", "pin-utils", - "slab", ] [[package]] @@ -1053,7 +1029,6 @@ version = "0.1.0" dependencies = [ "adblock", "anyhow", - "cylon", "env_logger", "feed-rs", "gemini-fetch", @@ -1062,6 +1037,7 @@ dependencies = [ "log", "quickpeep_moz_readability", "reqwest", + "serde", "sitemap", "tokio", ] diff --git a/docs/concepts.md b/docs/concepts.md new file mode 100644 index 0000000..6fdd011 --- /dev/null +++ b/docs/concepts.md @@ -0,0 +1,66 @@ +QuickPeep Concepts +================== + +Principles +---------- + +1. Focus on good-quality, interesting, personal content rather than completeness + for every search query. +2. Support running a search engine on modest hardware. + Critically, disk space is likely to be constrained in real-world deployments. + + +Components and Subcomponents +---------------------------- + +### On-disk Structures + +Schedule: +- List of URLs to rake +- Backoffs for failing hosts + +RakePack: +- Contains summarised results of scraping many pages + - In a streamable, dense memory-mappable format. + - Perhaps use `rkyv` to store the records. + +Index: +- Searchable index of all documents + - Might be distributable as deltas or something, not sure — to be decided. + - Might be sharded by different parameters (e.g. tags) — specifics to be decided. + - Might be sharded by date of raking — specifics to be decided. + Not sure how to best manage an ever-growing dataset. + +### Programs + +#### Importer + +Imports URLs from seed files. Needed to bootstrap the entire engine. + + +#### Raker + +Rakes a page, feed or sitemap. +Builds robot.txt file caches as necessary. + +Generates a summarised version of the page. +Also tries to extract readable content, for higher ranking in the index. + +Also analyses pages for pop-ups and other issues. +(Unsure if we should do the analysis for e.g. cloudflare at this stage or not?) + +#### Indexer + +Imports RakePacks and indexes them for searchability. + +Also maintains a graph database of all cross-page links. +We can use this to perform ranking...? + +??? TODO pagerank ??? + + +#### Searcher + +Provides a front-end for searching in the index. +Could provide an API. (Maybe we can integrate into Searx and get the best of both?) + diff --git a/quickpeep/Cargo.toml b/quickpeep/Cargo.toml index c052a5b..0e85cdd 100644 --- a/quickpeep/Cargo.toml +++ b/quickpeep/Cargo.toml @@ -16,6 +16,8 @@ quickpeep_moz_readability = { path = "../quickpeep_moz_readability" } kuchiki = "0.8.1" html5ever = "0.25.1" +serde = { version = "1.0.136", features = ["derive"] } + # TODO: rkyv and memmap2 should be an efficient way to load index packs into processes. # rkyv = "0.7.35" @@ -28,7 +30,7 @@ reqwest = { version = "0.11.9", features = [] } # N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those. gemini-fetch = "0.2.1" # Robots.txt -cylon = { version = "0.2.0", features = [] } +# TODO cylon = { version = "0.2.0", features = [] } # RSS/Atom/JSON feeds feed-rs = "1.0.0" # Sitemaps diff --git a/quickpeep/src/bin/qp-rake.rs b/quickpeep/src/bin/qp-rake.rs index 0b5f4bb..67b35a5 100644 --- a/quickpeep/src/bin/qp-rake.rs +++ b/quickpeep/src/bin/qp-rake.rs @@ -1,4 +1,5 @@ use quickpeep::raking::rake; +use quickpeep::raking::RakeIntent; use reqwest::Url; use std::str::FromStr; @@ -8,12 +9,14 @@ pub async fn main() -> anyhow::Result<()> { // TODO max timeout, max body size rake( &Url::from_str("http://nothings.org/gamedev/ssao/")?, + RakeIntent::Page, &client, ) .await?; rake( &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?, + RakeIntent::Page, &client, ) .await?; diff --git a/quickpeep/src/lib.rs b/quickpeep/src/lib.rs index ab6acd4..e4db309 100644 --- a/quickpeep/src/lib.rs +++ b/quickpeep/src/lib.rs @@ -1 +1,12 @@ pub mod raking; + +#[cfg(test)] +mod test { + pub fn test_sitemap() { + let mut curs = std::io::Cursor::new("https://lol"); + let reader = sitemap::reader::SiteMapReader::new(curs); + for entry in reader { + eprintln!("{:?}", entry); + } + } +} diff --git a/quickpeep/src/raking.rs b/quickpeep/src/raking.rs index 6b49baa..2c2d0b0 100644 --- a/quickpeep/src/raking.rs +++ b/quickpeep/src/raking.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Context}; use reqwest::header::HeaderValue; use reqwest::{Client, Url}; +use serde::{Deserialize, Serialize}; pub enum RakeOutcome { RakedPage(RakedPage), @@ -25,24 +26,55 @@ pub struct PermanentFailure { pub enum PermanentFailureReason { ResourceDenied(u32), WrongLanguage(String), + UnknownContentType(String), } -pub async fn rake(url: &Url, client: &Client) -> anyhow::Result<()> { +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum RakeIntent { + Any, + Page, + Feed, + SiteMap, +} + +pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result { let response = client.get(url.clone()).send().await?; if !response.status().is_success() { bail!("Not successful: {:?}", response.status().as_u16()); } - if let Some(content_type) = response.headers().get("content-type") { + let content_type = if let Some(content_type) = response.headers().get("content-type") { let content_type = content_type .to_str() .context("Can't convert content-type to str")?; eprintln!("CT {:?}", content_type); - } + content_type.to_owned() + } else { + // TODO ??? + "text/html".to_owned() + }; let content = response.bytes().await?; - let content_str = std::str::from_utf8(&content)?; + + if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {} + + // TODO JSON Feeds. + if content_type == "application/xml" + && (intent == RakeIntent::Any || intent == RakeIntent::Feed) + {} + + if content_type == "application/xml" + && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap) + {} + + return Ok(RakeOutcome::PermanentFailure(PermanentFailure { + reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()), + })); +} + +pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> { + let content_str = std::str::from_utf8(content)?; let mut readability = quickpeep_moz_readability::Readability::new(content_str); readability @@ -57,3 +89,13 @@ pub async fn rake(url: &Url, client: &Client) -> anyhow::Result<()> { Ok(()) } + +pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> { + let x = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?; + todo!() +} + +pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<()> { + //let x = sitemap:: + todo!() +}