First step towards minimum usability

This commit is contained in:
Olivier 'reivilibre' 2022-03-12 17:52:01 +00:00
parent 4cd259d0ac
commit db5524eb52
6 changed files with 130 additions and 30 deletions

26
Cargo.lock generated
View File

@ -196,17 +196,6 @@ dependencies = [
"syn",
]
[[package]]
name = "cylon"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d4e899a624b708589dea1b6396de3ef38bf3843c824a89190b0fa82ae6e7fd3"
dependencies = [
"futures-util",
"serde",
"serde_derive",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -358,17 +347,6 @@ version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3"
[[package]]
name = "futures-macro"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.21"
@ -388,11 +366,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a"
dependencies = [
"futures-core",
"futures-macro",
"futures-task",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
@ -1053,7 +1029,6 @@ version = "0.1.0"
dependencies = [
"adblock",
"anyhow",
"cylon",
"env_logger",
"feed-rs",
"gemini-fetch",
@ -1062,6 +1037,7 @@ dependencies = [
"log",
"quickpeep_moz_readability",
"reqwest",
"serde",
"sitemap",
"tokio",
]

66
docs/concepts.md Normal file
View File

@ -0,0 +1,66 @@
QuickPeep Concepts
==================
Principles
----------
1. Focus on good-quality, interesting, personal content rather than completeness
for every search query.
2. Support running a search engine on modest hardware.
Critically, disk space is likely to be constrained in real-world deployments.
Components and Subcomponents
----------------------------
### On-disk Structures
Schedule:
- List of URLs to rake
- Backoffs for failing hosts
RakePack:
- Contains summarised results of scraping many pages
- In a streamable, dense memory-mappable format.
- Perhaps use `rkyv` to store the records.
Index:
- Searchable index of all documents
- Might be distributable as deltas or something, not sure — to be decided.
- Might be sharded by different parameters (e.g. tags) — specifics to be decided.
- Might be sharded by date of raking — specifics to be decided.
Not sure how to best manage an ever-growing dataset.
### Programs
#### Importer
Imports URLs from seed files. Needed to bootstrap the entire engine.
#### Raker
Rakes a page, feed or sitemap.
Builds robots.txt file caches as necessary.
Generates a summarised version of the page.
Also tries to extract readable content, for higher ranking in the index.
Also analyses pages for pop-ups and other issues.
(Unsure if we should do the analysis for e.g. cloudflare at this stage or not?)
#### Indexer
Imports RakePacks and indexes them for searchability.
Also maintains a graph database of all cross-page links.
We could use this link graph to perform ranking.
TODO: investigate PageRank-style algorithms for this.
#### Searcher
Provides a front-end for searching in the index.
Could provide an API. (Maybe we can integrate into Searx and get the best of both?)

View File

@ -16,6 +16,8 @@ quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
kuchiki = "0.8.1"
html5ever = "0.25.1"
serde = { version = "1.0.136", features = ["derive"] }
# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
# rkyv = "0.7.35"
@ -28,7 +30,7 @@ reqwest = { version = "0.11.9", features = [] }
# N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those.
gemini-fetch = "0.2.1"
# Robots.txt
cylon = { version = "0.2.0", features = [] }
# TODO cylon = { version = "0.2.0", features = [] }
# RSS/Atom/JSON feeds
feed-rs = "1.0.0"
# Sitemaps

View File

@ -1,4 +1,5 @@
use quickpeep::raking::rake;
use quickpeep::raking::RakeIntent;
use reqwest::Url;
use std::str::FromStr;
@ -8,12 +9,14 @@ pub async fn main() -> anyhow::Result<()> {
// TODO max timeout, max body size
rake(
&Url::from_str("http://nothings.org/gamedev/ssao/")?,
RakeIntent::Page,
&client,
)
.await?;
rake(
&Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
RakeIntent::Page,
&client,
)
.await?;

View File

@ -1 +1,12 @@
pub mod raking;
#[cfg(test)]
mod test {
    /// Smoke test: parse a minimal sitemap snippet with `SiteMapReader` and
    /// dump every entry it yields.
    ///
    /// NOTE(review): this was missing the `#[test]` attribute, so `cargo test`
    /// never actually ran it. Also dropped the needless `mut` on the cursor
    /// (`SiteMapReader::new` takes it by value).
    #[test]
    fn test_sitemap() {
        let curs = std::io::Cursor::new("<url><loc>https://lol</loc></url>");
        let reader = sitemap::reader::SiteMapReader::new(curs);
        for entry in reader {
            eprintln!("{:?}", entry);
        }
    }
}

View File

@ -1,6 +1,7 @@
use anyhow::{bail, Context};
use reqwest::header::HeaderValue;
use reqwest::{Client, Url};
use serde::{Deserialize, Serialize};
pub enum RakeOutcome {
RakedPage(RakedPage),
@ -25,24 +26,55 @@ pub struct PermanentFailure {
/// Why a resource permanently failed to rake (i.e. not worth retrying).
pub enum PermanentFailureReason {
    /// Access to the resource was denied; presumably the HTTP status code — TODO confirm.
    ResourceDenied(u32),
    /// The page is in a language we don't index; presumably a language tag — TODO confirm.
    WrongLanguage(String),
    /// The Content-Type header was not one we know how to rake (stores that content type).
    UnknownContentType(String),
}
pub async fn rake(url: &Url, client: &Client) -> anyhow::Result<()> {
/// What kind of resource the caller expects a URL to be; `rake` only runs the
/// handlers whose content type matches this intent.
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum RakeIntent {
    /// No expectation: accept whatever the content turns out to be.
    Any,
    /// Expect an HTML page (`text/html`).
    Page,
    /// Expect a feed (`application/xml`; JSON feeds are a TODO in `rake`).
    Feed,
    /// Expect an XML sitemap (`application/xml`).
    SiteMap,
}
/// Fetch `url` and rake it according to `intent`.
///
/// Bails on non-success HTTP statuses; returns a
/// `RakeOutcome::PermanentFailure(UnknownContentType)` when the response's
/// content type is not one we handle (the type-specific branches are still
/// WIP stubs, so currently everything falls through to that failure).
///
/// TODO: enforce a maximum timeout and maximum body size on the fetch.
pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> {
    let response = client.get(url.clone()).send().await?;

    if !response.status().is_success() {
        bail!("Not successful: {:?}", response.status().as_u16());
    }

    let content_type = if let Some(content_type) = response.headers().get("content-type") {
        let content_type = content_type
            .to_str()
            .context("Can't convert content-type to str")?;
        // Strip media-type parameters and normalise case, so that e.g.
        // `text/html; charset=utf-8` still matches the comparisons below.
        // (Previously an exact string comparison was used, which would have
        // mis-filed any response carrying a charset parameter.)
        content_type
            .split(';')
            .next()
            .unwrap_or("")
            .trim()
            .to_ascii_lowercase()
    } else {
        // TODO ??? No Content-Type header; assume HTML for now.
        "text/html".to_owned()
    };

    let content = response.bytes().await?;
    // Validate UTF-8 up front; the branch handlers below will consume this
    // once they are implemented.
    let content_str = std::str::from_utf8(&content)?;

    if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {
        // TODO rake as an HTML page (see `rake_html_page`).
    }

    // TODO JSON Feeds.
    if content_type == "application/xml"
        && (intent == RakeIntent::Any || intent == RakeIntent::Feed)
    {
        // TODO rake as a feed (see `rake_feed`).
    }

    if content_type == "application/xml"
        && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
    {
        // TODO rake as a sitemap (see `rake_sitemap`).
    }

    // Nothing handled it: report the content type we couldn't rake.
    // (`content_type` is already owned, so no extra clone is needed here.)
    return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
        reason: PermanentFailureReason::UnknownContentType(content_type),
    }));
}
pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
let content_str = std::str::from_utf8(content)?;
let mut readability = quickpeep_moz_readability::Readability::new(content_str);
readability
@ -57,3 +89,13 @@ pub async fn rake(url: &Url, client: &Client) -> anyhow::Result<()> {
Ok(())
}
/// Rake an RSS/Atom feed.
///
/// Parsing already works; turning the parsed feed into a rake result is
/// still TODO, so this currently panics after a successful parse.
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> {
    // `parse_with_uri` resolves relative links in the feed against `url`.
    // Underscore-prefixed until the TODO below consumes it (was `x`, which
    // triggered an unused-variable warning and said nothing about the value).
    let _feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
    todo!()
}
/// Rake an XML sitemap.
///
/// TODO: parse `content` with `sitemap::reader::SiteMapReader` (as sketched
/// in the lib.rs smoke test) and emit the contained URLs as further rake
/// targets. Unimplemented for now.
pub fn rake_sitemap(_content: &[u8]) -> anyhow::Result<()> {
    todo!()
}