First step towards minimum usability
This commit is contained in:
parent
4cd259d0ac
commit
db5524eb52
26
Cargo.lock
generated
26
Cargo.lock
generated
@ -196,17 +196,6 @@ dependencies = [
|
|||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "cylon"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8d4e899a624b708589dea1b6396de3ef38bf3843c824a89190b0fa82ae6e7fd3"
|
|
||||||
dependencies = [
|
|
||||||
"futures-util",
|
|
||||||
"serde",
|
|
||||||
"serde_derive",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "derive_more"
|
name = "derive_more"
|
||||||
version = "0.99.17"
|
version = "0.99.17"
|
||||||
@ -358,17 +347,6 @@ version = "0.3.21"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3"
|
checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "futures-macro"
|
|
||||||
version = "0.3.21"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "futures-sink"
|
name = "futures-sink"
|
||||||
version = "0.3.21"
|
version = "0.3.21"
|
||||||
@ -388,11 +366,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a"
|
checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures-core",
|
"futures-core",
|
||||||
"futures-macro",
|
|
||||||
"futures-task",
|
"futures-task",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"pin-utils",
|
"pin-utils",
|
||||||
"slab",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1053,7 +1029,6 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"adblock",
|
"adblock",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"cylon",
|
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"feed-rs",
|
"feed-rs",
|
||||||
"gemini-fetch",
|
"gemini-fetch",
|
||||||
@ -1062,6 +1037,7 @@ dependencies = [
|
|||||||
"log",
|
"log",
|
||||||
"quickpeep_moz_readability",
|
"quickpeep_moz_readability",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
"serde",
|
||||||
"sitemap",
|
"sitemap",
|
||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
66
docs/concepts.md
Normal file
66
docs/concepts.md
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
QuickPeep Concepts
|
||||||
|
==================
|
||||||
|
|
||||||
|
Principles
|
||||||
|
----------
|
||||||
|
|
||||||
|
1. Focus on good-quality, interesting, personal content rather than completeness
|
||||||
|
for every search query.
|
||||||
|
2. Support running a search engine on modest hardware.
|
||||||
|
Critically, disk space is likely to be constrained in real-world deployments.
|
||||||
|
|
||||||
|
|
||||||
|
Components and Subcomponents
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
### On-disk Structures
|
||||||
|
|
||||||
|
Schedule:
|
||||||
|
- List of URLs to rake
|
||||||
|
- Backoffs for failing hosts
|
||||||
|
|
||||||
|
RakePack:
|
||||||
|
- Contains summarised results of scraping many pages
|
||||||
|
- In a streamable, dense, memory-mappable format.
|
||||||
|
- Perhaps use `rkyv` to store the records.
|
||||||
|
|
||||||
|
Index:
|
||||||
|
- Searchable index of all documents
|
||||||
|
- Might be distributable as deltas or something, not sure — to be decided.
|
||||||
|
- Might be sharded by different parameters (e.g. tags) — specifics to be decided.
|
||||||
|
- Might be sharded by date of raking — specifics to be decided.
|
||||||
|
Not sure how to best manage an ever-growing dataset.
|
||||||
|
|
||||||
|
### Programs
|
||||||
|
|
||||||
|
#### Importer
|
||||||
|
|
||||||
|
Imports URLs from seed files. Needed to bootstrap the entire engine.
|
||||||
|
|
||||||
|
|
||||||
|
#### Raker
|
||||||
|
|
||||||
|
Rakes a page, feed or sitemap.
|
||||||
|
Builds robots.txt file caches as necessary.
|
||||||
|
|
||||||
|
Generates a summarised version of the page.
|
||||||
|
Also tries to extract readable content, for higher ranking in the index.
|
||||||
|
|
||||||
|
Also analyses pages for pop-ups and other issues.
|
||||||
|
(Unsure if we should do the analysis for e.g. Cloudflare at this stage or not?)
|
||||||
|
|
||||||
|
#### Indexer
|
||||||
|
|
||||||
|
Imports RakePacks and indexes them for searchability.
|
||||||
|
|
||||||
|
Also maintains a graph database of all cross-page links.
|
||||||
|
We can use this to perform ranking...?
|
||||||
|
|
||||||
|
??? TODO pagerank ???
|
||||||
|
|
||||||
|
|
||||||
|
#### Searcher
|
||||||
|
|
||||||
|
Provides a front-end for searching in the index.
|
||||||
|
Could provide an API. (Maybe we can integrate into Searx and get the best of both?)
|
||||||
|
|
@ -16,6 +16,8 @@ quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
|||||||
kuchiki = "0.8.1"
|
kuchiki = "0.8.1"
|
||||||
html5ever = "0.25.1"
|
html5ever = "0.25.1"
|
||||||
|
|
||||||
|
serde = { version = "1.0.136", features = ["derive"] }
|
||||||
|
|
||||||
|
|
||||||
# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
|
# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
|
||||||
# rkyv = "0.7.35"
|
# rkyv = "0.7.35"
|
||||||
@ -28,7 +30,7 @@ reqwest = { version = "0.11.9", features = [] }
|
|||||||
# N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those.
|
# N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those.
|
||||||
gemini-fetch = "0.2.1"
|
gemini-fetch = "0.2.1"
|
||||||
# Robots.txt
|
# Robots.txt
|
||||||
cylon = { version = "0.2.0", features = [] }
|
# TODO cylon = { version = "0.2.0", features = [] }
|
||||||
# RSS/Atom/JSON feeds
|
# RSS/Atom/JSON feeds
|
||||||
feed-rs = "1.0.0"
|
feed-rs = "1.0.0"
|
||||||
# Sitemaps
|
# Sitemaps
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use quickpeep::raking::rake;
|
use quickpeep::raking::rake;
|
||||||
|
use quickpeep::raking::RakeIntent;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
@ -8,12 +9,14 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
// TODO max timeout, max body size
|
// TODO max timeout, max body size
|
||||||
rake(
|
rake(
|
||||||
&Url::from_str("http://nothings.org/gamedev/ssao/")?,
|
&Url::from_str("http://nothings.org/gamedev/ssao/")?,
|
||||||
|
RakeIntent::Page,
|
||||||
&client,
|
&client,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
rake(
|
rake(
|
||||||
&Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
|
&Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
|
||||||
|
RakeIntent::Page,
|
||||||
&client,
|
&client,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
@ -1 +1,12 @@
|
|||||||
pub mod raking;
|
pub mod raking;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
pub fn test_sitemap() {
|
||||||
|
let mut curs = std::io::Cursor::new("<url><loc>https://lol</loc></url>");
|
||||||
|
let reader = sitemap::reader::SiteMapReader::new(curs);
|
||||||
|
for entry in reader {
|
||||||
|
eprintln!("{:?}", entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use reqwest::header::HeaderValue;
|
use reqwest::header::HeaderValue;
|
||||||
use reqwest::{Client, Url};
|
use reqwest::{Client, Url};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
pub enum RakeOutcome {
|
pub enum RakeOutcome {
|
||||||
RakedPage(RakedPage),
|
RakedPage(RakedPage),
|
||||||
@ -25,24 +26,55 @@ pub struct PermanentFailure {
|
|||||||
pub enum PermanentFailureReason {
|
pub enum PermanentFailureReason {
|
||||||
ResourceDenied(u32),
|
ResourceDenied(u32),
|
||||||
WrongLanguage(String),
|
WrongLanguage(String),
|
||||||
|
UnknownContentType(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn rake(url: &Url, client: &Client) -> anyhow::Result<()> {
|
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||||
|
pub enum RakeIntent {
|
||||||
|
Any,
|
||||||
|
Page,
|
||||||
|
Feed,
|
||||||
|
SiteMap,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> {
|
||||||
let response = client.get(url.clone()).send().await?;
|
let response = client.get(url.clone()).send().await?;
|
||||||
|
|
||||||
if !response.status().is_success() {
|
if !response.status().is_success() {
|
||||||
bail!("Not successful: {:?}", response.status().as_u16());
|
bail!("Not successful: {:?}", response.status().as_u16());
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(content_type) = response.headers().get("content-type") {
|
let content_type = if let Some(content_type) = response.headers().get("content-type") {
|
||||||
let content_type = content_type
|
let content_type = content_type
|
||||||
.to_str()
|
.to_str()
|
||||||
.context("Can't convert content-type to str")?;
|
.context("Can't convert content-type to str")?;
|
||||||
eprintln!("CT {:?}", content_type);
|
eprintln!("CT {:?}", content_type);
|
||||||
}
|
content_type.to_owned()
|
||||||
|
} else {
|
||||||
|
// TODO ???
|
||||||
|
"text/html".to_owned()
|
||||||
|
};
|
||||||
|
|
||||||
let content = response.bytes().await?;
|
let content = response.bytes().await?;
|
||||||
let content_str = std::str::from_utf8(&content)?;
|
|
||||||
|
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {}
|
||||||
|
|
||||||
|
// TODO JSON Feeds.
|
||||||
|
if content_type == "application/xml"
|
||||||
|
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
|
||||||
|
{}
|
||||||
|
|
||||||
|
if content_type == "application/xml"
|
||||||
|
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
|
||||||
|
{}
|
||||||
|
|
||||||
|
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
||||||
|
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
|
||||||
|
let content_str = std::str::from_utf8(content)?;
|
||||||
|
|
||||||
let mut readability = quickpeep_moz_readability::Readability::new(content_str);
|
let mut readability = quickpeep_moz_readability::Readability::new(content_str);
|
||||||
readability
|
readability
|
||||||
@ -57,3 +89,13 @@ pub async fn rake(url: &Url, client: &Client) -> anyhow::Result<()> {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> {
|
||||||
|
let x = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<()> {
|
||||||
|
//let x = sitemap::
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user