First step towards minimum usability
This commit is contained in:
parent
4cd259d0ac
commit
db5524eb52
26
Cargo.lock
generated
26
Cargo.lock
generated
@ -196,17 +196,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cylon"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d4e899a624b708589dea1b6396de3ef38bf3843c824a89190b0fa82ae6e7fd3"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.17"
|
||||
@ -358,17 +347,6 @@ version = "0.3.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3"
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.21"
|
||||
@ -388,11 +366,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-macro",
|
||||
"futures-task",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1053,7 +1029,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"adblock",
|
||||
"anyhow",
|
||||
"cylon",
|
||||
"env_logger",
|
||||
"feed-rs",
|
||||
"gemini-fetch",
|
||||
@ -1062,6 +1037,7 @@ dependencies = [
|
||||
"log",
|
||||
"quickpeep_moz_readability",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"sitemap",
|
||||
"tokio",
|
||||
]
|
||||
|
66
docs/concepts.md
Normal file
66
docs/concepts.md
Normal file
@ -0,0 +1,66 @@
|
||||
QuickPeep Concepts
|
||||
==================
|
||||
|
||||
Principles
|
||||
----------
|
||||
|
||||
1. Focus on good-quality, interesting, personal content rather than completeness
|
||||
for every search query.
|
||||
2. Support running a search engine on modest hardware.
|
||||
Critically, disk space is likely to be constrained in real-world deployments.
|
||||
|
||||
|
||||
Components and Subcomponents
|
||||
----------------------------
|
||||
|
||||
### On-disk Structures
|
||||
|
||||
Schedule:
|
||||
- List of URLs to rake
|
||||
- Backoffs for failing hosts
|
||||
|
||||
RakePack:
|
||||
- Contains summarised results of scraping many pages
|
||||
- In a streamable, dense memory-mappable format.
|
||||
- Perhaps use `rkyv` to store the records.
|
||||
|
||||
Index:
|
||||
- Searchable index of all documents
|
||||
- Might be distributable as deltas or something, not sure — to be decided.
|
||||
- Might be sharded by different parameters (e.g. tags) — specifics to be decided.
|
||||
- Might be sharded by date of raking — specifics to be decided.
|
||||
Not sure how to best manage an ever-growing dataset.
|
||||
|
||||
### Programs
|
||||
|
||||
#### Importer
|
||||
|
||||
Imports URLs from seed files. Needed to bootstrap the entire engine.
|
||||
|
||||
|
||||
#### Raker
|
||||
|
||||
Rakes a page, feed or sitemap.
|
||||
Builds robots.txt file caches as necessary.
|
||||
|
||||
Generates a summarised version of the page.
|
||||
Also tries to extract readable content, for higher ranking in the index.
|
||||
|
||||
Also analyses pages for pop-ups and other issues.
|
||||
(Unsure whether we should do the analysis for e.g. Cloudflare at this stage or not.)
|
||||
|
||||
#### Indexer
|
||||
|
||||
Imports RakePacks and indexes them for searchability.
|
||||
|
||||
Also maintains a graph database of all cross-page links.
|
||||
We can use this to perform ranking...?
|
||||
|
||||
??? TODO pagerank ???
|
||||
|
||||
|
||||
#### Searcher
|
||||
|
||||
Provides a front-end for searching in the index.
|
||||
Could provide an API. (Maybe we can integrate into Searx and get the best of both?)
|
||||
|
@ -16,6 +16,8 @@ quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
||||
kuchiki = "0.8.1"
|
||||
html5ever = "0.25.1"
|
||||
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
|
||||
|
||||
# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
|
||||
# rkyv = "0.7.35"
|
||||
@ -28,7 +30,7 @@ reqwest = { version = "0.11.9", features = [] }
|
||||
# N.B. TODO gemfeeds are Atom feeds for Gemini. Should support those.
|
||||
gemini-fetch = "0.2.1"
|
||||
# Robots.txt
|
||||
cylon = { version = "0.2.0", features = [] }
|
||||
# TODO cylon = { version = "0.2.0", features = [] }
|
||||
# RSS/Atom/JSON feeds
|
||||
feed-rs = "1.0.0"
|
||||
# Sitemaps
|
||||
|
@ -1,4 +1,5 @@
|
||||
use quickpeep::raking::rake;
|
||||
use quickpeep::raking::RakeIntent;
|
||||
use reqwest::Url;
|
||||
use std::str::FromStr;
|
||||
|
||||
@ -8,12 +9,14 @@ pub async fn main() -> anyhow::Result<()> {
|
||||
// TODO max timeout, max body size
|
||||
rake(
|
||||
&Url::from_str("http://nothings.org/gamedev/ssao/")?,
|
||||
RakeIntent::Page,
|
||||
&client,
|
||||
)
|
||||
.await?;
|
||||
|
||||
rake(
|
||||
&Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
|
||||
RakeIntent::Page,
|
||||
&client,
|
||||
)
|
||||
.await?;
|
||||
|
@ -1 +1,12 @@
|
||||
pub mod raking;
|
||||
|
||||
#[cfg(test)]
mod test {
    /// Smoke test for the `sitemap` crate: feeds a minimal `<url>` fragment through
    /// `SiteMapReader` and prints every entity the reader yields.
    ///
    /// Nothing is asserted yet. Because the test harness captures the output of
    /// passing tests, run `cargo test -- --nocapture` to see what was printed.
    #[test]
    pub fn test_sitemap() {
        // Wrap the in-memory string in a `Cursor` so it can be handed to the reader.
        let curs = std::io::Cursor::new("<url><loc>https://lol</loc></url>");
        let reader = sitemap::reader::SiteMapReader::new(curs);
        for entry in reader {
            eprintln!("{:?}", entry);
        }
    }
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
use anyhow::{bail, Context};
|
||||
use reqwest::header::HeaderValue;
|
||||
use reqwest::{Client, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub enum RakeOutcome {
|
||||
RakedPage(RakedPage),
|
||||
@ -25,24 +26,55 @@ pub struct PermanentFailure {
|
||||
pub enum PermanentFailureReason {
|
||||
ResourceDenied(u32),
|
||||
WrongLanguage(String),
|
||||
UnknownContentType(String),
|
||||
}
|
||||
|
||||
pub async fn rake(url: &Url, client: &Client) -> anyhow::Result<()> {
|
||||
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum RakeIntent {
|
||||
Any,
|
||||
Page,
|
||||
Feed,
|
||||
SiteMap,
|
||||
}
|
||||
|
||||
pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> {
|
||||
let response = client.get(url.clone()).send().await?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
bail!("Not successful: {:?}", response.status().as_u16());
|
||||
}
|
||||
|
||||
if let Some(content_type) = response.headers().get("content-type") {
|
||||
let content_type = if let Some(content_type) = response.headers().get("content-type") {
|
||||
let content_type = content_type
|
||||
.to_str()
|
||||
.context("Can't convert content-type to str")?;
|
||||
eprintln!("CT {:?}", content_type);
|
||||
}
|
||||
content_type.to_owned()
|
||||
} else {
|
||||
// TODO ???
|
||||
"text/html".to_owned()
|
||||
};
|
||||
|
||||
let content = response.bytes().await?;
|
||||
let content_str = std::str::from_utf8(&content)?;
|
||||
|
||||
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {}
|
||||
|
||||
// TODO JSON Feeds.
|
||||
if content_type == "application/xml"
|
||||
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
|
||||
{}
|
||||
|
||||
if content_type == "application/xml"
|
||||
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
|
||||
{}
|
||||
|
||||
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
||||
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
|
||||
}));
|
||||
}
|
||||
|
||||
pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<()> {
|
||||
let content_str = std::str::from_utf8(content)?;
|
||||
|
||||
let mut readability = quickpeep_moz_readability::Readability::new(content_str);
|
||||
readability
|
||||
@ -57,3 +89,13 @@ pub async fn rake(url: &Url, client: &Client) -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<()> {
|
||||
let x = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
|
||||
todo!()
|
||||
}
|
||||
|
||||
pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<()> {
|
||||
//let x = sitemap::
|
||||
todo!()
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user