Add seed file parser

This commit is contained in:
Olivier 'reivilibre' 2022-03-20 20:29:32 +00:00
parent 5e61386a83
commit 39aa4eb9b7
6 changed files with 312 additions and 5 deletions

138
Cargo.lock generated
View File

@ -309,13 +309,34 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "block-buffer"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b"
dependencies = [
"block-padding",
"byte-tools",
"byteorder",
"generic-array 0.12.4",
]
[[package]] [[package]]
name = "block-buffer" name = "block-buffer"
version = "0.9.0" version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
dependencies = [ dependencies = [
"generic-array", "generic-array 0.14.5",
]
[[package]]
name = "block-padding"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5"
dependencies = [
"byte-tools",
] ]
[[package]] [[package]]
@ -324,6 +345,12 @@ version = "3.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899"
[[package]]
name = "byte-tools"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7"
[[package]] [[package]]
name = "byteorder" name = "byteorder"
version = "1.4.3" version = "1.4.3"
@ -682,13 +709,22 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "digest"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5"
dependencies = [
"generic-array 0.12.4",
]
[[package]] [[package]]
name = "digest" name = "digest"
version = "0.9.0" version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
dependencies = [ dependencies = [
"generic-array", "generic-array 0.14.5",
] ]
[[package]] [[package]]
@ -750,6 +786,12 @@ dependencies = [
"termcolor", "termcolor",
] ]
[[package]]
name = "fake-simd"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "1.7.0" version = "1.7.0"
@ -963,6 +1005,15 @@ dependencies = [
"x509-signature", "x509-signature",
] ]
[[package]]
name = "generic-array"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd"
dependencies = [
"typenum",
]
[[package]] [[package]]
name = "generic-array" name = "generic-array"
version = "0.14.5" version = "0.14.5"
@ -2511,6 +2562,12 @@ version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9"
[[package]]
name = "opaque-debug"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c"
[[package]] [[package]]
name = "opaque-debug" name = "opaque-debug"
version = "0.3.0" version = "0.3.0"
@ -2649,6 +2706,49 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
[[package]]
name = "pest"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
dependencies = [
"ucd-trie",
]
[[package]]
name = "pest_derive"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0"
dependencies = [
"pest",
"pest_generator",
]
[[package]]
name = "pest_generator"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55"
dependencies = [
"pest",
"pest_meta",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "pest_meta"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d"
dependencies = [
"maplit",
"pest",
"sha-1",
]
[[package]] [[package]]
name = "phf" name = "phf"
version = "0.8.0" version = "0.8.0"
@ -2946,6 +3046,16 @@ dependencies = [
"toml", "toml",
] ]
[[package]]
name = "quickpeep_seed_parser"
version = "0.1.0"
dependencies = [
"anyhow",
"pest",
"pest_derive",
"smartstring",
]
[[package]] [[package]]
name = "quickpeep_structs" name = "quickpeep_structs"
version = "0.1.0" version = "0.1.0"
@ -3404,17 +3514,29 @@ dependencies = [
"stable_deref_trait", "stable_deref_trait",
] ]
[[package]]
name = "sha-1"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df"
dependencies = [
"block-buffer 0.7.3",
"digest 0.8.1",
"fake-simd",
"opaque-debug 0.2.3",
]
[[package]] [[package]]
name = "sha2" name = "sha2"
version = "0.9.9" version = "0.9.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800"
dependencies = [ dependencies = [
"block-buffer", "block-buffer 0.9.0",
"cfg-if", "cfg-if",
"cpufeatures", "cpufeatures",
"digest", "digest 0.9.0",
"opaque-debug", "opaque-debug 0.3.0",
] ]
[[package]] [[package]]
@ -3996,6 +4118,12 @@ version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
[[package]]
name = "ucd-trie"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
[[package]] [[package]]
name = "unchecked-index" name = "unchecked-index"
version = "0.2.2" version = "0.2.2"

View File

@ -5,6 +5,7 @@ members = [
"quickpeep_raker", "quickpeep_raker",
"quickpeep_densedoc", "quickpeep_densedoc",
"quickpeep_moz_readability", "quickpeep_moz_readability",
"quickpeep_seed_parser",
"quickpeep_structs", "quickpeep_structs",
"quickpeep_utils" "quickpeep_utils"
] ]

View File

@ -0,0 +1,12 @@
[package]
name = "quickpeep_seed_parser"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
pest = "2.1.3"
pest_derive = "2.1.0"
smartstring = "1.0.0"
anyhow = "1.0.56"

View File

@ -0,0 +1,80 @@
extern crate pest;
#[macro_use]
extern crate pest_derive;
use anyhow::{bail, ensure, Context};
use smartstring::alias::CompactString;
#[cfg(test)]
mod test;
/// Parser for the seed-file format.
///
/// The implementation is generated by `pest_derive` from the grammar in
/// `seed.pest`; `parse_seeds` drives it starting from `Rule::main`.
#[derive(Parser)]
#[grammar = "seed.pest"]
struct SeedParser;
use pest::Parser;
/// One block of a seed file: a header of tags followed by the seeds listed
/// beneath it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SeedBlock {
    /// Tags named in the block's header line.
    pub tags: Vec<CompactString>,
    /// Seed entries listed under the header, in file order.
    pub seeds: Vec<Seed>,
}
/// A single seed entry: a URL plus any tags attached to that entry alone.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Seed {
    /// The URL text exactly as matched by the grammar's `url` rule.
    pub url: String,
    /// Tags given in square brackets after the URL; empty when none were given.
    pub extra_tags: Vec<CompactString>,
}
pub fn parse_seeds(input: &str) -> anyhow::Result<Vec<SeedBlock>> {
use pest::iterators::Pair;
pub fn parse_tag(pair: Pair<Rule>) -> anyhow::Result<CompactString> {
ensure!(matches!(pair.as_rule(), Rule::tagName));
Ok(CompactString::from(pair.as_str()))
}
pub fn parse_url(pair: Pair<Rule>) -> anyhow::Result<String> {
ensure!(matches!(pair.as_rule(), Rule::url));
Ok(String::from(pair.as_str()))
}
pub fn parse_seedblock_header(pair: Pair<Rule>) -> anyhow::Result<Vec<CompactString>> {
pair.into_inner().map(parse_tag).collect()
}
pub fn parse_seed(pair: Pair<Rule>) -> anyhow::Result<Seed> {
let mut children = pair.into_inner();
let url = parse_url(children.next().context("Expecting URL")?)?;
let extra_tags: anyhow::Result<Vec<CompactString>> = children.map(parse_tag).collect();
let extra_tags = extra_tags?;
Ok(Seed { url, extra_tags })
}
pub fn parse_seedblock(pair: Pair<Rule>) -> anyhow::Result<SeedBlock> {
match pair.as_rule() {
Rule::seedblock => {
let mut children = pair.into_inner();
let header = children.next().context("No seedblock header")?;
let tags = parse_seedblock_header(header)?;
let urls: anyhow::Result<Vec<Seed>> = children.map(parse_seed).collect();
let seeds = urls?;
Ok(SeedBlock { tags, seeds })
}
other => {
bail!("Looking for seedblock; unexpected {:#?}", other);
}
}
}
let pairs = SeedParser::parse(Rule::main, input)?;
pairs
.into_iter()
.filter(|pair| pair.as_rule() != Rule::EOI)
.map(parse_seedblock)
.collect()
}

View File

@ -0,0 +1,20 @@
// Grammar for seed files.
//
// A file is a sequence of seed blocks. Each block is a header line of
// comma-separated tags ending in ":", followed by one or more URL lines.
// A URL line may carry extra tags in square brackets after the URL.
// Lines starting with "#" are comments.

main = _{ SOI ~ (WHITESPACE | newline)* ~ seedblock* ~ (WHITESPACE | newline)* ~ EOI }

// The negative lookahead keeps a URL line from being read as a header.
seedheader = { !urlPrefix ~ tagName ~ ("," ~ tagName)* ~ ":" }
seedblock = { seedheader ~ newline+ ~ urlEntry+ }

// Compound-atomic: no implicit whitespace/comment skipping inside an entry,
// so all spacing is spelled out explicitly. The entry ends at one or more
// newlines, or at end of input so that a final line without a trailing
// newline is still accepted (`&EOI` is a lookahead and emits no pair).
urlEntry = ${ url ~ (WHITESPACE+ ~ oneUrlTags)? ~ WHITESPACE* ~ (newline+ | &EOI) }
urlPrefix = _{ "http://" | "https://" | "gemini://" }
url = @{ urlPrefix ~ nonwhitespace+ }
tagName = @{ ALPHABETIC+ }

// Only reached from the atomic `urlEntry`, so optional spacing around tags
// and commas is listed explicitly; this lets "[A, B]" parse just like the
// "A, B:" form accepted in headers.
oneUrlTags = _{ "[" ~ WHITESPACE* ~ tagName ~ (WHITESPACE* ~ "," ~ WHITESPACE* ~ tagName)* ~ WHITESPACE* ~ "]" }

// A comment runs to the end of its line; the last line of the file may
// lack a terminating newline.
COMMENT = _{ "#" ~ (!newline ~ ANY)* ~ (newline | &EOI) }
WHITESPACE = _{ " " | "\t" }
nonwhitespace = @{ !(" " | "\t" | "\n") ~ ANY }
newline = _{ "\n" }

View File

@ -0,0 +1,66 @@
use crate::{parse_seeds, Seed, SeedBlock};
/// A single block with two header tags and one untagged URL.
#[test]
fn test_simple_seeds() {
    let input = r#"
Blog, Example:
https://example.org
"#;

    let expected = vec![SeedBlock {
        tags: vec!["Blog".into(), "Example".into()],
        seeds: vec![Seed {
            url: String::from("https://example.org"),
            extra_tags: Vec::new(),
        }],
    }];

    let parsed = parse_seeds(input).unwrap();
    assert_eq!(parsed, expected);
}
#[test]
fn test_complicated_seeds() {
let parsed = parse_seeds(
r#"
Blog, Example:
https://example.org
# This demonstrates fancy extra tags
https://example.com [Commercial]
# This demonstrates another seed block
Software:
https://www2.example.org
"#,
)
.unwrap();
assert_eq!(
parsed,
vec![
SeedBlock {
tags: vec!["Blog".into(), "Example".into()],
seeds: vec![
Seed {
url: "https://example.org".to_string(),
extra_tags: vec![]
},
Seed {
url: "https://example.com".to_string(),
extra_tags: vec!["Commercial".into()]
}
]
},
SeedBlock {
tags: vec!["Software".into()],
seeds: vec![Seed {
url: "https://www2.example.org".to_string(),
extra_tags: vec![]
}]
}
]
);
}