diff --git a/Cargo.lock b/Cargo.lock index f269262..87eb6d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -309,13 +309,34 @@ dependencies = [ "serde", ] +[[package]] +name = "block-buffer" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" +dependencies = [ + "block-padding", + "byte-tools", + "byteorder", + "generic-array 0.12.4", +] + [[package]] name = "block-buffer" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" dependencies = [ - "generic-array", + "generic-array 0.14.5", +] + +[[package]] +name = "block-padding" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" +dependencies = [ + "byte-tools", ] [[package]] @@ -324,6 +345,12 @@ version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" +[[package]] +name = "byte-tools" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" + [[package]] name = "byteorder" version = "1.4.3" @@ -682,13 +709,22 @@ dependencies = [ "syn", ] +[[package]] +name = "digest" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" +dependencies = [ + "generic-array 0.12.4", +] + [[package]] name = "digest" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" dependencies = [ - "generic-array", + "generic-array 0.14.5", ] [[package]] @@ -750,6 +786,12 @@ dependencies = [ "termcolor", ] +[[package]] +name = "fake-simd" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" + [[package]] name = "fastrand" version = "1.7.0" @@ -963,6 +1005,15 @@ dependencies = [ "x509-signature", ] +[[package]] +name = "generic-array" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd" +dependencies = [ + "typenum", +] + [[package]] name = "generic-array" version = "0.14.5" @@ -2511,6 +2562,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" +[[package]] +name = "opaque-debug" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -2649,6 +2706,49 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" +[[package]] +name = "pest" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" +dependencies = [ + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pest_meta" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" +dependencies = [ + "maplit", + "pest", + "sha-1", +] + [[package]] name = "phf" version = "0.8.0" @@ -2946,6 +3046,16 @@ dependencies = [ "toml", ] +[[package]] +name = "quickpeep_seed_parser" +version = "0.1.0" +dependencies = [ + "anyhow", + "pest", + "pest_derive", + "smartstring", +] + [[package]] name = "quickpeep_structs" version = "0.1.0" @@ -3404,17 +3514,29 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "sha-1" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" +dependencies = [ + "block-buffer 0.7.3", + "digest 0.8.1", + "fake-simd", + "opaque-debug 0.2.3", +] + [[package]] name = "sha2" version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" dependencies = [ - "block-buffer", + "block-buffer 0.9.0", "cfg-if", "cpufeatures", - "digest", - "opaque-debug", + "digest 0.9.0", + "opaque-debug 0.3.0", ] [[package]] @@ -3996,6 +4118,12 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" +[[package]] +name = "ucd-trie" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" + [[package]] name = "unchecked-index" version = "0.2.2" diff --git a/Cargo.toml b/Cargo.toml index 522d085..daf2c51 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "quickpeep_raker", "quickpeep_densedoc", "quickpeep_moz_readability", + "quickpeep_seed_parser", "quickpeep_structs", "quickpeep_utils" ] diff --git a/quickpeep_seed_parser/Cargo.toml b/quickpeep_seed_parser/Cargo.toml new file mode 100644 index 0000000..4f5a9da --- /dev/null +++ b/quickpeep_seed_parser/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "quickpeep_seed_parser" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +pest = "2.1.3" +pest_derive = "2.1.0" +smartstring = "1.0.0" +anyhow = "1.0.56" diff --git a/quickpeep_seed_parser/src/lib.rs b/quickpeep_seed_parser/src/lib.rs new file mode 100644 index 0000000..a37602e --- /dev/null +++ b/quickpeep_seed_parser/src/lib.rs @@ -0,0 +1,80 @@ +extern crate pest; +#[macro_use] +extern crate pest_derive; + +use anyhow::{bail, ensure, Context}; +use smartstring::alias::CompactString; + +#[cfg(test)] +mod test; + +#[derive(Parser)] +#[grammar = "seed.pest"] +struct SeedParser; + +use pest::Parser; + +#[derive(Eq, PartialEq, Clone, Debug)] +pub struct SeedBlock { + pub tags: Vec, + pub seeds: Vec, +} + +#[derive(Eq, PartialEq, Clone, Debug)] +pub struct Seed { + pub url: String, + pub extra_tags: Vec, +} + +pub fn parse_seeds(input: &str) -> anyhow::Result> { + use pest::iterators::Pair; + + pub fn parse_tag(pair: Pair) -> anyhow::Result { + ensure!(matches!(pair.as_rule(), Rule::tagName)); + Ok(CompactString::from(pair.as_str())) + } + + pub fn parse_url(pair: Pair) -> anyhow::Result { + ensure!(matches!(pair.as_rule(), Rule::url)); + Ok(String::from(pair.as_str())) + } + + pub fn parse_seedblock_header(pair: Pair) -> anyhow::Result> { + pair.into_inner().map(parse_tag).collect() + } + + pub fn parse_seed(pair: Pair) -> anyhow::Result { + let mut children = pair.into_inner(); + let url = parse_url(children.next().context("Expecting URL")?)?; + + let extra_tags: anyhow::Result> = children.map(parse_tag).collect(); + let extra_tags = extra_tags?; + + Ok(Seed { url, extra_tags }) + } + + pub fn parse_seedblock(pair: Pair) -> anyhow::Result { + match pair.as_rule() { + Rule::seedblock => { + let mut children = pair.into_inner(); + let header = children.next().context("No seedblock header")?; + let tags = parse_seedblock_header(header)?; + + let urls: anyhow::Result> = children.map(parse_seed).collect(); + let seeds = urls?; + + Ok(SeedBlock { tags, seeds }) + } + other => { + bail!("Looking for seedblock; unexpected {:#?}", other); + } + } + } + + let pairs = SeedParser::parse(Rule::main, input)?; + pairs + .into_iter() + .filter(|pair| pair.as_rule() != Rule::EOI) + .map(parse_seedblock) + .collect() +} diff --git a/quickpeep_seed_parser/src/seed.pest b/quickpeep_seed_parser/src/seed.pest new file mode 100644 index 0000000..836efaf --- /dev/null +++ b/quickpeep_seed_parser/src/seed.pest @@ -0,0 +1,20 @@ +main = _{ SOI ~ (WHITESPACE | newline)* ~ seedblock* ~ (WHITESPACE | newline)* ~ EOI } + +seedheader = { !urlPrefix ~ tagName ~ ("," ~ tagName)* ~ ":" } +seedblock = { seedheader ~ newline+ ~ urlEntry+ } + +urlEntry = ${ url ~ (WHITESPACE+ ~ oneUrlTags)? ~ WHITESPACE* ~ newline+ } +urlPrefix = _{ "http://" | "https://" | "gemini://" } +url = @{ urlPrefix ~ nonwhitespace+ } + +tagName = @{ ALPHABETIC+ } + +oneUrlTags = _{ "[" ~ tagName ~ ("," ~ tagName)* ~ "]" } + + +COMMENT = _{ "#" ~ (!newline ~ ANY)* ~ newline } + +WHITESPACE = _{ " " | "\t" } +nonwhitespace = @{ !(" " | "\t" | "\n") ~ ANY } + +newline = _{ "\n" } \ No newline at end of file diff --git a/quickpeep_seed_parser/src/test.rs b/quickpeep_seed_parser/src/test.rs new file mode 100644 index 0000000..70a11c2 --- /dev/null +++ b/quickpeep_seed_parser/src/test.rs @@ -0,0 +1,66 @@ +use crate::{parse_seeds, Seed, SeedBlock}; + +#[test] +fn test_simple_seeds() { + let parsed = parse_seeds( + r#" +Blog, Example: +https://example.org + "#, + ) + .unwrap(); + + assert_eq!( + parsed, + vec![SeedBlock { + tags: vec!["Blog".into(), "Example".into()], + seeds: vec![Seed { + url: "https://example.org".to_string(), + extra_tags: vec![] + }] + }] + ); +} + +#[test] +fn test_complicated_seeds() { + let parsed = parse_seeds( + r#" +Blog, Example: +https://example.org +# This demonstrates fancy extra tags +https://example.com [Commercial] + +# This demonstrates another seed block +Software: +https://www2.example.org + "#, + ) + .unwrap(); + + assert_eq!( + parsed, + vec![ + SeedBlock { + tags: vec!["Blog".into(), "Example".into()], + seeds: vec![ + Seed { + url: "https://example.org".to_string(), + extra_tags: vec![] + }, + Seed { + url: "https://example.com".to_string(), + extra_tags: vec!["Commercial".into()] + } + ] + }, + SeedBlock { + tags: vec!["Software".into()], + seeds: vec![Seed { + url: "https://www2.example.org".to_string(), + extra_tags: vec![] + }] + } + ] + ); +}