Sort through TODO items
This commit is contained in:
parent
120702ce0e
commit
6d109632a3
|
@ -272,7 +272,7 @@ impl DenseTreeBuilder {
|
||||||
self.preceding_newlines = 0;
|
self.preceding_newlines = 0;
|
||||||
}
|
}
|
||||||
"img" => {
|
"img" => {
|
||||||
// TODO Decide if this is worth the space...
|
// TODO(future) Decide if this is worth the space...
|
||||||
let attrs = element.attributes.borrow();
|
let attrs = element.attributes.borrow();
|
||||||
let src = attrs.get("src").unwrap_or("").to_owned();
|
let src = attrs.get("src").unwrap_or("").to_owned();
|
||||||
|
|
||||||
|
|
|
@ -26,17 +26,17 @@ const FLAG_WEIGHT_CLASSES: u32 = 0x2;
|
||||||
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
|
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
|
||||||
const READABILITY_SCORE: &'static str = "readability-score";
|
const READABILITY_SCORE: &'static str = "readability-score";
|
||||||
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||||
// TODO: Change to HashSet
|
// TODO(perf): Change to HashSet
|
||||||
const PHRASING_ELEMS: [&str; 39] = [
|
const PHRASING_ELEMS: [&str; 39] = [
|
||||||
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
|
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
|
||||||
"embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
|
"embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
|
||||||
"output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
|
"output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
|
||||||
"sub", "sup", "textarea", "time", "var", "wbr",
|
"sub", "sup", "textarea", "time", "var", "wbr",
|
||||||
];
|
];
|
||||||
// TODO: Change to HashSet
|
// TODO(perf): Change to HashSet
|
||||||
const DEFAULT_TAGS_TO_SCORE: [&str; 9] =
|
const DEFAULT_TAGS_TO_SCORE: [&str; 9] =
|
||||||
["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"];
|
["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"];
|
||||||
// TODO: Change to HashSet
|
// TODO(perf): Change to HashSet
|
||||||
const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"];
|
const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"];
|
||||||
const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
|
const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
|
||||||
"align",
|
"align",
|
||||||
|
@ -54,7 +54,7 @@ const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
|
||||||
];
|
];
|
||||||
|
|
||||||
const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];
|
const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];
|
||||||
// TODO: Change to HashSet
|
// TODO(perf): Change to HashSet
|
||||||
const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"];
|
const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"];
|
||||||
|
|
||||||
pub mod regexes;
|
pub mod regexes;
|
||||||
|
@ -168,7 +168,7 @@ impl Readability {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
|
if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
|
||||||
// TODO: Fix this to have a better way of extracting nodes that are elements
|
// TODO(style): Fix this to have a better way of extracting nodes that are elements
|
||||||
while prev_elem.as_element().is_none() {
|
while prev_elem.as_element().is_none() {
|
||||||
match prev_elem.previous_sibling() {
|
match prev_elem.previous_sibling() {
|
||||||
Some(new_prev) => prev_elem = new_prev,
|
Some(new_prev) => prev_elem = new_prev,
|
||||||
|
@ -389,7 +389,7 @@ impl Readability {
|
||||||
/// The must_be_element argument ensure the next element is actually an element node.
|
/// The must_be_element argument ensure the next element is actually an element node.
|
||||||
/// This is likely to factored out into a new function.
|
/// This is likely to factored out into a new function.
|
||||||
fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
|
fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
|
||||||
// TODO: Could probably be refactored to use the elements method
|
// TODO(style): Could probably be refactored to use the elements method
|
||||||
let mut node_ref = node_ref;
|
let mut node_ref = node_ref;
|
||||||
while node_ref.is_some() {
|
while node_ref.is_some() {
|
||||||
match node_ref.as_ref().unwrap().data() {
|
match node_ref.as_ref().unwrap().data() {
|
||||||
|
@ -544,7 +544,7 @@ impl Readability {
|
||||||
/// Converts some of the common HTML entities in string to their corresponding characters.
|
/// Converts some of the common HTML entities in string to their corresponding characters.
|
||||||
fn unescape_html_entities(value: &mut String) {
|
fn unescape_html_entities(value: &mut String) {
|
||||||
if !value.is_empty() {
|
if !value.is_empty() {
|
||||||
// TODO: Extract this
|
// TODO(style): Extract this
|
||||||
let mut html_escape_map: HashMap<&str, &str> = HashMap::new();
|
let mut html_escape_map: HashMap<&str, &str> = HashMap::new();
|
||||||
html_escape_map.insert("lt", "<");
|
html_escape_map.insert("lt", "<");
|
||||||
html_escape_map.insert("gt", ">");
|
html_escape_map.insert("gt", ">");
|
||||||
|
@ -637,7 +637,7 @@ impl Readability {
|
||||||
/// Removes the class="" attribute from every element in the given subtree, except those that
|
/// Removes the class="" attribute from every element in the given subtree, except those that
|
||||||
/// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.
|
/// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.
|
||||||
fn clean_classes(&mut self) {
|
fn clean_classes(&mut self) {
|
||||||
// TODO: This should accessed from Self
|
// TODO(style): This should accessed from Self
|
||||||
let classes_to_preserve: HashSet<&str> = HashSet::new();
|
let classes_to_preserve: HashSet<&str> = HashSet::new();
|
||||||
if let Some(article_node) = &mut self.article_node {
|
if let Some(article_node) = &mut self.article_node {
|
||||||
for elem in article_node.inclusive_descendants().elements() {
|
for elem in article_node.inclusive_descendants().elements() {
|
||||||
|
@ -790,7 +790,7 @@ impl Readability {
|
||||||
/// Run any post-process modifications to article content as necessary.
|
/// Run any post-process modifications to article content as necessary.
|
||||||
fn post_process_content(&mut self, url: &str) {
|
fn post_process_content(&mut self, url: &str) {
|
||||||
self.fix_relative_uris(url);
|
self.fix_relative_uris(url);
|
||||||
// TODO: Add flag check
|
// TODO(style): Add flag check
|
||||||
self.clean_classes();
|
self.clean_classes();
|
||||||
self.clean_readability_attrs();
|
self.clean_readability_attrs();
|
||||||
}
|
}
|
||||||
|
@ -1041,7 +1041,7 @@ impl Readability {
|
||||||
|
|
||||||
/// Determine whether element has any children block level elements.
|
/// Determine whether element has any children block level elements.
|
||||||
fn has_child_block_element(node_ref: &NodeRef) -> bool {
|
fn has_child_block_element(node_ref: &NodeRef) -> bool {
|
||||||
// TODO: Refer to a static HashSet
|
// TODO(perf): Refer to a static HashSet
|
||||||
let block_level_elems: [&str; 32] = [
|
let block_level_elems: [&str; 32] = [
|
||||||
"address",
|
"address",
|
||||||
"article",
|
"article",
|
||||||
|
@ -1622,13 +1622,13 @@ impl Readability {
|
||||||
let page = self.root_node.select_first("body");
|
let page = self.root_node.select_first("body");
|
||||||
if page.is_err() {
|
if page.is_err() {
|
||||||
bail!("Document has no <body>");
|
bail!("Document has no <body>");
|
||||||
// TODO return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
|
// TODO(style) return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
|
||||||
}
|
}
|
||||||
let page = page.unwrap();
|
let page = page.unwrap();
|
||||||
let mut attempts: Vec<ExtractAttempt> = Vec::new();
|
let mut attempts: Vec<ExtractAttempt> = Vec::new();
|
||||||
|
|
||||||
// var pageCacheHtml = page.innerHTML;
|
// var pageCacheHtml = page.innerHTML;
|
||||||
//TODO: Add page cache
|
//TODO(???): Add page cache
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
||||||
|
@ -2197,7 +2197,7 @@ mod test {
|
||||||
use kuchiki::traits::*;
|
use kuchiki::traits::*;
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
|
|
||||||
// TODO: Refactor not to use test file possibly
|
// TODO(style): Refactor not to use test file possibly
|
||||||
const TEST_HTML: &'static str = include_str!("../test_html/simple.html");
|
const TEST_HTML: &'static str = include_str!("../test_html/simple.html");
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -3867,7 +3867,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_clean_classes() {
|
fn test_clean_classes() {
|
||||||
// TODO: This test will later be edited to ensure it checks to only remove certain classes
|
// TODO(task): This test will later be edited to ensure it checks to only remove certain classes
|
||||||
let html_str = r#"
|
let html_str = r#"
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<html>
|
<html>
|
||||||
|
|
|
@ -42,14 +42,14 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
let client = reqwest::ClientBuilder::new()
|
let client = reqwest::ClientBuilder::new()
|
||||||
.timeout(TIME_LIMIT)
|
.timeout(TIME_LIMIT)
|
||||||
.default_headers(header_map)
|
.default_headers(header_map)
|
||||||
// TODO We want to handle redirects ourselves so we can track them...
|
// We want to handle redirects ourselves so we can track them...
|
||||||
.redirect(Policy::none())
|
.redirect(Policy::none())
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
let mut adblock_engines = Vec::new();
|
let mut adblock_engines = Vec::new();
|
||||||
|
|
||||||
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
|
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
|
||||||
// TODO Don't hardcode these paths in quite as bad a way...
|
// TODO(not important) Don't hardcode these paths in quite as bad a way...
|
||||||
let path = PathBuf::from(format!("./data/{}.adblock", name));
|
let path = PathBuf::from(format!("./data/{}.adblock", name));
|
||||||
if !path.exists() {
|
if !path.exists() {
|
||||||
warn!("Missing adblock rules: {:?}.", path);
|
warn!("Missing adblock rules: {:?}.", path);
|
||||||
|
@ -94,16 +94,12 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
}
|
}
|
||||||
RakeOutcome::RakedFeed(feed) => {
|
RakeOutcome::RakedFeed(feed) => {
|
||||||
green_ln!("Feed");
|
green_ln!("Feed");
|
||||||
// TODO
|
|
||||||
|
|
||||||
println!();
|
println!();
|
||||||
let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry);
|
let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry);
|
||||||
print_references(&refs);
|
print_references(&refs);
|
||||||
}
|
}
|
||||||
RakeOutcome::RakedSitemap(sitemap) => {
|
RakeOutcome::RakedSitemap(sitemap) => {
|
||||||
green_ln!("Sitemap");
|
green_ln!("Sitemap");
|
||||||
// TODO
|
|
||||||
|
|
||||||
println!();
|
println!();
|
||||||
let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry);
|
let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry);
|
||||||
print_references(&refs);
|
print_references(&refs);
|
||||||
|
|
|
@ -84,7 +84,7 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
|
|
||||||
pub struct Seed {
|
pub struct Seed {
|
||||||
url: UrlOrUrlPattern,
|
url: UrlOrUrlPattern,
|
||||||
// TODO These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Either a URL or a URL prefix.
|
/// Either a URL or a URL prefix.
|
||||||
|
@ -237,6 +237,10 @@ async fn import_and_flush_batch(
|
||||||
let mut allowed_domain_record = DirtyTracker::new(
|
let mut allowed_domain_record = DirtyTracker::new(
|
||||||
allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
|
allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
|
||||||
);
|
);
|
||||||
|
if is_domain_new {
|
||||||
|
// Mark it as dirty
|
||||||
|
let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
|
||||||
|
}
|
||||||
|
|
||||||
// Register the domain. This is a no-op if it's already active or backing off.
|
// Register the domain. This is a no-op if it's already active or backing off.
|
||||||
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
|
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
|
||||||
|
|
|
@ -361,7 +361,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
|
||||||
} else {
|
} else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
|
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO(robustness) ignore failure here...?
|
||||||
|
|
||||||
let last_changed = entry.updated.or(entry.published);
|
let last_changed = entry.updated.or(entry.published);
|
||||||
|
|
||||||
|
@ -372,7 +372,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO paginated feeds (e.g. JSON Feed next_url)
|
// TODO(feature) paginated feeds (e.g. JSON Feed next_url)
|
||||||
|
|
||||||
Ok(urls)
|
Ok(urls)
|
||||||
}
|
}
|
||||||
|
|
|
@ -126,7 +126,7 @@ pub fn guess_document_language(text: &str) -> Option<String> {
|
||||||
.map(|lang: Language| lang.iso_code_639_1().to_string())
|
.map(|lang: Language| lang.iso_code_639_1().to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO this isn't particularly efficient. Probably want a trie if it's important...
|
// TODO(perf) this isn't particularly efficient. Probably want a trie if it's important...
|
||||||
pub struct IpSet {
|
pub struct IpSet {
|
||||||
ips: BTreeSet<IpNetwork>,
|
ips: BTreeSet<IpNetwork>,
|
||||||
}
|
}
|
||||||
|
|
|
@ -261,7 +261,7 @@ impl PageExtractionServiceInternal {
|
||||||
document.body_remainder = DenseTree::from_body(root_node.clone());
|
document.body_remainder = DenseTree::from_body(root_node.clone());
|
||||||
document.body_content = DenseTree::from_body(article_node);
|
document.body_content = DenseTree::from_body(article_node);
|
||||||
} else {
|
} else {
|
||||||
todo!()
|
todo!() // NEEDED
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(ExtractedPage::Success {
|
Ok(ExtractedPage::Success {
|
||||||
|
|
|
@ -94,13 +94,13 @@ impl TaskContext {
|
||||||
self.process_domain(domain).await?;
|
self.process_domain(domain).await?;
|
||||||
} else {
|
} else {
|
||||||
// Already in use; need to pick again
|
// Already in use; need to pick again
|
||||||
// TODO be smarter about this.
|
// TODO(perf) be smarter about this.
|
||||||
tokio::time::sleep(Duration::from_secs(60)).await;
|
tokio::time::sleep(Duration::from_secs(60)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
// No domains left! Woot.
|
// No domains left! Woot.
|
||||||
// TODO come up with some stopping conditions
|
// TODO(perf, feature) come up with some stopping conditions
|
||||||
tokio::time::sleep(Duration::from_secs(60)).await;
|
tokio::time::sleep(Duration::from_secs(60)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -143,8 +143,8 @@ impl TaskContext {
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO delete the domain from the store
|
// TODO(needed) delete the domain from the store
|
||||||
todo!();
|
todo!(); // NEEDED
|
||||||
|
|
||||||
txn.commit()?;
|
txn.commit()?;
|
||||||
Ok(true)
|
Ok(true)
|
||||||
|
@ -366,12 +366,13 @@ impl TaskContext {
|
||||||
Ok(NextAction::Continue)
|
Ok(NextAction::Continue)
|
||||||
}
|
}
|
||||||
RakeOutcome::TemporaryFailure(failure) => {
|
RakeOutcome::TemporaryFailure(failure) => {
|
||||||
// TODO do we want to log this somewhere?
|
// TODO(future) do we want to log this somewhere?
|
||||||
|
// or at least a metric
|
||||||
|
|
||||||
let domain = get_reduced_domain(url)?;
|
let domain = get_reduced_domain(url)?;
|
||||||
let url = url.clone();
|
let url = url.clone();
|
||||||
|
|
||||||
// TODO add 1.1× the previous backoff, if there was one.
|
// TODO(feature) add 1.1× the previous backoff, if there was one.
|
||||||
let new_backoff = failure.backoff_sec;
|
let new_backoff = failure.backoff_sec;
|
||||||
let domain = domain.into_owned();
|
let domain = domain.into_owned();
|
||||||
|
|
||||||
|
|
|
@ -178,7 +178,7 @@ impl RakerStore {
|
||||||
F: FnOnce(RakerTxn<'_, RW>) -> anyhow::Result<R> + Send + 'static,
|
F: FnOnce(RakerTxn<'_, RW>) -> anyhow::Result<R> + Send + 'static,
|
||||||
R: Send + 'static,
|
R: Send + 'static,
|
||||||
{
|
{
|
||||||
// TODO consider adding a lock here to prevent all the async executors getting stuck here...
|
// TODO(robustness) consider adding a lock here to prevent all the async executors getting stuck here...
|
||||||
let this = self.clone();
|
let this = self.clone();
|
||||||
Ok(tokio::task::spawn_blocking(move || -> anyhow::Result<R> {
|
Ok(tokio::task::spawn_blocking(move || -> anyhow::Result<R> {
|
||||||
let txn = this.rw_txn()?;
|
let txn = this.rw_txn()?;
|
||||||
|
|
|
@ -20,7 +20,7 @@ pub struct UrlVisitedRecord {
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||||
pub struct QueueUrlRecord {
|
pub struct QueueUrlRecord {
|
||||||
pub intent: RakeIntent, // TODO
|
pub intent: RakeIntent, // TODO CONSIDER
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||||
|
|
Loading…
Reference in New Issue