Sort through TODO items

rei/raker_storage
Olivier 'reivilibre' 2022-03-20 22:23:12 +00:00
parent 120702ce0e
commit 6d109632a3
10 changed files with 35 additions and 34 deletions

View File

@@ -272,7 +272,7 @@ impl DenseTreeBuilder {
self.preceding_newlines = 0;
}
"img" => {
// TODO Decide if this is worth the space...
// TODO(future) Decide if this is worth the space...
let attrs = element.attributes.borrow();
let src = attrs.get("src").unwrap_or("").to_owned();

View File

@@ -26,17 +26,17 @@ const FLAG_WEIGHT_CLASSES: u32 = 0x2;
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
const READABILITY_SCORE: &'static str = "readability-score";
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
// TODO: Change to HashSet
// TODO(perf): Change to HashSet
const PHRASING_ELEMS: [&str; 39] = [
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
"embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
"output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
"sub", "sup", "textarea", "time", "var", "wbr",
];
// TODO: Change to HashSet
// TODO(perf): Change to HashSet
const DEFAULT_TAGS_TO_SCORE: [&str; 9] =
["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"];
// TODO: Change to HashSet
// TODO(perf): Change to HashSet
const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"];
const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
"align",
@@ -54,7 +54,7 @@ const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
];
const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];
// TODO: Change to HashSet
// TODO(perf): Change to HashSet
const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"];
pub mod regexes;
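
A note on the recurring TODO(perf) markers above: these arrays are only ever used for membership tests, so a lazily built HashSet would turn each linear scan into an O(1) lookup. A minimal sketch for one of them, assuming the once_cell crate (any lazy-initialisation mechanism would do; only a few entries shown):

    use once_cell::sync::Lazy;
    use std::collections::HashSet;

    // Built once on first use; lookups no longer scan all 39 strings.
    static PHRASING_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
        ["abbr", "audio", "b", "bdo", "br", /* ...remaining entries... */ "wbr"]
            .into_iter()
            .collect()
    });

    fn is_phrasing(tag_name: &str) -> bool {
        PHRASING_ELEMS.contains(tag_name)
    }
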
@@ -168,7 +168,7 @@ impl Readability {
continue;
}
if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
// TODO: Fix this to have a better way of extracting nodes that are elements
// TODO(style): Fix this to have a better way of extracting nodes that are elements
while prev_elem.as_element().is_none() {
match prev_elem.previous_sibling() {
Some(new_prev) => prev_elem = new_prev,
@@ -389,7 +389,7 @@ impl Readability {
/// The must_be_element argument ensures the next element is actually an element node.
/// This is likely to be factored out into a new function.
fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
// TODO: Could probably be refactored to use the elements method
// TODO(style): Could probably be refactored to use the elements method
let mut node_ref = node_ref;
while node_ref.is_some() {
match node_ref.as_ref().unwrap().data() {
@@ -544,7 +544,7 @@ impl Readability {
/// Converts some of the common HTML entities in string to their corresponding characters.
fn unescape_html_entities(value: &mut String) {
if !value.is_empty() {
// TODO: Extract this
// TODO(style): Extract this
let mut html_escape_map: HashMap<&str, &str> = HashMap::new();
html_escape_map.insert("lt", "<");
html_escape_map.insert("gt", ">");
@@ -637,7 +637,7 @@ impl Readability {
/// Removes the class="" attribute from every element in the given subtree, except those that
/// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.
fn clean_classes(&mut self) {
// TODO: This should be accessed from Self
// TODO(style): This should be accessed from Self
let classes_to_preserve: HashSet<&str> = HashSet::new();
if let Some(article_node) = &mut self.article_node {
for elem in article_node.inclusive_descendants().elements() {
@@ -790,7 +790,7 @@ impl Readability {
/// Run any post-process modifications to article content as necessary.
fn post_process_content(&mut self, url: &str) {
self.fix_relative_uris(url);
// TODO: Add flag check
// TODO(style): Add flag check
self.clean_classes();
self.clean_readability_attrs();
}
@@ -1041,7 +1041,7 @@ impl Readability {
/// Determine whether the element has any child block-level elements.
fn has_child_block_element(node_ref: &NodeRef) -> bool {
// TODO: Refer to a static HashSet
// TODO(perf): Refer to a static HashSet
let block_level_elems: [&str; 32] = [
"address",
"article",
@@ -1622,13 +1622,13 @@ impl Readability {
let page = self.root_node.select_first("body");
if page.is_err() {
bail!("Document has no <body>");
// TODO return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
// TODO(style) return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
}
let page = page.unwrap();
let mut attempts: Vec<ExtractAttempt> = Vec::new();
// var pageCacheHtml = page.innerHTML;
//TODO: Add page cache
//TODO(???): Add page cache
loop {
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
@@ -2197,7 +2197,7 @@ mod test {
use kuchiki::traits::*;
use kuchiki::NodeRef;
// TODO: Refactor not to use test file possibly
// TODO(style): Refactor not to use test file possibly
const TEST_HTML: &'static str = include_str!("../test_html/simple.html");
#[test]
@@ -3867,7 +3867,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
#[test]
fn test_clean_classes() {
// TODO: This test will later be edited to ensure it checks to only remove certain classes
// TODO(task): This test will later be edited to ensure it checks to only remove certain classes
let html_str = r#"
<!DOCTYPE html>
<html>

View File

@@ -42,14 +42,14 @@ pub async fn main() -> anyhow::Result<()> {
let client = reqwest::ClientBuilder::new()
.timeout(TIME_LIMIT)
.default_headers(header_map)
// TODO We want to handle redirects ourselves so we can track them...
// We want to handle redirects ourselves so we can track them...
.redirect(Policy::none())
.build()?;
let mut adblock_engines = Vec::new();
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
// TODO Don't hardcode these paths in quite as bad a way...
// TODO(not important) Don't hardcode these paths in quite as bad a way...
let path = PathBuf::from(format!("./data/{}.adblock", name));
if !path.exists() {
warn!("Missing adblock rules: {:?}.", path);
@@ -94,16 +94,12 @@ pub async fn main() -> anyhow::Result<()> {
}
RakeOutcome::RakedFeed(feed) => {
green_ln!("Feed");
// TODO
println!();
let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry);
print_references(&refs);
}
RakeOutcome::RakedSitemap(sitemap) => {
green_ln!("Sitemap");
// TODO
println!();
let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry);
print_references(&refs);

View File

@@ -84,7 +84,7 @@ pub async fn main() -> anyhow::Result<()> {
pub struct Seed {
url: UrlOrUrlPattern,
// TODO These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
}
/// Either a URL or a URL prefix.
@@ -237,6 +237,10 @@ async fn import_and_flush_batch(
let mut allowed_domain_record = DirtyTracker::new(
allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
);
if is_domain_new {
// Mark it as dirty
let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
}
// Register the domain. This is a no-op if it's already active or backing off.
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
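
The discarded borrow_mut() above is there purely for its side effect. Reconstructing from this hunk alone (names and details assumed), DirtyTracker presumably behaves something like this, with any mutable borrow conservatively marking the value as needing a flush:

    struct DirtyTracker<T> {
        inner: T,
        dirty: bool,
    }

    impl<T> DirtyTracker<T> {
        fn new(inner: T) -> Self {
            Self { inner, dirty: false }
        }

        /// A mutable borrow *may* mutate, so assume it did.
        fn borrow_mut(&mut self) -> &mut T {
            self.dirty = true;
            &mut self.inner
        }

        fn is_dirty(&self) -> bool {
            self.dirty
        }
    }
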

View File

@@ -361,7 +361,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
} else {
continue;
};
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO(robustness) ignore failure here...?
let last_changed = entry.updated.or(entry.published);
@@ -372,7 +372,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
});
}
// TODO paginated feeds (e.g. JSON Feed next_url)
// TODO(feature) paginated feeds (e.g. JSON Feed next_url)
Ok(urls)
}
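
The pagination TODO(feature) amounts to looping while the feed advertises a next page. A rough shape, where FeedPage and fetch_page are hypothetical stand-ins for whatever the feed parser actually exposes:

    use url::Url;

    struct FeedPage {
        urls: Vec<Url>,
        next_url: Option<Url>, // e.g. JSON Feed's next_url field
    }

    fn rake_feed_paginated(
        mut page_url: Url,
        fetch_page: impl Fn(&Url) -> anyhow::Result<FeedPage>,
    ) -> anyhow::Result<Vec<Url>> {
        let mut all = Vec::new();
        loop {
            let page = fetch_page(&page_url)?;
            all.extend(page.urls);
            match page.next_url {
                // Guard against a page that points at itself.
                Some(next) if next != page_url => page_url = next,
                _ => break,
            }
        }
        Ok(all)
    }
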

View File

@@ -126,7 +126,7 @@ pub fn guess_document_language(text: &str) -> Option<String> {
.map(|lang: Language| lang.iso_code_639_1().to_string())
}
// TODO this isn't particularly efficient. Probably want a trie if it's important...
// TODO(perf) this isn't particularly efficient. Probably want a trie if it's important...
pub struct IpSet {
ips: BTreeSet<IpNetwork>,
}
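
For context on that TODO(perf): with a BTreeSet, containment is a scan over every stored network, as in the sketch below (assuming the ipnetwork crate provides IpNetwork; the real dependency may differ). A bitwise prefix trie would make the lookup proportional to the address length rather than to the number of networks:

    use ipnetwork::IpNetwork; // assumed; the hunk only shows the type name
    use std::collections::BTreeSet;
    use std::net::IpAddr;

    pub struct IpSet {
        ips: BTreeSet<IpNetwork>,
    }

    impl IpSet {
        /// The linear scan the TODO is complaining about: O(number of networks).
        pub fn contains(&self, addr: IpAddr) -> bool {
            self.ips.iter().any(|net| net.contains(addr))
        }
    }
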

View File

@@ -261,7 +261,7 @@ impl PageExtractionServiceInternal {
document.body_remainder = DenseTree::from_body(root_node.clone());
document.body_content = DenseTree::from_body(article_node);
} else {
todo!()
todo!() // NEEDED
}
Ok(ExtractedPage::Success {

View File

@@ -94,13 +94,13 @@ impl TaskContext {
self.process_domain(domain).await?;
} else {
// Already in use; need to pick again
// TODO be smarter about this.
// TODO(perf) be smarter about this.
tokio::time::sleep(Duration::from_secs(60)).await;
}
}
None => {
// No domains left! Woot.
// TODO come up with some stopping conditions
// TODO(perf, feature) come up with some stopping conditions
tokio::time::sleep(Duration::from_secs(60)).await;
}
}
@@ -143,8 +143,8 @@ impl TaskContext {
return Ok(false);
}
// TODO delete the domain from the store
todo!();
// TODO(needed) delete the domain from the store
todo!(); // NEEDED
txn.commit()?;
Ok(true)
@@ -366,12 +366,13 @@ impl TaskContext {
Ok(NextAction::Continue)
}
RakeOutcome::TemporaryFailure(failure) => {
// TODO do we want to log this somewhere?
// TODO(future) do we want to log this somewhere?
// or at least a metric
let domain = get_reduced_domain(url)?;
let url = url.clone();
// TODO add 1.1× the previous backoff, if there was one.
// TODO(feature) add 1.1× the previous backoff, if there was one.
let new_backoff = failure.backoff_sec;
let domain = domain.into_owned();
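
The backoff TODO(feature) is nearly a one-liner once the previous backoff can be read from the store. A hypothetical helper (the u64 seconds type is assumed) that never undercuts the freshly suggested delay:

    /// Sketch only: grow the retry delay 1.1× across consecutive failures.
    /// `prev` would come from the existing backing-off record, if any.
    fn next_backoff_sec(suggested: u64, prev: Option<u64>) -> u64 {
        match prev {
            Some(p) => suggested.max((p as f64 * 1.1).ceil() as u64),
            None => suggested,
        }
    }
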

View File

@@ -178,7 +178,7 @@ impl RakerStore {
F: FnOnce(RakerTxn<'_, RW>) -> anyhow::Result<R> + Send + 'static,
R: Send + 'static,
{
// TODO consider adding a lock here to prevent all the async executors getting stuck here...
// TODO(robustness) consider adding a lock here to prevent all the async executors getting stuck here...
let this = self.clone();
Ok(tokio::task::spawn_blocking(move || -> anyhow::Result<R> {
let txn = this.rw_txn()?;
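
One way to act on that TODO(robustness) would be to gate entry to spawn_blocking with an async semaphore, so waiting writers queue in the async world instead of each pinning a blocking-pool thread. A sketch under that assumption, using tokio's Semaphore with an arbitrary limit:

    use std::sync::Arc;
    use tokio::sync::Semaphore;

    struct WriteGate {
        permits: Arc<Semaphore>,
    }

    impl WriteGate {
        fn new(max_waiting_writers: usize) -> Self {
            Self { permits: Arc::new(Semaphore::new(max_waiting_writers)) }
        }

        async fn write<F, R>(&self, f: F) -> anyhow::Result<R>
        where
            F: FnOnce() -> anyhow::Result<R> + Send + 'static,
            R: Send + 'static,
        {
            // Wait asynchronously *before* occupying a blocking-pool thread.
            let _permit = self.permits.acquire().await?;
            tokio::task::spawn_blocking(f).await?
        }
    }
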

View File

@@ -20,7 +20,7 @@ pub struct UrlVisitedRecord {
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct QueueUrlRecord {
pub intent: RakeIntent, // TODO
pub intent: RakeIntent, // TODO CONSIDER
}
#[derive(Clone, Debug, Deserialize, Serialize)]