Experimental deduplication support

This commit is contained in:
Olivier 'reivilibre' 2020-11-16 13:25:44 +00:00
parent dc5a5b9909
commit 9173c75a7f
2 changed files with 101 additions and 27 deletions

View File

@ -20,7 +20,7 @@ import logging
import time import time
from copy import deepcopy from copy import deepcopy
from hashlib import sha256 from hashlib import sha256
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
import aiosqlite import aiosqlite
import attr import attr
@ -81,6 +81,9 @@ class DependencyBook:
cache_data: Dict[str, Any] = dict() cache_data: Dict[str, Any] = dict()
ignored: bool = False ignored: bool = False
var_list: List[str] = list()
varhash: str = ""
# TODO(performance, feature): track more in-depth details, perhaps as a # TODO(performance, feature): track more in-depth details, perhaps as a
# per-resource cache thing, so that we can track the info needed to know # per-resource cache thing, so that we can track the info needed to know
# if it changed...? # if it changed...?
@ -92,6 +95,8 @@ class DependencyBook:
"last_changed": self.last_changed, "last_changed": self.last_changed,
"cache_data": self.cache_data, "cache_data": self.cache_data,
"ignored": self.ignored, "ignored": self.ignored,
"var_list": self.var_list,
"varhash": self.varhash,
} }
@staticmethod @staticmethod
@ -105,6 +110,8 @@ class DependencyBook:
last_changed=dictionary["last_changed"], last_changed=dictionary["last_changed"],
cache_data=dictionary["cache_data"], cache_data=dictionary["cache_data"],
ignored=dictionary["ignored"], ignored=dictionary["ignored"],
var_list=dictionary["var_list"],
varhash=dictionary["varhash"],
) )
@ -118,27 +125,41 @@ cattr.global_converter.register_structure_hook(
class DependencyTracker: class DependencyTracker:
def __init__(self, book: DependencyBook, dag: "RecipeDag", recipe: "Recipe"): def __init__(self, book: DependencyBook, dag: "RecipeDag", recipe: "Recipe"):
self.book: DependencyBook = book self._book: DependencyBook = book
self._dag: "RecipeDag" = dag self._dag: "RecipeDag" = dag
self._recipe: "Recipe" = recipe self._recipe: "Recipe" = recipe
self._time: int = int(time.time() * 1000) self._time: int = int(time.time() * 1000)
self._vars: Dict[str, Any] = dict()
def build_book(self) -> DependencyBook:
    """
    Finalise and return this tracker's dependency book.

    Stores, on the book, a hash of every variable registered with this
    tracker together with the sorted list of those variables' names.
    The book is updated in place; the same object is returned.

    :return: the tracker's book, with ``varhash`` and ``var_list`` filled in.
    """
    book = self._book
    registered = self._vars
    # Hash first, then record the names (iterating a dict yields its keys).
    book.varhash = hash_dict(registered)
    book.var_list = sorted(registered)
    return book
def watch(self, resource: Resource) -> None: def watch(self, resource: Resource) -> None:
# XXX self.book.watching[resource] = self._dag.resource_time[resource] try:
self.book.watching[resource] = -42 self._book.watching[resource] = self._dag.resource_time[resource]
except KeyError as ke:
raise RuntimeError(
f"Can't watch {resource!r} because it hasn't been provided (yet)!"
) from ke
def provide(self, resource: Resource, time: Optional[int] = None) -> None: def provide(self, resource: Resource, time: Optional[int] = None) -> None:
if time is None: if time is None:
time = self._time time = self._time
self._dag.resource_time[resource] = time # We use the maximum time because multiple recipes may provide something
# and we should be careful to define a consistent behaviour in this case
self._dag.resource_time[resource] = max(
time, self._dag.resource_time.get(resource, -1)
)
def ignore(self) -> None: def ignore(self) -> None:
self.book.ignored = True self._book.ignored = True
def register_variable(self, variable: str, value: Union[dict, str, int]): def register_variable(self, variable: str, value: Union[dict, str, int]):
# self._vars[variable] = value # store a copy and we'll read it later
# TODO(implement) self._vars[variable] = value
logger.critical("not implemented: register var %s", variable)
def register_fridge_file(self, desugared_path: str): def register_fridge_file(self, desugared_path: str):
# TODO this is not complete # TODO this is not complete

View File

@ -17,6 +17,7 @@
import asyncio import asyncio
import logging import logging
import time
from asyncio import Future, Queue from asyncio import Future, Queue
from collections import deque from collections import deque
from contextvars import ContextVar from contextvars import ContextVar
@ -33,6 +34,7 @@ from scone.head.dependency_tracking import (
DependencyBook, DependencyBook,
DependencyCache, DependencyCache,
DependencyTracker, DependencyTracker,
hash_dict,
) )
from scone.head.head import Head from scone.head.head import Head
from scone.head.recipe import Recipe from scone.head.recipe import Recipe
@ -133,6 +135,7 @@ class Kitchen:
self.last_updated_ats: Dict[Resource, int] = dict() self.last_updated_ats: Dict[Resource, int] = dict()
self._cookable: Queue[Optional[Vertex]] = Queue() self._cookable: Queue[Optional[Vertex]] = Queue()
self._sleeper_slots: int = 0 self._sleeper_slots: int = 0
self._kitchen_time: int = int(1000 * time.time())
def get_dependency_tracker(self): def get_dependency_tracker(self):
return self._dependency_trackers[current_recipe.get()] return self._dependency_trackers[current_recipe.get()]
@ -198,6 +201,45 @@ class Kitchen:
await asyncio.gather(*workers, return_exceptions=False) await asyncio.gather(*workers, return_exceptions=False)
async def _should_skip(
    self, recipe: Recipe
) -> Tuple[Optional[DependencyBook], bool]:
    """
    Decide whether a recipe can be skipped, using its stored dependency book.

    A recipe is skipped only when all of the following hold:
    a book is stored for it, that book is not marked as ignored, the hash
    of the variables named in the book (re-read from the sous's current
    variables) equals the hash recorded in the book, and every watched
    resource has the same time in the DAG as the book recorded.

    :param recipe: recipe to inquire about
    :return: a pair of (previous dependency book, or None if there wasn't
        one; True if the recipe should be skipped, False otherwise).
    """
    stored = await self._dependency_store.inquire(recipe)
    if stored is None:
        # Nothing known about this recipe.
        return None, False

    _stored_id, old_book = stored

    # A book flagged as ignored never justifies a skip.
    if old_book.ignored:
        return old_book, False

    # Re-hash the current values of exactly the variables used last time.
    variables = self.head.variables[recipe.recipe_context.sous]
    current_values = {
        name: variables.get_dotted(name) for name in old_book.var_list
    }
    current_hash = hash_dict(current_values)
    if old_book.varhash != current_hash:
        return old_book, False

    # Any watched resource that is missing, or whose time differs from the
    # recorded one, makes the recipe out of date.
    for resource, recorded_time in old_book.watching.items():
        current_time = self.head.dag.resource_time.get(resource)
        if current_time is None or current_time != recorded_time:
            return old_book, False

    return old_book, True
async def _cooking_worker(self): async def _cooking_worker(self):
dag = self.head.dag dag = self.head.dag
while True: while True:
@ -218,23 +260,34 @@ class Kitchen:
if isinstance(next_job, Recipe): if isinstance(next_job, Recipe):
meta = dag.recipe_meta[next_job] meta = dag.recipe_meta[next_job]
# TODO try to deduplicate last_book, should_skip = await self._should_skip(next_job)
meta.state = RecipeState.BEING_COOKED if should_skip and last_book:
current_recipe.set(next_job) meta.state = RecipeState.SKIPPED
eprint(f"cooking {next_job}") # provide stuff that it provided last time
self._dependency_trackers[next_job] = DependencyTracker( for res, last_update_time in last_book.provided.items():
DependencyBook(), dag, next_job dag.resource_time[res] = max(
) last_update_time, dag.resource_time.get(res, -1)
try: )
await next_job.cook(self) else:
except Exception as e: meta.state = RecipeState.BEING_COOKED
meta.state = RecipeState.FAILED current_recipe.set(next_job)
raise RuntimeError(f"Recipe {next_job} failed!") from e eprint(f"cooking {next_job}")
eprint(f"cooked {next_job}") tracker = DependencyTracker(DependencyBook(), dag, next_job)
# TODO cook self._dependency_trackers[next_job] = tracker
# TODO store depbook try:
await self._store_dependency(next_job) await next_job.cook(self)
meta.state = RecipeState.COOKED
# provide stuff
for outgoing in dag.edges[next_job]:
if not isinstance(outgoing, Resource):
continue
tracker.provide(outgoing)
except Exception as e:
meta.state = RecipeState.FAILED
raise RuntimeError(f"Recipe {next_job} failed!") from e
eprint(f"cooked {next_job}")
await self._store_dependency(next_job)
meta.state = RecipeState.COOKED
elif isinstance(next_job, Resource): elif isinstance(next_job, Resource):
eprint(f"have {next_job}") eprint(f"have {next_job}")
pass pass
@ -328,7 +381,7 @@ class Kitchen:
dependency_tracker = self._dependency_trackers.pop(recipe, None) dependency_tracker = self._dependency_trackers.pop(recipe, None)
if not dependency_tracker: if not dependency_tracker:
raise KeyError(f"Recipe {recipe} has not been tracked.") raise KeyError(f"Recipe {recipe} has not been tracked.")
depbook = dependency_tracker.book depbook = dependency_tracker.build_book()
if depbook: if depbook:
await self._dependency_store.register(recipe, depbook) await self._dependency_store.register(recipe, depbook)