Separate HTTP preview code and URL previewer. (#15269)
Separates REST layer code from the actual URL previewing.
This commit is contained in:
parent
5ab7146e19
commit
a5fb382a29
|
@ -0,0 +1 @@
|
|||
Reorganize URL preview code.
|
|
@ -0,0 +1,833 @@
|
|||
# Copyright 2016 OpenMarket Ltd
|
||||
# Copyright 2020-2023 The Matrix.org Foundation C.I.C.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import datetime
|
||||
import errno
|
||||
import fnmatch
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
|
||||
from urllib.parse import urljoin, urlparse, urlsplit
|
||||
from urllib.request import urlopen
|
||||
|
||||
import attr
|
||||
|
||||
from twisted.internet.defer import Deferred
|
||||
from twisted.internet.error import DNSLookupError
|
||||
|
||||
from synapse.api.errors import Codes, SynapseError
|
||||
from synapse.http.client import SimpleHttpClient
|
||||
from synapse.logging.context import make_deferred_yieldable, run_in_background
|
||||
from synapse.media._base import FileInfo, get_filename_from_headers
|
||||
from synapse.media.media_storage import MediaStorage
|
||||
from synapse.media.oembed import OEmbedProvider
|
||||
from synapse.media.preview_html import decode_body, parse_html_to_open_graph
|
||||
from synapse.metrics.background_process_metrics import run_as_background_process
|
||||
from synapse.types import JsonDict, UserID
|
||||
from synapse.util import json_encoder
|
||||
from synapse.util.async_helpers import ObservableDeferred
|
||||
from synapse.util.caches.expiringcache import ExpiringCache
|
||||
from synapse.util.stringutils import random_string
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from synapse.media.media_repository import MediaRepository
|
||||
from synapse.server import HomeServer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Upper bounds on the length of an Open Graph tag name and of its (stringified)
# value; tags exceeding either bound are pruned from the generated preview.
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000

# Durations, in milliseconds.
ONE_HOUR = 60 * 60 * 1000
ONE_DAY = 24 * ONE_HOUR
# How old media associated with the URL cache must be before it is deleted.
IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY
|
||||
|
||||
|
||||
@attr.s(slots=True, frozen=True, auto_attribs=True)
class DownloadResult:
    """
    The outcome of fetching a remote URL, or of parsing a data: URL.
    """

    # The number of bytes of content that were written out.
    length: int
    # The URI that was downloaded.
    uri: str
    # The HTTP response code.
    response_code: int
    # The media type of the content.
    media_type: str
    # The name of the downloaded file, if one could be determined.
    download_name: Optional[str]
    # The number of milliseconds the result is valid for.
    expires: int
    # The ETag header of the response, if there was one.
    etag: Optional[str]
|
||||
|
||||
|
||||
@attr.s(slots=True, frozen=True, auto_attribs=True)
class MediaInfo:
    """
    Describes a piece of media which was downloaded and stored as part of
    generating a URL preview.
    """

    # The media type, as given by the Content-Type header of the response.
    media_type: str
    # The size of the downloaded media, in bytes.
    media_length: int
    # The file name reported by the remote server, if one could be parsed from
    # the response headers.
    download_name: Optional[str]
    # When the preview was generated (a timestamp in milliseconds).
    created_ts_ms: int
    # Where the media storage provider put the file on disk.
    filesystem_id: str
    filename: str
    # The URI which is being previewed.
    uri: str
    # The HTTP response code.
    response_code: int
    # The timestamp (in milliseconds) of when this preview expires.
    expires: int
    # The ETag header of the response, if there was one.
    etag: Optional[str]
|
||||
|
||||
|
||||
class UrlPreviewer:
|
||||
"""
|
||||
Generates Open Graph (https://ogp.me/) responses (with some Matrix
|
||||
specific additions) for a given URL.
|
||||
|
||||
When Synapse is asked to preview a URL it does the following:
|
||||
|
||||
1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the
|
||||
config).
|
||||
2. Checks the URL against an in-memory cache and returns the result if it exists. (This
|
||||
is also used to de-duplicate processing of multiple in-flight requests at once.)
|
||||
3. Kicks off a background process to generate a preview:
|
||||
1. Checks URL and timestamp against the database cache and returns the result if it
|
||||
has not expired and was successful (a 2xx return code).
|
||||
2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
|
||||
does, update the URL to download.
|
||||
3. Downloads the URL and stores it into a file via the media storage provider
|
||||
and saves the local media metadata.
|
||||
4. If the media is an image:
|
||||
1. Generates thumbnails.
|
||||
2. Generates an Open Graph response based on image properties.
|
||||
5. If the media is HTML:
|
||||
1. Decodes the HTML via the stored file.
|
||||
2. Generates an Open Graph response from the HTML.
|
||||
3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
|
||||
1. Downloads the URL and stores it into a file via the media storage provider
|
||||
and saves the local media metadata.
|
||||
2. Convert the oEmbed response to an Open Graph response.
|
||||
3. Override any Open Graph data from the HTML with data from oEmbed.
|
||||
4. If an image exists in the Open Graph response:
|
||||
1. Downloads the URL and stores it into a file via the media storage
|
||||
provider and saves the local media metadata.
|
||||
2. Generates thumbnails.
|
||||
3. Updates the Open Graph response based on image properties.
|
||||
6. If the media is JSON and an oEmbed URL was found:
|
||||
1. Convert the oEmbed response to an Open Graph response.
|
||||
2. If a thumbnail or image is in the oEmbed response:
|
||||
1. Downloads the URL and stores it into a file via the media storage
|
||||
provider and saves the local media metadata.
|
||||
2. Generates thumbnails.
|
||||
3. Updates the Open Graph response based on image properties.
|
||||
7. Stores the result in the database cache.
|
||||
4. Returns the result.
|
||||
|
||||
If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
|
||||
image thumbnailing, step 5.4 or 6.2) fail then the URL preview as a whole
|
||||
does not fail. As much information as possible is returned.
|
||||
|
||||
The in-memory cache expires after 1 hour.
|
||||
|
||||
Expired entries in the database cache (and their associated media files) are
|
||||
deleted every 10 seconds. The default expiration time is 1 hour from download.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    hs: "HomeServer",
    media_repo: "MediaRepository",
    media_storage: MediaStorage,
):
    """
    Args:
        hs: The homeserver, which supplies the clock, the main datastore, the
            hostname and the media configuration.
        media_repo: The media repository; its file paths and base path are
            kept for later use.
        media_storage: The storage which downloaded content is written into.
    """
    media_config = hs.config.media

    self.clock = hs.get_clock()
    self.filepaths = media_repo.filepaths
    self.max_spider_size = media_config.max_spider_size
    self.server_name = hs.hostname
    self.store = hs.get_datastores().main

    # The HTTP client used for fetching the URLs which are being previewed,
    # restricted by the configured IP range whitelist / blacklist.
    self.client = SimpleHttpClient(
        hs,
        treq_args={"browser_like_redirects": True},
        ip_whitelist=media_config.url_preview_ip_range_whitelist,
        ip_blacklist=media_config.url_preview_ip_range_blacklist,
        use_proxy=True,
    )

    self.media_repo = media_repo
    self.primary_base_path = media_repo.primary_base_path
    self.media_storage = media_storage

    self._oembed = OEmbedProvider(hs)

    # The background jobs run on this instance if it is the configured one.
    # When no instance is configured we assume there is only one instance
    # serving media, and so it must be us.
    configured_instance = media_config.media_instance_running_background_jobs
    if configured_instance is None:
        self._worker_run_media_background_jobs = True
    else:
        self._worker_run_media_background_jobs = (
            configured_instance == hs.get_instance_name()
        )

    self.url_preview_url_blacklist = media_config.url_preview_url_blacklist
    self.url_preview_accept_language = media_config.url_preview_accept_language

    # In-memory cache which maps a URL to an ObservableDeferred resolving to
    # the JSON-encoded OG metadata.
    self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache(
        cache_name="url_previews",
        clock=self.clock,
        # don't spider URLs more often than once an hour
        expiry_ms=ONE_HOUR,
    )

    if self._worker_run_media_background_jobs:
        # Kick off the clean-up of expired cache data every 10 seconds.
        self._cleaner_loop = self.clock.looping_call(
            self._start_expire_url_cache_data, 10 * 1000
        )
|
||||
|
||||
async def preview(self, url: str, user: UserID, ts: int) -> bytes:
    """
    Generate the JSON-encoded Open Graph preview of a URL, re-using any
    cached or in-flight result for the same URL.

    Args:
        url: The URL to preview.
        user: The user requesting the preview.
        ts: The timestamp requested for the preview.

    Returns:
        The JSON-encoded Open Graph data, as bytes.

    Raises:
        SynapseError: (403) if the URL matches an entry of the URL blacklist.
    """
    # XXX: we could move this into _do_preview if we wanted.
    url_tuple = urlsplit(url)
    for entry in self.url_preview_url_blacklist:
        # An entry blocks the URL only if *all* of its attributes match, so
        # we can stop looking at an entry as soon as one attribute does not.
        match = True
        for attrib, pattern in entry.items():
            value = getattr(url_tuple, attrib)
            logger.debug(
                "Matching attrib '%s' with value '%s' against pattern '%s'",
                attrib,
                value,
                pattern,
            )

            if value is None:
                match = False
                break

            # Some attributes might not be parsed as strings by urlsplit (such as the
            # port, which is parsed as an int). Because we use match functions that
            # expect strings, we want to make sure that's what we give them.
            value_str = str(value)

            # Patterns starting with "^" are regular expressions, anything
            # else is a glob.
            if pattern.startswith("^"):
                if not re.match(pattern, value_str):
                    match = False
                    break
            elif not fnmatch.fnmatch(value_str, pattern):
                match = False
                break

        if match:
            logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
            raise SynapseError(
                403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
            )

    # the in-memory cache:
    # * ensures that only one request is active at a time
    # * takes load off the DB for the thundering herds
    # * also caches any failures (unlike the DB) so we don't keep
    #   requesting the same endpoint

    observable = self._cache.get(url)

    if not observable:
        download = run_in_background(self._do_preview, url, user, ts)
        observable = ObservableDeferred(download, consumeErrors=True)
        self._cache[url] = observable
    else:
        logger.info("Returning cached response")

    return await make_deferred_yieldable(observable.observe())
|
||||
|
||||
async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
|
||||
"""Check the db, and download the URL and build a preview
|
||||
|
||||
Args:
|
||||
url: The URL to preview.
|
||||
user: The user requesting the preview.
|
||||
ts: The timestamp requested for the preview.
|
||||
|
||||
Returns:
|
||||
json-encoded og data
|
||||
"""
|
||||
# check the URL cache in the DB (which will also provide us with
|
||||
# historical previews, if we have any)
|
||||
cache_result = await self.store.get_url_cache(url, ts)
|
||||
if (
|
||||
cache_result
|
||||
and cache_result["expires_ts"] > ts
|
||||
and cache_result["response_code"] / 100 == 2
|
||||
):
|
||||
# It may be stored as text in the database, not as bytes (such as
|
||||
# PostgreSQL). If so, encode it back before handing it on.
|
||||
og = cache_result["og"]
|
||||
if isinstance(og, str):
|
||||
og = og.encode("utf8")
|
||||
return og
|
||||
|
||||
# If this URL can be accessed via oEmbed, use that instead.
|
||||
url_to_download = url
|
||||
oembed_url = self._oembed.get_oembed_url(url)
|
||||
if oembed_url:
|
||||
url_to_download = oembed_url
|
||||
|
||||
media_info = await self._handle_url(url_to_download, user)
|
||||
|
||||
logger.debug("got media_info of '%s'", media_info)
|
||||
|
||||
# The number of milliseconds that the response should be considered valid.
|
||||
expiration_ms = media_info.expires
|
||||
author_name: Optional[str] = None
|
||||
|
||||
if _is_media(media_info.media_type):
|
||||
file_id = media_info.filesystem_id
|
||||
dims = await self.media_repo._generate_thumbnails(
|
||||
None, file_id, file_id, media_info.media_type, url_cache=True
|
||||
)
|
||||
|
||||
og = {
|
||||
"og:description": media_info.download_name,
|
||||
"og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
|
||||
"og:image:type": media_info.media_type,
|
||||
"matrix:image:size": media_info.media_length,
|
||||
}
|
||||
|
||||
if dims:
|
||||
og["og:image:width"] = dims["width"]
|
||||
og["og:image:height"] = dims["height"]
|
||||
else:
|
||||
logger.warning("Couldn't get dims for %s" % url)
|
||||
|
||||
# define our OG response for this media
|
||||
elif _is_html(media_info.media_type):
|
||||
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
||||
|
||||
with open(media_info.filename, "rb") as file:
|
||||
body = file.read()
|
||||
|
||||
tree = decode_body(body, media_info.uri, media_info.media_type)
|
||||
if tree is not None:
|
||||
# Check if this HTML document points to oEmbed information and
|
||||
# defer to that.
|
||||
oembed_url = self._oembed.autodiscover_from_html(tree)
|
||||
og_from_oembed: JsonDict = {}
|
||||
if oembed_url:
|
||||
try:
|
||||
oembed_info = await self._handle_url(
|
||||
oembed_url, user, allow_data_urls=True
|
||||
)
|
||||
except Exception as e:
|
||||
# Fetching the oEmbed info failed, don't block the entire URL preview.
|
||||
logger.warning(
|
||||
"oEmbed fetch failed during URL preview: %s errored with %s",
|
||||
oembed_url,
|
||||
e,
|
||||
)
|
||||
else:
|
||||
(
|
||||
og_from_oembed,
|
||||
author_name,
|
||||
expiration_ms,
|
||||
) = await self._handle_oembed_response(
|
||||
url, oembed_info, expiration_ms
|
||||
)
|
||||
|
||||
# Parse Open Graph information from the HTML in case the oEmbed
|
||||
# response failed or is incomplete.
|
||||
og_from_html = parse_html_to_open_graph(tree)
|
||||
|
||||
# Compile the Open Graph response by using the scraped
|
||||
# information from the HTML and overlaying any information
|
||||
# from the oEmbed response.
|
||||
og = {**og_from_html, **og_from_oembed}
|
||||
|
||||
await self._precache_image_url(user, media_info, og)
|
||||
else:
|
||||
og = {}
|
||||
|
||||
elif oembed_url:
|
||||
# Handle the oEmbed information.
|
||||
og, author_name, expiration_ms = await self._handle_oembed_response(
|
||||
url, media_info, expiration_ms
|
||||
)
|
||||
await self._precache_image_url(user, media_info, og)
|
||||
|
||||
else:
|
||||
logger.warning("Failed to find any OG data in %s", url)
|
||||
og = {}
|
||||
|
||||
# If we don't have a title but we have author_name, copy it as
|
||||
# title
|
||||
if not og.get("og:title") and author_name:
|
||||
og["og:title"] = author_name
|
||||
|
||||
# filter out any stupidly long values
|
||||
keys_to_remove = []
|
||||
for k, v in og.items():
|
||||
# values can be numeric as well as strings, hence the cast to str
|
||||
if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
|
||||
logger.warning(
|
||||
"Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
|
||||
)
|
||||
keys_to_remove.append(k)
|
||||
for k in keys_to_remove:
|
||||
del og[k]
|
||||
|
||||
logger.debug("Calculated OG for %s as %s", url, og)
|
||||
|
||||
jsonog = json_encoder.encode(og)
|
||||
|
||||
# Cap the amount of time to consider a response valid.
|
||||
expiration_ms = min(expiration_ms, ONE_DAY)
|
||||
|
||||
# store OG in history-aware DB cache
|
||||
await self.store.store_url_cache(
|
||||
url,
|
||||
media_info.response_code,
|
||||
media_info.etag,
|
||||
media_info.created_ts_ms + expiration_ms,
|
||||
jsonog,
|
||||
media_info.filesystem_id,
|
||||
media_info.created_ts_ms,
|
||||
)
|
||||
|
||||
return jsonog.encode("utf8")
|
||||
|
||||
async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
    """
    Fetches a remote URL and parses the headers.

    Args:
        url: The URL to fetch.
        output_stream: The stream to write the content to.

    Returns:
        A DownloadResult holding the media length, the URL downloaded, the
        HTTP response code, the media type, the downloaded file name, the
        number of milliseconds the result is valid for and the etag header.

    Raises:
        SynapseError: if the download fails. Errors which already are
            SynapseErrors are passed through untouched, a failed DNS lookup
            becomes a 502 and anything else a 500.
    """
    request_headers = {
        b"Accept-Language": self.url_preview_accept_language,
        # Use a custom user agent for the preview because some sites will only return
        # Open Graph metadata to crawler user agents. Omit the Synapse version
        # string to avoid leaking information.
        b"User-Agent": ["Synapse (bot; +https://github.com/matrix-org/synapse)"],
    }

    try:
        logger.debug("Trying to get preview for url '%s'", url)
        length, headers, uri, code = await self.client.get_file(
            url,
            output_stream=output_stream,
            max_size=self.max_spider_size,
            headers=request_headers,
            is_allowed_content_type=_is_previewable,
        )
    except SynapseError:
        # Pass SynapseErrors through directly, so that the servlet
        # handler will return a SynapseError to the client instead of
        # blank data or a 500.
        raise
    except DNSLookupError:
        # DNS lookup returned no results
        # Note: This will also be the case if one of the resolved IP
        # addresses is blacklisted
        raise SynapseError(
            502,
            "DNS resolution failure during URL preview generation",
            Codes.UNKNOWN,
        )
    except Exception as e:
        # FIXME: pass through 404s and other error messages nicely
        logger.warning("Error downloading %s: %r", url, e)

        failure = traceback.format_exception_only(sys.exc_info()[0], e)
        raise SynapseError(
            500,
            "Failed to download content: %s" % (failure,),
            Codes.UNKNOWN,
        )

    # Fall back to a generic binary type if the server did not tell us one.
    media_type = (
        headers[b"Content-Type"][0].decode("ascii")
        if b"Content-Type" in headers
        else "application/octet-stream"
    )

    download_name = get_filename_from_headers(headers)

    # FIXME: we should calculate a proper expiration based on the
    # Cache-Control and Expire headers.  But for now, assume 1 hour.
    expires = ONE_HOUR

    if b"ETag" in headers:
        etag = headers[b"ETag"][0].decode("ascii")
    else:
        etag = None

    return DownloadResult(
        length, uri, code, media_type, download_name, expires, etag
    )
|
||||
|
||||
async def _parse_data_url(
    self, url: str, output_stream: BinaryIO
) -> DownloadResult:
    """
    Parses a data: URL.

    Args:
        url: The URL to parse.
        output_stream: The stream to write the content to.

    Returns:
        A DownloadResult holding the media length, the URL parsed, a 200
        response code, the media type taken from the data: URL, no file
        name, the number of milliseconds the result is valid for and no etag.

    Raises:
        SynapseError: (500) if the URL could not be parsed.
    """
    try:
        logger.debug("Trying to parse data url '%s'", url)
        with urlopen(url) as url_info:
            # TODO Can this be more efficient.
            output_stream.write(url_info.read())
    except Exception as e:
        logger.warning("Error parsing data: URL %s: %r", url, e)

        failure = traceback.format_exception_only(sys.exc_info()[0], e)
        raise SynapseError(
            500,
            "Failed to parse data URL: %s" % (failure,),
            Codes.UNKNOWN,
        )

    # Read back the length that has been written.
    written = output_stream.tell()
    # urlopen shoves the media-type from the data URL into the content type
    # header object.
    parsed_media_type = url_info.headers.get_content_type()

    return DownloadResult(
        length=written,
        uri=url,
        # If it was parsed, consider this a 200 OK.
        response_code=200,
        media_type=parsed_media_type,
        # Some features are not supported by data: URLs.
        download_name=None,
        expires=ONE_HOUR,
        etag=None,
    )
|
||||
|
||||
async def _handle_url(
    self, url: str, user: UserID, allow_data_urls: bool = False
) -> MediaInfo:
    """
    Fetches content from a URL and parses the result to generate a MediaInfo.

    It uses the media storage provider to persist the fetched content and
    stores the mapping into the database.

    Args:
        url: The URL to fetch.
        user: The user who has requested this URL.
        allow_data_urls: True if data URLs should be allowed.

    Returns:
        A MediaInfo object describing the fetched content.

    Raises:
        SynapseError: (500) if the URL is a data: URL and those are not allowed.
    """

    # TODO: we should probably honour robots.txt... except in practice
    # we're most likely being explicitly triggered by a human rather than a
    # bot, so are we really a robot?

    # Each fetch is stored under a fresh ID: today's date plus a random suffix.
    file_id = f"{datetime.date.today().isoformat()}_{random_string(16)}"

    file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

    with self.media_storage.store_into_file(file_info) as (f, fname, finish):
        if not url.startswith("data:"):
            download_result = await self._download_url(url, f)
        elif allow_data_urls:
            download_result = await self._parse_data_url(url, f)
        else:
            raise SynapseError(
                500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
            )

        await finish()

    try:
        time_now_ms = self.clock.time_msec()

        await self.store.store_local_media(
            media_id=file_id,
            media_type=download_result.media_type,
            time_now_ms=time_now_ms,
            upload_name=download_result.download_name,
            media_length=download_result.length,
            user_id=user,
            url_cache=url,
        )
    except Exception as e:
        logger.error("Error handling downloaded %s: %r", url, e)
        # TODO: we really ought to delete the downloaded file in this
        # case, since we won't have recorded it in the db, and will
        # therefore not expire it.
        raise

    return MediaInfo(
        media_type=download_result.media_type,
        media_length=download_result.length,
        download_name=download_result.download_name,
        created_ts_ms=time_now_ms,
        filesystem_id=file_id,
        filename=fname,
        uri=download_result.uri,
        response_code=download_result.response_code,
        expires=download_result.expires,
        etag=download_result.etag,
    )
|
||||
|
||||
async def _precache_image_url(
|
||||
self, user: UserID, media_info: MediaInfo, og: JsonDict
|
||||
) -> None:
|
||||
"""
|
||||
Pre-cache the image (if one exists) for posterity
|
||||
|
||||
Args:
|
||||
user: The user requesting the preview.
|
||||
media_info: The media being previewed.
|
||||
og: The Open Graph dictionary. This is modified with image information.
|
||||
"""
|
||||
# If there's no image or it is blank, there's nothing to do.
|
||||
if "og:image" not in og:
|
||||
return
|
||||
|
||||
# Remove the raw image URL, this will be replaced with an MXC URL, if successful.
|
||||
image_url = og.pop("og:image")
|
||||
if not image_url:
|
||||
return
|
||||
|
||||
# The image URL from the HTML might be relative to the previewed page,
|
||||
# convert it to an URL which can be requested directly.
|
||||
url_parts = urlparse(image_url)
|
||||
if url_parts.scheme != "data":
|
||||
image_url = urljoin(media_info.uri, image_url)
|
||||
|
||||
# FIXME: it might be cleaner to use the same flow as the main /preview_url
|
||||
# request itself and benefit from the same caching etc. But for now we
|
||||
# just rely on the caching on the master request to speed things up.
|
||||
try:
|
||||
image_info = await self._handle_url(image_url, user, allow_data_urls=True)
|
||||
except Exception as e:
|
||||
# Pre-caching the image failed, don't block the entire URL preview.
|
||||
logger.warning(
|
||||
"Pre-caching image failed during URL preview: %s errored with %s",
|
||||
image_url,
|
||||
e,
|
||||
)
|
||||
return
|
||||
|
||||
if _is_media(image_info.media_type):
|
||||
# TODO: make sure we don't choke on white-on-transparent images
|
||||
file_id = image_info.filesystem_id
|
||||
dims = await self.media_repo._generate_thumbnails(
|
||||
None, file_id, file_id, image_info.media_type, url_cache=True
|
||||
)
|
||||
if dims:
|
||||
og["og:image:width"] = dims["width"]
|
||||
og["og:image:height"] = dims["height"]
|
||||
else:
|
||||
logger.warning("Couldn't get dims for %s", image_url)
|
||||
|
||||
og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
|
||||
og["og:image:type"] = image_info.media_type
|
||||
og["matrix:image:size"] = image_info.media_length
|
||||
|
||||
async def _handle_oembed_response(
    self, url: str, media_info: MediaInfo, expiration_ms: int
) -> Tuple[JsonDict, Optional[str], int]:
    """
    Parse the downloaded oEmbed info.

    Args:
        url: The URL which is being previewed (not the one which was
            requested).
        media_info: The media being previewed.
        expiration_ms: The length of time, in milliseconds, the media is valid for.

    Returns:
        A tuple of:
            The Open Graph dictionary, if the oEmbed info can be parsed.
            The author name if it could be retrieved from oEmbed.
            The (possibly updated) length of time, in milliseconds, the media is valid for.
    """
    # If JSON was not returned, there's nothing to do.
    if not _is_json(media_info.media_type):
        return {}, None, expiration_ms

    with open(media_info.filename, "rb") as stored_file:
        raw_body = stored_file.read()

    parsed = self._oembed.parse_oembed_response(url, raw_body)
    open_graph = parsed.open_graph_result

    # Use the cache age from the oEmbed result, if one was given.
    if open_graph and parsed.cache_age is not None:
        valid_for_ms = parsed.cache_age
    else:
        valid_for_ms = expiration_ms

    return open_graph, parsed.author_name, valid_for_ms
|
||||
|
||||
def _start_expire_url_cache_data(self) -> Deferred:
    """Run the clean-up of expired url cache data as a background process."""
    process_name = "expire_url_cache_data"
    return run_as_background_process(process_name, self._expire_url_cache_data)
|
||||
|
||||
async def _expire_url_cache_data(self) -> None:
    """Clean up expired url cache content, media and thumbnails."""

    assert self._worker_run_media_background_jobs

    now = self.clock.time_msec()

    logger.debug("Running url preview cache expiry")

    def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
        """Attempt to remove the given chain of parent directories

        Args:
            dirs: The list of directory paths to delete, with children appearing
                before their parents.
        """
        for directory in dirs:
            try:
                os.rmdir(directory)
            except FileNotFoundError:
                # Already deleted, continue with deleting the rest
                continue
            except OSError as e:
                # Failed, skip deleting the rest of the parent dirs
                if e.errno != errno.ENOTEMPTY:
                    logger.warning(
                        "Failed to remove media directory while clearing url preview cache: %r: %s",
                        directory,
                        e,
                    )
                break

    def try_remove(remove, path, media_id, failure_message) -> bool:
        """Remove `path` using `remove`, tolerating it already being gone.

        Returns:
            False if the removal failed (which is logged using
            `failure_message`), True otherwise.
        """
        try:
            remove(path)
        except FileNotFoundError:
            pass  # If the path doesn't exist, meh
        except OSError as e:
            logger.warning(failure_message, media_id, e)
            return False
        return True

    # First we delete expired url cache entries
    expired_ids = await self.store.get_expired_url_cache(now)

    deleted_entries = []
    for media_id in expired_ids:
        if not try_remove(
            os.remove,
            self.filepaths.url_cache_filepath(media_id),
            media_id,
            "Failed to remove media while clearing url preview cache: %r: %s",
        ):
            continue

        deleted_entries.append(media_id)

        try_remove_parent_dirs(
            self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
        )

    await self.store.delete_url_cache(deleted_entries)

    if deleted_entries:
        logger.debug(
            "Deleted %d entries from url preview cache", len(deleted_entries)
        )
    else:
        logger.debug("No entries removed from url preview cache")

    # Now we delete old images associated with the url cache.
    # These may be cached for a bit on the client (i.e., they
    # may have a room open with a preview url thing open).
    # So we wait a couple of days before deleting, just in case.
    expire_before = now - IMAGE_CACHE_EXPIRY_MS
    old_media_ids = await self.store.get_url_cache_media_before(expire_before)

    media_failure_message = "Failed to remove media from url preview cache: %r: %s"

    deleted_media = []
    for media_id in old_media_ids:
        # The media file itself, then its (now possibly empty) parent dirs.
        if not try_remove(
            os.remove,
            self.filepaths.url_cache_filepath(media_id),
            media_id,
            media_failure_message,
        ):
            continue

        try_remove_parent_dirs(
            self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
        )

        # Then the thumbnails which were generated for it.
        if not try_remove(
            shutil.rmtree,
            self.filepaths.url_cache_thumbnail_directory(media_id),
            media_id,
            media_failure_message,
        ):
            continue

        deleted_media.append(media_id)

        # Note that one of the directories to be deleted has already been
        # removed by the `rmtree` above.
        try_remove_parent_dirs(
            self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
        )

    await self.store.delete_url_cache_media(deleted_media)

    if deleted_media:
        logger.debug("Deleted %d media from url preview cache", len(deleted_media))
    else:
        logger.debug("No media removed from url preview cache")
|
||||
|
||||
|
||||
def _is_media(content_type: str) -> bool:
|
||||
return content_type.lower().startswith("image/")
|
||||
|
||||
|
||||
def _is_html(content_type: str) -> bool:
|
||||
content_type = content_type.lower()
|
||||
return content_type.startswith("text/html") or content_type.startswith(
|
||||
"application/xhtml"
|
||||
)
|
||||
|
||||
|
||||
def _is_json(content_type: str) -> bool:
|
||||
return content_type.lower().startswith("application/json")
|
||||
|
||||
|
||||
def _is_previewable(content_type: str) -> bool:
    """Returns True for content types for which we will perform URL preview and False
    otherwise."""
    # Previewable content is HTML, images or JSON.
    if _is_html(content_type):
        return True
    if _is_media(content_type):
        return True
    return _is_json(content_type)
|
|
@ -12,26 +12,9 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import datetime
|
||||
import errno
|
||||
import fnmatch
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
|
||||
from urllib.parse import urljoin, urlparse, urlsplit
|
||||
from urllib.request import urlopen
|
||||
|
||||
import attr
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from twisted.internet.defer import Deferred
|
||||
from twisted.internet.error import DNSLookupError
|
||||
|
||||
from synapse.api.errors import Codes, SynapseError
|
||||
from synapse.http.client import SimpleHttpClient
|
||||
from synapse.http.server import (
|
||||
DirectServeJsonResource,
|
||||
respond_with_json,
|
||||
|
@ -39,71 +22,13 @@ from synapse.http.server import (
|
|||
)
|
||||
from synapse.http.servlet import parse_integer, parse_string
|
||||
from synapse.http.site import SynapseRequest
|
||||
from synapse.logging.context import make_deferred_yieldable, run_in_background
|
||||
from synapse.media._base import FileInfo, get_filename_from_headers
|
||||
from synapse.media.media_storage import MediaStorage
|
||||
from synapse.media.oembed import OEmbedProvider
|
||||
from synapse.media.preview_html import decode_body, parse_html_to_open_graph
|
||||
from synapse.metrics.background_process_metrics import run_as_background_process
|
||||
from synapse.types import JsonDict, UserID
|
||||
from synapse.util import json_encoder
|
||||
from synapse.util.async_helpers import ObservableDeferred
|
||||
from synapse.util.caches.expiringcache import ExpiringCache
|
||||
from synapse.util.stringutils import random_string
|
||||
from synapse.media.url_previewer import UrlPreviewer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from synapse.media.media_repository import MediaRepository
|
||||
from synapse.server import HomeServer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Open Graph tags whose name, or whose stringified value, is longer than these
# limits are pruned from a preview before it is stored and returned.
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000

# Durations, in milliseconds.
ONE_HOUR = 60 * 60 * 1000
ONE_DAY = 24 * ONE_HOUR
# Media in the URL cache is only deleted once it is older than this.
IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY
|
||||
|
||||
|
||||
@attr.s(slots=True, frozen=True, auto_attribs=True)
class DownloadResult:
    """
    The result of fetching a URL (or parsing a data: URL) to be previewed.
    """

    # The length of the downloaded content.
    length: int
    # The URI that was downloaded.
    uri: str
    # The HTTP response code (200 for a successfully parsed data: URL).
    response_code: int
    # The media type of the content; taken from the Content-Type header for
    # HTTP downloads.
    media_type: str
    # The downloaded file name, parsed from the response headers, if any.
    download_name: Optional[str]
    # The number of milliseconds the result is valid for.
    expires: int
    # The ETag header of the response, if any.
    etag: Optional[str]
|
||||
|
||||
|
||||
@attr.s(slots=True, frozen=True, auto_attribs=True)
class MediaInfo:
    """
    Information parsed from downloading media being previewed.
    """

    # The Content-Type header of the response.
    media_type: str
    # The length (in bytes) of the downloaded media.
    media_length: int
    # The media filename, according to the server. This is parsed from the
    # returned headers, if possible.
    download_name: Optional[str]
    # The time of the preview (in milliseconds).
    created_ts_ms: int
    # Information from the media storage provider about where the file is stored
    # on disk.
    filesystem_id: str
    filename: str
    # The URI being previewed.
    uri: str
    # The HTTP response code.
    response_code: int
    # The number of milliseconds this preview is valid for. This is a
    # duration, not a timestamp: it is added to created_ts_ms to get the
    # expiry time.
    expires: int
    # The ETag header of the response.
    etag: Optional[str]
|
||||
|
||||
|
||||
class PreviewUrlResource(DirectServeJsonResource):
|
||||
"""
|
||||
|
@ -121,54 +46,6 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
* The URL metadata must be stored somewhere, rather than just using Matrix
|
||||
itself to store the media.
|
||||
* Matrix cannot be used to distribute the metadata between homeservers.
|
||||
|
||||
When Synapse is asked to preview a URL it does the following:
|
||||
|
||||
1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the
|
||||
config).
|
||||
2. Checks the URL against an in-memory cache and returns the result if it exists. (This
|
||||
is also used to de-duplicate processing of multiple in-flight requests at once.)
|
||||
3. Kicks off a background process to generate a preview:
|
||||
1. Checks URL and timestamp against the database cache and returns the result if it
|
||||
has not expired and was successful (a 2xx return code).
|
||||
2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
|
||||
does, update the URL to download.
|
||||
3. Downloads the URL and stores it into a file via the media storage provider
|
||||
and saves the local media metadata.
|
||||
4. If the media is an image:
|
||||
1. Generates thumbnails.
|
||||
2. Generates an Open Graph response based on image properties.
|
||||
5. If the media is HTML:
|
||||
1. Decodes the HTML via the stored file.
|
||||
2. Generates an Open Graph response from the HTML.
|
||||
3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
|
||||
1. Downloads the URL and stores it into a file via the media storage provider
|
||||
and saves the local media metadata.
|
||||
2. Convert the oEmbed response to an Open Graph response.
|
||||
3. Override any Open Graph data from the HTML with data from oEmbed.
|
||||
4. If an image exists in the Open Graph response:
|
||||
1. Downloads the URL and stores it into a file via the media storage
|
||||
provider and saves the local media metadata.
|
||||
2. Generates thumbnails.
|
||||
3. Updates the Open Graph response based on image properties.
|
||||
6. If the media is JSON and an oEmbed URL was found:
|
||||
1. Convert the oEmbed response to an Open Graph response.
|
||||
2. If a thumbnail or image is in the oEmbed response:
|
||||
1. Downloads the URL and stores it into a file via the media storage
|
||||
provider and saves the local media metadata.
|
||||
2. Generates thumbnails.
|
||||
3. Updates the Open Graph response based on image properties.
|
||||
7. Stores the result in the database cache.
|
||||
4. Returns the result.
|
||||
|
||||
If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
|
||||
image thumbnailing, step 5.4 or 6.2) fail then the URL preview as a whole
|
||||
does not fail. As much information as possible is returned.
|
||||
|
||||
The in-memory cache expires after 1 hour.
|
||||
|
||||
Expired entries in the database cache (and their associated media files) are
|
||||
deleted every 10 seconds. The default expiration time is 1 hour from download.
|
||||
"""
|
||||
|
||||
isLeaf = True
|
||||
|
@ -183,48 +60,10 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
|
||||
self.auth = hs.get_auth()
|
||||
self.clock = hs.get_clock()
|
||||
self.filepaths = media_repo.filepaths
|
||||
self.max_spider_size = hs.config.media.max_spider_size
|
||||
self.server_name = hs.hostname
|
||||
self.store = hs.get_datastores().main
|
||||
self.client = SimpleHttpClient(
|
||||
hs,
|
||||
treq_args={"browser_like_redirects": True},
|
||||
ip_whitelist=hs.config.media.url_preview_ip_range_whitelist,
|
||||
ip_blacklist=hs.config.media.url_preview_ip_range_blacklist,
|
||||
use_proxy=True,
|
||||
)
|
||||
self.media_repo = media_repo
|
||||
self.primary_base_path = media_repo.primary_base_path
|
||||
self.media_storage = media_storage
|
||||
|
||||
self._oembed = OEmbedProvider(hs)
|
||||
|
||||
# We run the background jobs if we're the instance specified (or no
|
||||
# instance is specified, where we assume there is only one instance
|
||||
# serving media).
|
||||
instance_running_jobs = hs.config.media.media_instance_running_background_jobs
|
||||
self._worker_run_media_background_jobs = (
|
||||
instance_running_jobs is None
|
||||
or instance_running_jobs == hs.get_instance_name()
|
||||
)
|
||||
|
||||
self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist
|
||||
self.url_preview_accept_language = hs.config.media.url_preview_accept_language
|
||||
|
||||
# memory cache mapping urls to an ObservableDeferred returning
|
||||
# JSON-encoded OG metadata
|
||||
self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache(
|
||||
cache_name="url_previews",
|
||||
clock=self.clock,
|
||||
# don't spider URLs more often than once an hour
|
||||
expiry_ms=ONE_HOUR,
|
||||
)
|
||||
|
||||
if self._worker_run_media_background_jobs:
|
||||
self._cleaner_loop = self.clock.looping_call(
|
||||
self._start_expire_url_cache_data, 10 * 1000
|
||||
)
|
||||
self._url_previewer = UrlPreviewer(hs, media_repo, media_storage)
|
||||
|
||||
async def _async_render_OPTIONS(self, request: SynapseRequest) -> None:
|
||||
request.setHeader(b"Allow", b"OPTIONS, GET")
|
||||
|
@ -238,632 +77,5 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
if ts is None:
|
||||
ts = self.clock.time_msec()
|
||||
|
||||
# XXX: we could move this into _do_preview if we wanted.
|
||||
url_tuple = urlsplit(url)
|
||||
for entry in self.url_preview_url_blacklist:
|
||||
match = True
|
||||
for attrib in entry:
|
||||
pattern = entry[attrib]
|
||||
value = getattr(url_tuple, attrib)
|
||||
logger.debug(
|
||||
"Matching attrib '%s' with value '%s' against pattern '%s'",
|
||||
attrib,
|
||||
value,
|
||||
pattern,
|
||||
)
|
||||
|
||||
if value is None:
|
||||
match = False
|
||||
continue
|
||||
|
||||
# Some attributes might not be parsed as strings by urlsplit (such as the
|
||||
# port, which is parsed as an int). Because we use match functions that
|
||||
# expect strings, we want to make sure that's what we give them.
|
||||
value_str = str(value)
|
||||
|
||||
if pattern.startswith("^"):
|
||||
if not re.match(pattern, value_str):
|
||||
match = False
|
||||
continue
|
||||
else:
|
||||
if not fnmatch.fnmatch(value_str, pattern):
|
||||
match = False
|
||||
continue
|
||||
if match:
|
||||
logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
|
||||
raise SynapseError(
|
||||
403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
|
||||
)
|
||||
|
||||
# the in-memory cache:
|
||||
# * ensures that only one request is active at a time
|
||||
# * takes load off the DB for the thundering herds
|
||||
# * also caches any failures (unlike the DB) so we don't keep
|
||||
# requesting the same endpoint
|
||||
|
||||
observable = self._cache.get(url)
|
||||
|
||||
if not observable:
|
||||
download = run_in_background(self._do_preview, url, requester.user, ts)
|
||||
observable = ObservableDeferred(download, consumeErrors=True)
|
||||
self._cache[url] = observable
|
||||
else:
|
||||
logger.info("Returning cached response")
|
||||
|
||||
og = await make_deferred_yieldable(observable.observe())
|
||||
og = await self._url_previewer.preview(url, requester.user, ts)
|
||||
respond_with_json_bytes(request, 200, og, send_cors=True)
|
||||
|
||||
async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
    """Check the db, and download the URL and build a preview

    A cached result from the database is returned as-is if it has not expired
    and was successful (a 2xx response code). Otherwise the URL (or its oEmbed
    equivalent) is downloaded, an Open Graph response is built from it and the
    result is stored in the database cache.

    Args:
        url: The URL to preview.
        user: The user requesting the preview.
        ts: The timestamp requested for the preview.

    Returns:
        json-encoded og data
    """
    # check the URL cache in the DB (which will also provide us with
    # historical previews, if we have any)
    cache_result = await self.store.get_url_cache(url, ts)
    if (
        cache_result
        and cache_result["expires_ts"] > ts
        # Floor division, so that any 2xx response code counts as a success.
        # (True division would only ever match a code of exactly 200.)
        and cache_result["response_code"] // 100 == 2
    ):
        # It may be stored as text in the database, not as bytes (such as
        # PostgreSQL). If so, encode it back before handing it on.
        og = cache_result["og"]
        if isinstance(og, str):
            og = og.encode("utf8")
        return og

    # If this URL can be accessed via oEmbed, use that instead.
    url_to_download = url
    oembed_url = self._oembed.get_oembed_url(url)
    if oembed_url:
        url_to_download = oembed_url

    media_info = await self._handle_url(url_to_download, user)

    logger.debug("got media_info of '%s'", media_info)

    # The number of milliseconds that the response should be considered valid.
    expiration_ms = media_info.expires
    author_name: Optional[str] = None

    if _is_media(media_info.media_type):
        file_id = media_info.filesystem_id
        dims = await self.media_repo._generate_thumbnails(
            None, file_id, file_id, media_info.media_type, url_cache=True
        )

        # define our OG response for this media
        og = {
            "og:description": media_info.download_name,
            "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
            "og:image:type": media_info.media_type,
            "matrix:image:size": media_info.media_length,
        }

        if dims:
            og["og:image:width"] = dims["width"]
            og["og:image:height"] = dims["height"]
        else:
            # Pass the URL as a logging argument so it is only formatted if
            # the record is actually emitted.
            logger.warning("Couldn't get dims for %s", url)

    elif _is_html(media_info.media_type):
        # TODO: somehow stop a big HTML tree from exploding synapse's RAM

        with open(media_info.filename, "rb") as file:
            body = file.read()

        tree = decode_body(body, media_info.uri, media_info.media_type)
        if tree is not None:
            # Check if this HTML document points to oEmbed information and
            # defer to that.
            oembed_url = self._oembed.autodiscover_from_html(tree)
            og_from_oembed: JsonDict = {}
            if oembed_url:
                try:
                    oembed_info = await self._handle_url(
                        oembed_url, user, allow_data_urls=True
                    )
                except Exception as e:
                    # Fetching the oEmbed info failed, don't block the entire URL preview.
                    logger.warning(
                        "oEmbed fetch failed during URL preview: %s errored with %s",
                        oembed_url,
                        e,
                    )
                else:
                    (
                        og_from_oembed,
                        author_name,
                        expiration_ms,
                    ) = await self._handle_oembed_response(
                        url, oembed_info, expiration_ms
                    )

            # Parse Open Graph information from the HTML in case the oEmbed
            # response failed or is incomplete.
            og_from_html = parse_html_to_open_graph(tree)

            # Compile the Open Graph response by using the scraped
            # information from the HTML and overlaying any information
            # from the oEmbed response.
            og = {**og_from_html, **og_from_oembed}

            await self._precache_image_url(user, media_info, og)
        else:
            og = {}

    elif oembed_url:
        # Handle the oEmbed information.
        og, author_name, expiration_ms = await self._handle_oembed_response(
            url, media_info, expiration_ms
        )
        await self._precache_image_url(user, media_info, og)

    else:
        logger.warning("Failed to find any OG data in %s", url)
        og = {}

    # If we don't have a title but we have author_name, copy it as
    # title
    if not og.get("og:title") and author_name:
        og["og:title"] = author_name

    # filter out any stupidly long values
    keys_to_remove = []
    for k, v in og.items():
        # values can be numeric as well as strings, hence the cast to str
        if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
            logger.warning(
                "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
            )
            keys_to_remove.append(k)
    # Deleted in a second pass: a dict cannot be resized while iterating it.
    for k in keys_to_remove:
        del og[k]

    logger.debug("Calculated OG for %s as %s", url, og)

    jsonog = json_encoder.encode(og)

    # Cap the amount of time to consider a response valid.
    expiration_ms = min(expiration_ms, ONE_DAY)

    # store OG in history-aware DB cache
    await self.store.store_url_cache(
        url,
        media_info.response_code,
        media_info.etag,
        media_info.created_ts_ms + expiration_ms,
        jsonog,
        media_info.filesystem_id,
        media_info.created_ts_ms,
    )

    return jsonog.encode("utf8")
|
||||
|
||||
async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
    """
    Download a remote URL into a stream and summarise the response headers.

    Args:
        url: The URL to fetch.
        output_stream: The stream to write the content to.

    Returns:
        A DownloadResult holding the media length, the URL downloaded, the
        HTTP response code, the media type, the downloaded file name, the
        number of milliseconds the result is valid for and the etag header.

    Raises:
        SynapseError: If the download fails. A SynapseError from the HTTP
            client is passed through unchanged, a DNS lookup failure becomes
            a 502 and any other error becomes a 500.
    """

    try:
        logger.debug("Trying to get preview for url '%s'", url)
        request_headers = {
            b"Accept-Language": self.url_preview_accept_language,
            # Use a custom user agent for the preview because some sites will only return
            # Open Graph metadata to crawler user agents. Omit the Synapse version
            # string to avoid leaking information.
            b"User-Agent": [
                "Synapse (bot; +https://github.com/matrix-org/synapse)"
            ],
        }
        length, headers, uri, code = await self.client.get_file(
            url,
            output_stream=output_stream,
            max_size=self.max_spider_size,
            headers=request_headers,
            is_allowed_content_type=_is_previewable,
        )
    except SynapseError:
        # Pass SynapseErrors through directly, so that the servlet
        # handler will return a SynapseError to the client instead of
        # blank data or a 500.
        raise
    except DNSLookupError:
        # DNS lookup returned no results
        # Note: This will also be the case if one of the resolved IP
        # addresses is blacklisted
        raise SynapseError(
            502,
            "DNS resolution failure during URL preview generation",
            Codes.UNKNOWN,
        )
    except Exception as e:
        # FIXME: pass through 404s and other error messages nicely
        logger.warning("Error downloading %s: %r", url, e)

        reason = traceback.format_exception_only(sys.exc_info()[0], e)
        raise SynapseError(
            500,
            "Failed to download content: %s" % (reason,),
            Codes.UNKNOWN,
        )

    # Fall back to a generic binary type when the server did not send one.
    media_type = (
        headers[b"Content-Type"][0].decode("ascii")
        if b"Content-Type" in headers
        else "application/octet-stream"
    )

    download_name = get_filename_from_headers(headers)

    # FIXME: we should calculate a proper expiration based on the
    # Cache-Control and Expire headers. But for now, assume 1 hour.
    expires = ONE_HOUR

    etag: Optional[str] = None
    if b"ETag" in headers:
        etag = headers[b"ETag"][0].decode("ascii")

    return DownloadResult(
        length=length,
        uri=uri,
        response_code=code,
        media_type=media_type,
        download_name=download_name,
        expires=expires,
        etag=etag,
    )
|
||||
|
||||
async def _parse_data_url(
    self, url: str, output_stream: BinaryIO
) -> DownloadResult:
    """
    Decode a data: URL and write its payload to a stream.

    Args:
        url: The URL to parse.
        output_stream: The stream to write the content to.

    Returns:
        A DownloadResult holding the media length, the URL parsed, a 200
        response code, the media type, no file name, the number of
        milliseconds the result is valid for and no etag.

    Raises:
        SynapseError: With a 500 code if the URL cannot be opened, read or
            written to the stream.
    """

    try:
        logger.debug("Trying to parse data url '%s'", url)
        with urlopen(url) as url_info:
            # TODO Can this be more efficient.
            payload = url_info.read()
            output_stream.write(payload)
    except Exception as e:
        logger.warning("Error parsing data: URL %s: %r", url, e)

        reason = traceback.format_exception_only(sys.exc_info()[0], e)
        raise SynapseError(
            500,
            "Failed to parse data URL: %s" % (reason,),
            Codes.UNKNOWN,
        )

    # Read back the length that has been written.
    written_length = output_stream.tell()
    # urlopen shoves the media-type from the data URL into the content type
    # header object.
    media_type = url_info.headers.get_content_type()

    return DownloadResult(
        length=written_length,
        uri=url,
        # If it was parsed, consider this a 200 OK.
        response_code=200,
        media_type=media_type,
        # Some features are not supported by data: URLs.
        download_name=None,
        expires=ONE_HOUR,
        etag=None,
    )
|
||||
|
||||
async def _handle_url(
    self, url: str, user: UserID, allow_data_urls: bool = False
) -> MediaInfo:
    """
    Fetches content from a URL and parses the result to generate a MediaInfo.

    It uses the media storage provider to persist the fetched content and
    stores the mapping into the database.

    Args:
        url: The URL to fetch.
        user: The user who has requested this URL.
        allow_data_urls: True if data URLs should be allowed.

    Returns:
        A MediaInfo object describing the fetched content.

    Raises:
        SynapseError: If the URL is a data: URL and those are not allowed.
    """

    # TODO: we should probably honour robots.txt... except in practice
    # we're most likely being explicitly triggered by a human rather than a
    # bot, so are we really a robot?

    # File IDs are the current date followed by a random suffix.
    file_id = f"{datetime.date.today().isoformat()}_{random_string(16)}"

    file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

    with self.media_storage.store_into_file(file_info) as (
        stream,
        stored_path,
        finish,
    ):
        if not url.startswith("data:"):
            download_result = await self._download_url(url, stream)
        elif allow_data_urls:
            download_result = await self._parse_data_url(url, stream)
        else:
            raise SynapseError(
                500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
            )

        await finish()

    try:
        time_now_ms = self.clock.time_msec()

        await self.store.store_local_media(
            media_id=file_id,
            media_type=download_result.media_type,
            time_now_ms=time_now_ms,
            upload_name=download_result.download_name,
            media_length=download_result.length,
            user_id=user,
            url_cache=url,
        )
    except Exception as e:
        logger.error("Error handling downloaded %s: %r", url, e)
        # TODO: we really ought to delete the downloaded file in this
        # case, since we won't have recorded it in the db, and will
        # therefore not expire it.
        raise

    return MediaInfo(
        media_type=download_result.media_type,
        media_length=download_result.length,
        download_name=download_result.download_name,
        created_ts_ms=time_now_ms,
        filesystem_id=file_id,
        filename=stored_path,
        uri=download_result.uri,
        response_code=download_result.response_code,
        expires=download_result.expires,
        etag=download_result.etag,
    )
|
||||
|
||||
async def _precache_image_url(
    self, user: UserID, media_info: MediaInfo, og: JsonDict
) -> None:
    """
    Pre-cache the image (if one exists) for posterity

    The raw "og:image" entry is always removed from the dictionary. It is
    only replaced (with an MXC URL, plus type, size and, when available,
    dimensions) if the image could be fetched and is an image type.

    Args:
        user: The user requesting the preview.
        media_info: The media being previewed.
        og: The Open Graph dictionary. This is modified with image information.
    """
    # Remove the raw image URL, this will be replaced with an MXC URL, if
    # successful. If there's no image or it is blank, there's nothing to do.
    image_url = og.pop("og:image", None)
    if not image_url:
        return

    # The image URL from the HTML might be relative to the previewed page,
    # convert it to an URL which can be requested directly.
    if urlparse(image_url).scheme != "data":
        image_url = urljoin(media_info.uri, image_url)

    # FIXME: it might be cleaner to use the same flow as the main /preview_url
    # request itself and benefit from the same caching etc. But for now we
    # just rely on the caching on the master request to speed things up.
    try:
        image_info = await self._handle_url(image_url, user, allow_data_urls=True)
    except Exception as e:
        # Pre-caching the image failed, don't block the entire URL preview.
        logger.warning(
            "Pre-caching image failed during URL preview: %s errored with %s",
            image_url,
            e,
        )
        return

    # Whatever was fetched is only useful if it is actually an image.
    if not _is_media(image_info.media_type):
        return

    # TODO: make sure we don't choke on white-on-transparent images
    file_id = image_info.filesystem_id
    dims = await self.media_repo._generate_thumbnails(
        None, file_id, file_id, image_info.media_type, url_cache=True
    )
    if dims:
        og["og:image:width"] = dims["width"]
        og["og:image:height"] = dims["height"]
    else:
        logger.warning("Couldn't get dims for %s", image_url)

    og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
    og["og:image:type"] = image_info.media_type
    og["matrix:image:size"] = image_info.media_length
|
||||
|
||||
async def _handle_oembed_response(
    self, url: str, media_info: MediaInfo, expiration_ms: int
) -> Tuple[JsonDict, Optional[str], int]:
    """
    Parse the downloaded oEmbed info.

    Args:
        url: The URL which is being previewed (not the one which was
            requested).
        media_info: The media being previewed.
        expiration_ms: The length of time, in milliseconds, the media is valid for.

    Returns:
        A tuple of:
            The Open Graph dictionary, if the oEmbed info can be parsed.
            The author name if it could be retrieved from oEmbed.
            The (possibly updated) length of time, in milliseconds, the media is valid for.
    """
    # If JSON was not returned, there's nothing to do.
    if not _is_json(media_info.media_type):
        return {}, None, expiration_ms

    with open(media_info.filename, "rb") as stored_file:
        raw_body = stored_file.read()

    parsed = self._oembed.parse_oembed_response(url, raw_body)
    og_result = parsed.open_graph_result

    # Use the cache age from the oEmbed result, if one was given.
    valid_for_ms = expiration_ms
    if og_result and parsed.cache_age is not None:
        valid_for_ms = parsed.cache_age

    return og_result, parsed.author_name, valid_for_ms
|
||||
|
||||
def _start_expire_url_cache_data(self) -> Deferred:
    """Run the URL cache expiry job as a background process."""
    expiry_job = self._expire_url_cache_data
    return run_as_background_process("expire_url_cache_data", expiry_job)
|
||||
|
||||
async def _expire_url_cache_data(self) -> None:
    """Clean up expired url cache content, media and thumbnails.

    This runs in two passes: first the expired URL cache entries, then the
    media (and thumbnails) in the URL cache older than IMAGE_CACHE_EXPIRY_MS.
    """

    assert self._worker_run_media_background_jobs

    now = self.clock.time_msec()

    logger.debug("Running url preview cache expiry")

    def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
        """Attempt to remove the given chain of parent directories

        Args:
            dirs: The list of directory paths to delete, with children appearing
                before their parents.
        """
        for directory in dirs:
            try:
                os.rmdir(directory)
            except FileNotFoundError:
                # Already deleted, continue with deleting the rest
                continue
            except OSError as e:
                # Failed, skip deleting the rest of the parent dirs
                if e.errno != errno.ENOTEMPTY:
                    logger.warning(
                        "Failed to remove media directory while clearing url preview cache: %r: %s",
                        directory,
                        e,
                    )
                break

    # First we delete expired url cache entries
    expired_ids = await self.store.get_expired_url_cache(now)

    deleted_entries = []
    for media_id in expired_ids:
        cached_path = self.filepaths.url_cache_filepath(media_id)
        try:
            os.remove(cached_path)
        except FileNotFoundError:
            pass  # If the path doesn't exist, meh
        except OSError as e:
            logger.warning(
                "Failed to remove media while clearing url preview cache: %r: %s",
                media_id,
                e,
            )
            continue

        deleted_entries.append(media_id)

        try_remove_parent_dirs(
            self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
        )

    await self.store.delete_url_cache(deleted_entries)

    if not deleted_entries:
        logger.debug("No entries removed from url preview cache")
    else:
        logger.debug(
            "Deleted %d entries from url preview cache", len(deleted_entries)
        )

    # Now we delete old images associated with the url cache.
    # These may be cached for a bit on the client (i.e., they
    # may have a room open with a preview url thing open).
    # So we wait a couple of days before deleting, just in case.
    expire_before = now - IMAGE_CACHE_EXPIRY_MS
    old_media_ids = await self.store.get_url_cache_media_before(expire_before)

    deleted_media = []
    for media_id in old_media_ids:
        cached_path = self.filepaths.url_cache_filepath(media_id)
        try:
            os.remove(cached_path)
        except FileNotFoundError:
            pass  # If the path doesn't exist, meh
        except OSError as e:
            logger.warning(
                "Failed to remove media from url preview cache: %r: %s", media_id, e
            )
            continue

        try_remove_parent_dirs(
            self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
        )

        thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
        try:
            shutil.rmtree(thumbnail_dir)
        except FileNotFoundError:
            pass  # If the path doesn't exist, meh
        except OSError as e:
            logger.warning(
                "Failed to remove media from url preview cache: %r: %s", media_id, e
            )
            continue

        # Only recorded as removed once both the file and its thumbnails
        # have gone.
        deleted_media.append(media_id)

        # Note that one of the directories to be deleted has already been
        # removed by the `rmtree` above.
        try_remove_parent_dirs(
            self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
        )

    await self.store.delete_url_cache_media(deleted_media)

    if not deleted_media:
        logger.debug("No media removed from url preview cache")
    else:
        logger.debug("Deleted %d media from url preview cache", len(deleted_media))
|
||||
|
||||
|
||||
def _is_media(content_type: str) -> bool:
|
||||
return content_type.lower().startswith("image/")
|
||||
|
||||
|
||||
def _is_html(content_type: str) -> bool:
|
||||
content_type = content_type.lower()
|
||||
return content_type.startswith("text/html") or content_type.startswith(
|
||||
"application/xhtml"
|
||||
)
|
||||
|
||||
|
||||
def _is_json(content_type: str) -> bool:
|
||||
return content_type.lower().startswith("application/json")
|
||||
|
||||
|
||||
def _is_previewable(content_type: str) -> bool:
    """Returns True for content types for which we will perform URL preview and False
    otherwise.

    A content type is previewable when it is HTML, an image or JSON.
    """
    # Checked in the same order as a chained `or`; any() stops at the first hit.
    return any(
        matches(content_type) for matches in (_is_html, _is_media, _is_json)
    )
|
||||
|
|
|
@ -26,8 +26,8 @@ from twisted.internet.interfaces import IAddress, IResolutionReceiver
|
|||
from twisted.test.proto_helpers import AccumulatingProtocol, MemoryReactor
|
||||
|
||||
from synapse.config.oembed import OEmbedEndpointConfig
|
||||
from synapse.media.url_previewer import IMAGE_CACHE_EXPIRY_MS
|
||||
from synapse.rest.media.media_repository_resource import MediaRepositoryResource
|
||||
from synapse.rest.media.preview_url_resource import IMAGE_CACHE_EXPIRY_MS
|
||||
from synapse.server import HomeServer
|
||||
from synapse.types import JsonDict
|
||||
from synapse.util import Clock
|
||||
|
@ -36,7 +36,6 @@ from synapse.util.stringutils import parse_and_validate_mxc_uri
|
|||
from tests import unittest
|
||||
from tests.server import FakeTransport
|
||||
from tests.test_utils import SMALL_PNG
|
||||
from tests.utils import MockClock
|
||||
|
||||
try:
|
||||
import lxml
|
||||
|
@ -117,8 +116,9 @@ class URLPreviewTests(unittest.HomeserverTestCase):
|
|||
return hs
|
||||
|
||||
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
|
||||
self.media_repo = hs.get_media_repository_resource()
|
||||
self.preview_url = self.media_repo.children[b"preview_url"]
|
||||
self.media_repo = hs.get_media_repository()
|
||||
media_repo_resource = hs.get_media_repository_resource()
|
||||
self.preview_url = media_repo_resource.children[b"preview_url"]
|
||||
|
||||
self.lookups: Dict[str, Any] = {}
|
||||
|
||||
|
@ -193,9 +193,9 @@ class URLPreviewTests(unittest.HomeserverTestCase):
|
|||
)
|
||||
|
||||
# Clear the in-memory cache
|
||||
self.assertIn("http://matrix.org", self.preview_url._cache)
|
||||
self.preview_url._cache.pop("http://matrix.org")
|
||||
self.assertNotIn("http://matrix.org", self.preview_url._cache)
|
||||
self.assertIn("http://matrix.org", self.preview_url._url_previewer._cache)
|
||||
self.preview_url._url_previewer._cache.pop("http://matrix.org")
|
||||
self.assertNotIn("http://matrix.org", self.preview_url._url_previewer._cache)
|
||||
|
||||
# Check the database cache returns the correct response
|
||||
channel = self.make_request(
|
||||
|
@ -1073,7 +1073,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
|
|||
"""Test that files are not stored in or fetched from storage providers."""
|
||||
host, media_id = self._download_image()
|
||||
|
||||
rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
|
||||
rel_file_path = self.media_repo.filepaths.url_cache_filepath_rel(media_id)
|
||||
media_store_path = os.path.join(self.media_store_path, rel_file_path)
|
||||
storage_provider_path = os.path.join(self.storage_path, rel_file_path)
|
||||
|
||||
|
@ -1116,7 +1116,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
|
|||
host, media_id = self._download_image()
|
||||
|
||||
rel_thumbnail_path = (
|
||||
self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
|
||||
self.media_repo.filepaths.url_cache_thumbnail_directory_rel(media_id)
|
||||
)
|
||||
media_store_thumbnail_path = os.path.join(
|
||||
self.media_store_path, rel_thumbnail_path
|
||||
|
@ -1143,7 +1143,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
|
|||
self.assertEqual(channel.code, 200)
|
||||
|
||||
# Remove the original, otherwise thumbnails will regenerate
|
||||
rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
|
||||
rel_file_path = self.media_repo.filepaths.url_cache_filepath_rel(media_id)
|
||||
media_store_path = os.path.join(self.media_store_path, rel_file_path)
|
||||
os.remove(media_store_path)
|
||||
|
||||
|
@ -1166,26 +1166,24 @@ class URLPreviewTests(unittest.HomeserverTestCase):
|
|||
|
||||
def test_cache_expiry(self) -> None:
|
||||
"""Test that URL cache files and thumbnails are cleaned up properly on expiry."""
|
||||
self.preview_url.clock = MockClock()
|
||||
|
||||
_host, media_id = self._download_image()
|
||||
|
||||
file_path = self.preview_url.filepaths.url_cache_filepath(media_id)
|
||||
file_dirs = self.preview_url.filepaths.url_cache_filepath_dirs_to_delete(
|
||||
file_path = self.media_repo.filepaths.url_cache_filepath(media_id)
|
||||
file_dirs = self.media_repo.filepaths.url_cache_filepath_dirs_to_delete(
|
||||
media_id
|
||||
)
|
||||
thumbnail_dir = self.preview_url.filepaths.url_cache_thumbnail_directory(
|
||||
thumbnail_dir = self.media_repo.filepaths.url_cache_thumbnail_directory(
|
||||
media_id
|
||||
)
|
||||
thumbnail_dirs = self.preview_url.filepaths.url_cache_thumbnail_dirs_to_delete(
|
||||
thumbnail_dirs = self.media_repo.filepaths.url_cache_thumbnail_dirs_to_delete(
|
||||
media_id
|
||||
)
|
||||
|
||||
self.assertTrue(os.path.isfile(file_path))
|
||||
self.assertTrue(os.path.isdir(thumbnail_dir))
|
||||
|
||||
self.preview_url.clock.advance_time_msec(IMAGE_CACHE_EXPIRY_MS + 1)
|
||||
self.get_success(self.preview_url._expire_url_cache_data())
|
||||
self.reactor.advance(IMAGE_CACHE_EXPIRY_MS * 1000 + 1)
|
||||
self.get_success(self.preview_url._url_previewer._expire_url_cache_data())
|
||||
|
||||
for path in [file_path] + file_dirs + [thumbnail_dir] + thumbnail_dirs:
|
||||
self.assertFalse(
|
||||
|
|
Loading…
Reference in New Issue