Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Implement a content type allow list for URL previews (#11936)
Browse files Browse the repository at this point in the history
This implements an allow list of content types for which Synapse will attempt a URL preview. If a URL resolves to a resource whose content type isn't in the list, the download is terminated immediately.

This makes sense given that Synapse would never successfully generate a URL preview for such files in the first place, and helps prevent issues with streaming media servers, such as #8302.

Signed-off-by: Denis Kasak <dkasak@termina.org.uk>
  • Loading branch information
dkasak committed Feb 10, 2022
1 parent 06e5a76 commit 337f38c
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 0 deletions.
1 change: 1 addition & 0 deletions changelog.d/11936.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Implement an allow list of content types for which we will attempt to preview a URL. This prevents Synapse from making useless longer-lived connections to streaming media servers.
18 changes: 18 additions & 0 deletions synapse/http/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
TYPE_CHECKING,
Any,
BinaryIO,
Callable,
Dict,
Iterable,
List,
Expand Down Expand Up @@ -693,12 +694,18 @@ async def get_file(
output_stream: BinaryIO,
max_size: Optional[int] = None,
headers: Optional[RawHeaders] = None,
is_allowed_content_type: Optional[Callable[[str], bool]] = None,
) -> Tuple[int, Dict[bytes, List[bytes]], str, int]:
"""GETs a file from a given URL
Args:
url: The URL to GET
output_stream: File to write the response body to.
headers: A map from header name to a list of values for that header
is_allowed_content_type: A predicate to determine whether the
content type of the file we're downloading is allowed. If set and
it evaluates to False when called with the content type, the
request will be terminated before completing the download by
raising SynapseError.
Returns:
A tuple of the file length, dict of the response
headers, absolute URI of the response and HTTP response code.
Expand Down Expand Up @@ -726,6 +733,17 @@ async def get_file(
HTTPStatus.BAD_GATEWAY, "Got error %d" % (response.code,), Codes.UNKNOWN
)

if is_allowed_content_type and b"Content-Type" in resp_headers:
content_type = resp_headers[b"Content-Type"][0].decode("ascii")
if not is_allowed_content_type(content_type):
raise SynapseError(
HTTPStatus.BAD_GATEWAY,
(
"Requested file's content type not allowed for this operation: %s"
% content_type
),
)

# TODO: if our Content-Type is HTML or something, just read the first
# N bytes into RAM rather than saving it all to disk only to read it
# straight back in again
Expand Down
8 changes: 8 additions & 0 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResu
output_stream=output_stream,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
is_allowed_content_type=_is_previewable,
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
Expand Down Expand Up @@ -761,3 +762,10 @@ def _is_html(content_type: str) -> bool:

def _is_json(content_type: str) -> bool:
return content_type.lower().startswith("application/json")


def _is_previewable(content_type: str) -> bool:
    """Decide whether a URL preview will be attempted for a content type.

    A content type is previewable when any of the _is_html, _is_media or
    _is_json predicates accepts it; otherwise False is returned.
    """
    # Checked in the same order as before; evaluation stops at the first match.
    predicates = (_is_html, _is_media, _is_json)
    return any(accepts(content_type) for accepts in predicates)
72 changes: 72 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,78 @@ def test_non_ascii_preview_httpequiv(self):
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

def test_video_rejected(self):
    """A preview request for a URL served as video/mp4 is answered with a 502."""
    self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

    body = b"anything"

    channel = self.make_request(
        "GET",
        "preview_url?url=http://matrix.org",
        shorthand=False,
        await_result=False,
    )
    self.pump()

    # Connect the outbound client protocol to an accumulating fake server.
    client = self.reactor.tcpClients[0][2].buildProtocol(None)
    server = AccumulatingProtocol()
    server.makeConnection(FakeTransport(client, self.reactor))
    client.makeConnection(FakeTransport(server, self.reactor))

    # Headers advertise a video content type, followed by the payload.
    response_head = (
        b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
        b"Content-Type: video/mp4\r\n\r\n"
    ) % (len(body),)
    client.dataReceived(response_head + body)

    self.pump()

    expected_body = {
        "errcode": "M_UNKNOWN",
        "error": "Requested file's content type not allowed for this operation: video/mp4",
    }
    self.assertEqual(channel.code, 502)
    self.assertEqual(channel.json_body, expected_body)

def test_audio_rejected(self):
    """A preview request for a URL served as audio/aac is answered with a 502."""
    self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

    body = b"anything"

    channel = self.make_request(
        "GET",
        "preview_url?url=http://matrix.org",
        shorthand=False,
        await_result=False,
    )
    self.pump()

    # Connect the outbound client protocol to an accumulating fake server.
    client = self.reactor.tcpClients[0][2].buildProtocol(None)
    server = AccumulatingProtocol()
    server.makeConnection(FakeTransport(client, self.reactor))
    client.makeConnection(FakeTransport(server, self.reactor))

    # Headers advertise an audio content type, followed by the payload.
    response_head = (
        b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
        b"Content-Type: audio/aac\r\n\r\n"
    ) % (len(body),)
    client.dataReceived(response_head + body)

    self.pump()

    expected_body = {
        "errcode": "M_UNKNOWN",
        "error": "Requested file's content type not allowed for this operation: audio/aac",
    }
    self.assertEqual(channel.code, 502)
    self.assertEqual(channel.json_body, expected_body)

def test_non_ascii_preview_content_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

Expand Down

0 comments on commit 337f38c

Please sign in to comment.