Switch to dash encoding for table/database/row-pk in paths

* Dash encoding functions, tests and docs, refs #1439
* dash encoding is now like percent encoding but with dashes
* Use dash-encoding for row PKs and ?_next=, refs #1439
* Use dash encoding for table names, refs #1439
* Use dash encoding for database names, too, refs #1439

See also https://simonwillison.net/2022/Mar/5/dash-encoding/
pull/1495/merge
Simon Willison 2022-03-07 07:38:29 -08:00 zatwierdzone przez GitHub
rodzic de810f49cc
commit 1baa030eca
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
13 zmienionych plików z 173 dodań i 53 usunięć

Wyświetl plik

@ -1,4 +1,4 @@
from .utils import path_with_format, HASH_LENGTH, PrefixedUrlString
from .utils import dash_encode, path_with_format, HASH_LENGTH, PrefixedUrlString
import urllib
@ -31,20 +31,20 @@ class Urls:
db = self.ds.databases[database]
if self.ds.setting("hash_urls") and db.hash:
path = self.path(
f"{urllib.parse.quote(database)}-{db.hash[:HASH_LENGTH]}", format=format
f"{dash_encode(database)}-{db.hash[:HASH_LENGTH]}", format=format
)
else:
path = self.path(urllib.parse.quote(database), format=format)
path = self.path(dash_encode(database), format=format)
return path
def table(self, database, table, format=None):
path = f"{self.database(database)}/{urllib.parse.quote_plus(table)}"
path = f"{self.database(database)}/{dash_encode(table)}"
if format is not None:
path = path_with_format(path=path, format=format)
return PrefixedUrlString(path)
def query(self, database, query, format=None):
path = f"{self.database(database)}/{urllib.parse.quote_plus(query)}"
path = f"{self.database(database)}/{dash_encode(query)}"
if format is not None:
path = path_with_format(path=path, format=format)
return PrefixedUrlString(path)

Wyświetl plik

@ -112,12 +112,12 @@ async def await_me_maybe(value: typing.Any) -> typing.Any:
def urlsafe_components(token):
"""Splits token on commas and URL decodes each component"""
return [urllib.parse.unquote_plus(b) for b in token.split(",")]
"""Splits token on commas and dash-decodes each component"""
return [dash_decode(b) for b in token.split(",")]
def path_from_row_pks(row, pks, use_rowid, quote=True):
"""Generate an optionally URL-quoted unique identifier
"""Generate an optionally dash-quoted unique identifier
for a row from its primary keys."""
if use_rowid:
bits = [row["rowid"]]
@ -126,7 +126,7 @@ def path_from_row_pks(row, pks, use_rowid, quote=True):
row[pk]["value"] if isinstance(row[pk], dict) else row[pk] for pk in pks
]
if quote:
bits = [urllib.parse.quote_plus(str(bit)) for bit in bits]
bits = [dash_encode(str(bit)) for bit in bits]
else:
bits = [str(bit) for bit in bits]
@ -1140,3 +1140,36 @@ def add_cors_headers(headers):
headers["Access-Control-Allow-Origin"] = "*"
headers["Access-Control-Allow-Headers"] = "Authorization"
headers["Access-Control-Expose-Headers"] = "Link"
_DASH_ENCODING_SAFE = frozenset(
b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
b"abcdefghijklmnopqrstuvwxyz"
b"0123456789_"
# This is the same as Python percent-encoding but I removed
# '.' and '-' and '~'
)
class DashEncoder(dict):
# Keeps a cache internally, via __missing__
def __missing__(self, b):
# Handle a cache miss, store encoded string in cache and return.
res = chr(b) if b in _DASH_ENCODING_SAFE else "-{:02X}".format(b)
self[b] = res
return res
_dash_encoder = DashEncoder().__getitem__
@documented
def dash_encode(s: str) -> str:
"Returns dash-encoded string - for example ``/foo/bar`` -> ``-2Ffoo-2Fbar``"
return "".join(_dash_encoder(char) for char in s.encode("utf-8"))
@documented
def dash_decode(s: str) -> str:
"Decodes a dash-encoded string, so ``-2Ffoo-2Fbar`` -> ``/foo/bar``"
return urllib.parse.unquote(s.replace("-", "%"))

Wyświetl plik

@ -17,6 +17,8 @@ from datasette.utils import (
InvalidSql,
LimitedWriter,
call_with_supported_arguments,
dash_decode,
dash_encode,
path_from_row_pks,
path_with_added_args,
path_with_removed_args,
@ -203,17 +205,17 @@ class DataView(BaseView):
async def resolve_db_name(self, request, db_name, **kwargs):
hash = None
name = None
db_name = urllib.parse.unquote_plus(db_name)
if db_name not in self.ds.databases and "-" in db_name:
decoded_name = dash_decode(db_name)
if decoded_name not in self.ds.databases and "-" in db_name:
# No matching DB found, maybe it's a name-hash?
name_bit, hash_bit = db_name.rsplit("-", 1)
if name_bit not in self.ds.databases:
if dash_decode(name_bit) not in self.ds.databases:
raise NotFound(f"Database not found: {name}")
else:
name = name_bit
name = dash_decode(name_bit)
hash = hash_bit
else:
name = db_name
name = decoded_name
try:
db = self.ds.databases[name]
@ -233,9 +235,7 @@ class DataView(BaseView):
return await db.table_exists(t)
table, _format = await resolve_table_and_format(
table_and_format=urllib.parse.unquote_plus(
kwargs["table_and_format"]
),
table_and_format=dash_decode(kwargs["table_and_format"]),
table_exists=async_table_exists,
allowed_formats=self.ds.renderers.keys(),
)
@ -243,11 +243,11 @@ class DataView(BaseView):
if _format:
kwargs["as_format"] = f".{_format}"
elif kwargs.get("table"):
kwargs["table"] = urllib.parse.unquote_plus(kwargs["table"])
kwargs["table"] = dash_decode(kwargs["table"])
should_redirect = self.ds.urls.path(f"{name}-{expected}")
if kwargs.get("table"):
should_redirect += "/" + urllib.parse.quote_plus(kwargs["table"])
should_redirect += "/" + dash_encode(kwargs["table"])
if kwargs.get("pk_path"):
should_redirect += "/" + kwargs["pk_path"]
if kwargs.get("as_format"):
@ -467,7 +467,7 @@ class DataView(BaseView):
return await db.table_exists(t)
table, _ext_format = await resolve_table_and_format(
table_and_format=urllib.parse.unquote_plus(args["table_and_format"]),
table_and_format=dash_decode(args["table_and_format"]),
table_exists=async_table_exists,
allowed_formats=self.ds.renderers.keys(),
)
@ -475,7 +475,7 @@ class DataView(BaseView):
args["table"] = table
del args["table_and_format"]
elif "table" in args:
args["table"] = urllib.parse.unquote_plus(args["table"])
args["table"] = dash_decode(args["table"])
return _format, args
async def view_get(self, request, database, hash, correct_hash_provided, **kwargs):

Wyświetl plik

@ -12,6 +12,7 @@ from datasette.utils import (
MultiParams,
append_querystring,
compound_keys_after_sql,
dash_encode,
escape_sqlite,
filters_should_redirect,
is_url,
@ -142,7 +143,7 @@ class RowTableShared(DataView):
'<a href="{base_url}{database}/{table}/{flat_pks_quoted}">{flat_pks}</a>'.format(
base_url=base_url,
database=database,
table=urllib.parse.quote_plus(table),
table=dash_encode(table),
flat_pks=str(markupsafe.escape(pk_path)),
flat_pks_quoted=path_from_row_pks(row, pks, not pks),
)
@ -199,8 +200,8 @@ class RowTableShared(DataView):
link_template.format(
database=database,
base_url=base_url,
table=urllib.parse.quote_plus(other_table),
link_id=urllib.parse.quote_plus(str(value)),
table=dash_encode(other_table),
link_id=dash_encode(str(value)),
id=str(markupsafe.escape(value)),
label=str(markupsafe.escape(label)) or "-",
)
@ -765,7 +766,7 @@ class TableView(RowTableShared):
if prefix is None:
prefix = "$null"
else:
prefix = urllib.parse.quote_plus(str(prefix))
prefix = dash_encode(str(prefix))
next_value = f"{prefix},{next_value}"
added_args = {"_next": next_value}
if sort:

Wyświetl plik

@ -876,6 +876,32 @@ Utility function for calling ``await`` on a return value if it is awaitable, oth
.. autofunction:: datasette.utils.await_me_maybe
.. _internals_dash_encoding:
Dash encoding
-------------
Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URLs that reference them.
Dash encoding uses the same algorithm as `URL percent-encoding <https://developer.mozilla.org/en-US/docs/Glossary/percent-encoding>`__, but with the ``-`` hyphen character used in place of ``%``.
Any character other than ``ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz 0123456789_`` will be replaced by the numeric equivalent preceded by a hyphen. For example:
- ``/`` becomes ``-2F``
- ``.`` becomes ``-2E``
- ``%`` becomes ``-25``
- ``-`` becomes ``-2D``
- Space character becomes ``-20``
- ``polls/2022.primary`` becomes ``polls-2F2022-2Eprimary``
.. _internals_utils_dash_encode:
.. autofunction:: datasette.utils.dash_encode
.. _internals_utils_dash_decode:
.. autofunction:: datasette.utils.dash_decode
.. _internals_tracer:
datasette.tracer

Wyświetl plik

@ -406,6 +406,7 @@ CREATE TABLE compound_primary_key (
);
INSERT INTO compound_primary_key VALUES ('a', 'b', 'c');
INSERT INTO compound_primary_key VALUES ('a/b', '.c-d', 'c');
CREATE TABLE compound_three_primary_keys (
pk1 varchar(30),

Wyświetl plik

@ -143,7 +143,7 @@ def test_database_page(app_client):
"name": "compound_primary_key",
"columns": ["pk1", "pk2", "content"],
"primary_keys": ["pk1", "pk2"],
"count": 1,
"count": 2,
"hidden": False,
"fts_table": None,
"foreign_keys": {"incoming": [], "outgoing": []},
@ -942,7 +942,7 @@ def test_cors(app_client_with_cors, path, status_code):
)
def test_database_with_space_in_name(app_client_two_attached_databases, path):
response = app_client_two_attached_databases.get(
"/extra database" + path, follow_redirects=True
"/extra-20database" + path, follow_redirects=True
)
assert response.status == 200
@ -953,7 +953,7 @@ def test_common_prefix_database_names(app_client_conflicting_database_names):
d["name"]
for d in app_client_conflicting_database_names.get("/-/databases.json").json
]
for db_name, path in (("foo", "/foo.json"), ("foo-bar", "/foo-bar.json")):
for db_name, path in (("foo", "/foo.json"), ("foo-bar", "/foo-2Dbar.json")):
data = app_client_conflicting_database_names.get(path).json
assert db_name == data["database"]
@ -992,3 +992,16 @@ async def test_hidden_sqlite_stat1_table():
data = (await ds.client.get("/db.json?_show_hidden=1")).json()
tables = [(t["name"], t["hidden"]) for t in data["tables"]]
assert tables == [("normal", False), ("sqlite_stat1", True)]
@pytest.mark.asyncio
@pytest.mark.parametrize("db_name", ("foo", r"fo%o", "f~/c.d"))
async def test_dash_encoded_database_names(db_name):
ds = Datasette()
ds.add_memory_database(db_name)
response = await ds.client.get("/.json")
assert db_name in response.json().keys()
path = response.json()[db_name]["path"]
# And the JSON for that database
response2 = await ds.client.get(path + ".json")
assert response2.status_code == 200

Wyświetl plik

@ -9,6 +9,7 @@ from datasette.app import SETTINGS
from datasette.plugins import DEFAULT_PLUGINS
from datasette.cli import cli, serve
from datasette.version import __version__
from datasette.utils import dash_encode
from datasette.utils.sqlite import sqlite3
from click.testing import CliRunner
import io
@ -294,12 +295,12 @@ def test_weird_database_names(ensure_eventloop, tmpdir, filename):
assert result1.exit_code == 0, result1.output
filename_no_stem = filename.rsplit(".", 1)[0]
expected_link = '<a href="/{}">{}</a>'.format(
urllib.parse.quote(filename_no_stem), filename_no_stem
dash_encode(filename_no_stem), filename_no_stem
)
assert expected_link in result1.output
# Now try hitting that database page
result2 = runner.invoke(
cli, [db_path, "--get", "/{}".format(urllib.parse.quote(filename_no_stem))]
cli, [db_path, "--get", "/{}".format(dash_encode(filename_no_stem))]
)
assert result2.exit_code == 0, result2.output

Wyświetl plik

@ -29,7 +29,7 @@ def test_homepage(app_client_two_attached_databases):
)
# Should be two attached databases
assert [
{"href": r"/extra%20database", "text": "extra database"},
{"href": r"/extra-20database", "text": "extra database"},
{"href": "/fixtures", "text": "fixtures"},
] == [{"href": a["href"], "text": a.text.strip()} for a in soup.select("h2 a")]
# Database should show count text and attached tables
@ -44,8 +44,8 @@ def test_homepage(app_client_two_attached_databases):
{"href": a["href"], "text": a.text.strip()} for a in links_p.findAll("a")
]
assert [
{"href": r"/extra%20database/searchable", "text": "searchable"},
{"href": r"/extra%20database/searchable_view", "text": "searchable_view"},
{"href": r"/extra-20database/searchable", "text": "searchable"},
{"href": r"/extra-20database/searchable_view", "text": "searchable_view"},
] == table_links
@ -140,7 +140,7 @@ def test_database_page(app_client):
assert queries_ul is not None
assert [
(
"/fixtures/%F0%9D%90%9C%F0%9D%90%A2%F0%9D%90%AD%F0%9D%90%A2%F0%9D%90%9E%F0%9D%90%AC",
"/fixtures/-F0-9D-90-9C-F0-9D-90-A2-F0-9D-90-AD-F0-9D-90-A2-F0-9D-90-9E-F0-9D-90-AC",
"𝐜𝐢𝐭𝐢𝐞𝐬",
),
("/fixtures/from_async_hook", "from_async_hook"),
@ -193,11 +193,11 @@ def test_row_redirects_with_url_hash(app_client_with_hash):
def test_row_strange_table_name_with_url_hash(app_client_with_hash):
response = app_client_with_hash.get("/fixtures/table%2Fwith%2Fslashes.csv/3")
response = app_client_with_hash.get("/fixtures/table-2Fwith-2Fslashes-2Ecsv/3")
assert response.status == 302
assert response.headers["Location"].endswith("/table%2Fwith%2Fslashes.csv/3")
assert response.headers["Location"].endswith("/table-2Fwith-2Fslashes-2Ecsv/3")
response = app_client_with_hash.get(
"/fixtures/table%2Fwith%2Fslashes.csv/3", follow_redirects=True
"/fixtures/table-2Fwith-2Fslashes-2Ecsv/3", follow_redirects=True
)
assert response.status == 200
@ -345,20 +345,38 @@ def test_row_links_from_other_tables(app_client, path, expected_text, expected_l
assert link == expected_link
def test_row_html_compound_primary_key(app_client):
response = app_client.get("/fixtures/compound_primary_key/a,b")
@pytest.mark.parametrize(
"path,expected",
(
(
"/fixtures/compound_primary_key/a,b",
[
[
'<td class="col-pk1 type-str">a</td>',
'<td class="col-pk2 type-str">b</td>',
'<td class="col-content type-str">c</td>',
]
],
),
(
"/fixtures/compound_primary_key/a-2Fb,-2Ec-2Dd",
[
[
'<td class="col-pk1 type-str">a/b</td>',
'<td class="col-pk2 type-str">.c-d</td>',
'<td class="col-content type-str">c</td>',
]
],
),
),
)
def test_row_html_compound_primary_key(app_client, path, expected):
response = app_client.get(path)
assert response.status == 200
table = Soup(response.body, "html.parser").find("table")
assert ["pk1", "pk2", "content"] == [
th.string.strip() for th in table.select("thead th")
]
expected = [
[
'<td class="col-pk1 type-str">a</td>',
'<td class="col-pk2 type-str">b</td>',
'<td class="col-content type-str">c</td>',
]
]
assert expected == [
[str(td) for td in tr.select("td")] for tr in table.select("tbody tr")
]

Wyświetl plik

@ -121,7 +121,7 @@ def test_database(ds, base_url, format, expected):
("/", "name", None, "/_memory/name"),
("/prefix/", "name", None, "/prefix/_memory/name"),
("/", "name", "json", "/_memory/name.json"),
("/", "name.json", "json", "/_memory/name.json?_format=json"),
("/", "name.json", "json", "/_memory/name-2Ejson.json"),
],
)
def test_table_and_query(ds, base_url, name, format, expected):

Wyświetl plik

@ -136,7 +136,10 @@ def test_table_shape_object(app_client):
def test_table_shape_object_compound_primary_key(app_client):
response = app_client.get("/fixtures/compound_primary_key.json?_shape=object")
assert {"a,b": {"pk1": "a", "pk2": "b", "content": "c"}} == response.json
assert response.json == {
"a,b": {"pk1": "a", "pk2": "b", "content": "c"},
"a-2Fb,-2Ec-2Dd": {"pk1": "a/b", "pk2": ".c-d", "content": "c"},
}
def test_table_with_slashes_in_name(app_client):
@ -308,7 +311,7 @@ def test_sortable(app_client, query_string, sort_key, human_description_en):
path = response.json["next_url"]
if path:
path = path.replace("http://localhost", "")
assert 5 == page
assert page == 5
expected = list(generate_sortable_rows(201))
expected.sort(key=sort_key)
assert [r["content"] for r in expected] == [r["content"] for r in fetched]

Wyświetl plik

@ -563,11 +563,17 @@ def test_table_html_compound_primary_key(app_client):
'<td class="col-pk1 type-str">a</td>',
'<td class="col-pk2 type-str">b</td>',
'<td class="col-content type-str">c</td>',
]
],
[
'<td class="col-Link type-pk"><a href="/fixtures/compound_primary_key/a-2Fb,-2Ec-2Dd">a/b,.c-d</a></td>',
'<td class="col-pk1 type-str">a/b</td>',
'<td class="col-pk2 type-str">.c-d</td>',
'<td class="col-content type-str">c</td>',
],
]
assert expected == [
assert [
[str(td) for td in tr.select("td")] for tr in table.select("tbody tr")
]
] == expected
def test_table_html_foreign_key_links(app_client):

Wyświetl plik

@ -93,7 +93,7 @@ def test_path_with_replaced_args(path, args, expected):
"row,pks,expected_path",
[
({"A": "foo", "B": "bar"}, ["A", "B"], "foo,bar"),
({"A": "f,o", "B": "bar"}, ["A", "B"], "f%2Co,bar"),
({"A": "f,o", "B": "bar"}, ["A", "B"], "f-2Co,bar"),
({"A": 123}, ["A"], "123"),
(
utils.CustomRow(
@ -646,3 +646,21 @@ async def test_derive_named_parameters(sql, expected):
db = ds.get_database("_memory")
params = await utils.derive_named_parameters(db, sql)
assert params == expected
@pytest.mark.parametrize(
"original,expected",
(
("abc", "abc"),
("/foo/bar", "-2Ffoo-2Fbar"),
("/-/bar", "-2F-2D-2Fbar"),
("-/db-/table.csv", "-2D-2Fdb-2D-2Ftable-2Ecsv"),
(r"%~-/", "-25-7E-2D-2F"),
("-25-7E-2D-2F", "-2D25-2D7E-2D2D-2D2F"),
),
)
def test_dash_encoding(original, expected):
actual = utils.dash_encode(original)
assert actual == expected
# And test round-trip
assert original == utils.dash_decode(actual)