kopia lustrzana https://github.com/jupyterhub/repo2docker
Porównaj commity
8 Commity
34245597c3
...
f1248f6c14
Autor | SHA1 | Data |
---|---|---|
Sol Lee | f1248f6c14 | |
Sol Lee | c948d78013 | |
Sol Lee | 7faa53c83b | |
Sol Lee | 0808018da7 | |
Sol Lee | 1039d8dcb9 | |
Sol Lee | 7712751c8f | |
Sol Lee | 472c5cdc29 | |
Sol Lee | 391b9bc5ba |
|
@ -39,7 +39,8 @@ where ``<source-repository>`` is:
|
|||
|
||||
* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
|
||||
* a Zenodo DOI (``10.5281/zenodo.1211089``),
|
||||
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
|
||||
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``),
|
||||
* a URL of a CKAN_ dataset (``https://demo.ckan.org/dataset/sample-dataset-1``), or
|
||||
* a path to a local directory (``a/local/directory``)
|
||||
|
||||
of the source repository you want to build.
|
||||
|
@ -136,3 +137,4 @@ Command line API
|
|||
|
||||
.. _Pytudes: https://github.com/norvig/pytudes
|
||||
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
|
||||
.. _CKAN: https://ckan.org
|
||||
|
|
|
@ -152,6 +152,7 @@ class Repo2Docker(Application):
|
|||
contentproviders.Dataverse,
|
||||
contentproviders.Hydroshare,
|
||||
contentproviders.Swhid,
|
||||
contentproviders.CKAN,
|
||||
contentproviders.Mercurial,
|
||||
contentproviders.Git,
|
||||
],
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from .base import Local
|
||||
from .ckan import CKAN
|
||||
from .dataverse import Dataverse
|
||||
from .figshare import Figshare
|
||||
from .git import Git
|
||||
|
|
|
@ -0,0 +1,127 @@
|
|||
from datetime import datetime, timedelta, timezone
|
||||
from os import path
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
from requests import Session
|
||||
|
||||
from .. import __version__
|
||||
from .base import ContentProvider
|
||||
|
||||
|
||||
class CKAN(ContentProvider):
    """Provide contents of a remote CKAN dataset.

    ``detect`` recognises URLs whose path ends in ``/dataset/<id>``,
    optionally followed by ``/history/<activity_id>`` or carrying an
    ``activity_id`` query parameter. ``fetch`` downloads every resource
    listed in the dataset metadata into the output directory.
    """

    def __init__(self):
        super().__init__()
        # A single session so every request carries the same user agent.
        self.session = Session()
        self.session.headers.update(
            {
                "user-agent": f"repo2docker {__version__}",
            }
        )

    def _fetch_version(self, api_url):
        """Fetch dataset modified date and convert to epoch.

        Borrowed from the Hydroshare provider.

        ``api_url`` is the base URL of the CKAN action API (with a trailing
        slash). Reads ``self.dataset_id`` and returns the dataset's
        ``metadata_modified`` timestamp as a string of whole seconds since
        the epoch, interpreting the timestamp as UTC.
        """
        package_show_url = f"{api_url}package_show?id={self.dataset_id}"
        resp = self.urlopen(package_show_url).json()
        date = resp["result"]["metadata_modified"]
        try:
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
        except ValueError:
            # Accept timestamps that carry no fractional seconds as well.
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    def _request(self, url, **kwargs):
        """Issue a GET request for ``url`` through the shared session."""
        return self.session.get(url, **kwargs)

    urlopen = _request

    def detect(self, source, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a CKAN dataset.

        Returns ``None`` when ``source`` has no network location, when its
        path does not look like ``.../dataset/<id>``, or when the site's
        ``status_show`` API endpoint does not answer with HTTP 200.
        Otherwise returns a spec dict with the keys ``dataset_id``,
        ``activity_id``, ``api_url`` and ``version``.
        """
        parsed_url = urlparse(source)
        if not parsed_url.netloc:
            return None

        url_parts_1 = parsed_url.path.split("/history/")
        url_parts_2 = url_parts_1[0].split("/")
        # A path with fewer than two segments (e.g. an empty path) cannot be
        # ".../dataset/<id>"; check the length first to avoid an IndexError.
        if len(url_parts_2) < 2 or url_parts_2[-2] != "dataset":
            return None
        self.dataset_id = url_parts_2[-1]

        # The action API lives next to the "dataset" segment of the URL.
        api_url_path = "/api/3/action/"
        api_url = parsed_url._replace(
            path="/".join(url_parts_2[:-2]) + api_url_path, query=""
        ).geturl()

        status_show_url = f"{api_url}status_show"
        resp = self.urlopen(status_show_url)
        if resp.status_code != 200:
            return None

        # handle the activities: a "/history/<id>" path suffix takes
        # precedence over an "activity_id" query parameter
        activity_id = None
        query_activity_ids = parse_qs(parsed_url.query).get("activity_id")
        if query_activity_ids is not None:
            activity_id = query_activity_ids[0]
        if len(url_parts_1) == 2:
            activity_id = url_parts_1[-1]

        self.version = self._fetch_version(api_url)
        return {
            "dataset_id": self.dataset_id,
            "activity_id": activity_id,
            "api_url": api_url,
            "version": self.version,
        }

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch a CKAN dataset.

        ``spec`` is a dict with the keys ``dataset_id``, ``activity_id`` and
        ``api_url`` (the shape returned by ``detect``). Each resource of the
        dataset that has a non-empty URL is downloaded into ``output_dir``.
        ``yield_output`` is accepted but not used here.

        This is a generator that yields progress messages; nothing is
        downloaded until it is iterated. A failed resource download raises
        via ``raise_for_status``.
        """
        dataset_id = spec["dataset_id"]
        activity_id = spec["activity_id"]

        yield f"Fetching CKAN dataset {dataset_id}.\n"

        # handle the activities: a truthy activity id selects the dataset as
        # recorded by that activity instead of its current state
        if activity_id:
            fetch_url = (
                f"{spec['api_url']}activity_data_show?"
                f"id={activity_id}&object_type=package"
            )
        else:
            fetch_url = f"{spec['api_url']}package_show?id={dataset_id}"

        resp = self.urlopen(
            fetch_url,
            headers={"accept": "application/json"},
        )

        dataset = resp.json()

        yield "Fetching CKAN resources.\n"

        resources = dataset["result"]["resources"]

        for resource in resources:
            file_url = resource["url"]
            if file_url == "":
                continue
            # Name the file after the last URL segment, falling back to the
            # resource id when the URL ends with a slash.
            fname = file_url.rsplit("/", maxsplit=1)[-1]
            if fname == "":
                fname = resource["id"]

            yield f"Requesting {file_url}\n"
            resp = self._request(file_url, stream=True)
            resp.raise_for_status()

            dst_fname = path.join(output_dir, fname)
            with open(dst_fname, "wb") as dst:
                yield f"Fetching {fname}\n"
                for chunk in resp.iter_content(chunk_size=None):
                    dst.write(chunk)

    @property
    def content_id(self):
        """A unique ID to represent the version of the content."""
        return f"{self.dataset_id}.v{self.version}"
|
|
@ -0,0 +1,79 @@
|
|||
import os
|
||||
from contextlib import contextmanager
|
||||
from tempfile import NamedTemporaryFile, TemporaryDirectory
|
||||
|
||||
from repo2docker.contentproviders import CKAN
|
||||
|
||||
|
||||
def test_detect_ckan(requests_mock):
    """CKAN dataset URLs are detected, with and without an activity id."""
    requests_mock.get("http://demo.ckan.org/api/3/action/status_show", status_code=200)
    requests_mock.get(
        "http://demo.ckan.org/api/3/action/package_show?id=1234",
        json={"result": {"metadata_modified": "2024-02-27T14:15:54.573058"}},
    )

    # Spec expected for the plain dataset URL.
    plain_spec = dict(
        dataset_id="1234",
        activity_id=None,
        api_url="http://demo.ckan.org/api/3/action/",
        version="1709043354",
    )
    # Same spec, but pinned to a specific activity.
    activity_spec = {**plain_spec, "activity_id": "5678"}

    cases = [
        ("http://demo.ckan.org/dataset/1234", plain_spec),
        ("http://demo.ckan.org/dataset/1234?activity_id=5678", activity_spec),
        ("http://demo.ckan.org/dataset/1234/history/5678", activity_spec),
    ]
    # A fresh provider per case, so no state leaks between detections.
    for source, wanted in cases:
        assert CKAN().detect(source) == wanted
|
||||
|
||||
|
||||
def test_detect_not_ckan():
    """Sources that are not CKAN datasets must not trigger the provider."""
    non_ckan_sources = (
        "/some/path/here",
        "https://example.com/path/here",
        # NOTE(review): no request mock is installed in this test, so this
        # case appears to depend on a live response from the site - confirm.
        "https://data.gov.tw/dataset/6564",
    )
    for source in non_ckan_sources:
        assert CKAN().detect(source) is None
|
||||
|
||||
|
||||
@contextmanager
def ckan_file():
    """Yield the path of a temporary file containing ``b"some content"``.

    The file exists only inside the ``with`` block; it is removed when the
    context exits.
    """
    with NamedTemporaryFile() as file:
        file.write(b"some content")
        # The handle is buffered: flush so the bytes are actually in the file
        # before a caller opens it by name.
        file.flush()
        yield file.name
|
||||
|
||||
|
||||
def test_ckan_fetch(requests_mock):
    """Fetching a mocked dataset writes its single resource to the output dir.

    Exercised twice: once for the current dataset (no activity id) and once
    for a specific activity.
    """
    with ckan_file() as ckan_path:
        mock_response = {"result": {"resources": [{"url": f"file://{ckan_path}"}]}}
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response
        )
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/activity_data_show?id=5678",
            json=mock_response,
        )
        # Read through a context manager so the file handle is closed
        # instead of being left for the garbage collector.
        with open(ckan_path, "rb") as src:
            requests_mock.get(f"file://{ckan_path}", content=src.read())

        ckan = CKAN()
        spec = {"dataset_id": "1234", "api_url": "http://demo.ckan.org/api/3/action/"}

        # The downloaded file is named after the last segment of its URL.
        expected = {ckan_path.rsplit("/", maxsplit=1)[1]}

        for activity_id in (None, "5678"):
            spec["activity_id"] = activity_id
            with TemporaryDirectory() as d:
                # fetch() is a generator: it must be drained for the
                # download to take place.
                list(ckan.fetch(spec, d))
                assert expected == set(os.listdir(d))
|
Ładowanie…
Reference in New Issue