kopia lustrzana https://github.com/jupyterhub/repo2docker
Replace urllib by requests in contentproviders
requests is globally simpler to use, and more and more people are more familiar with this later than urllib.pull/993/head
rodzic
560b1d96a0
commit
830a9c89c0
|
@ -5,4 +5,4 @@ pytest>=4.6
|
|||
wheel
|
||||
pytest-cov
|
||||
pre-commit
|
||||
requests
|
||||
requests_mock
|
||||
|
|
|
@ -2,7 +2,6 @@ import os
|
|||
import json
|
||||
import shutil
|
||||
|
||||
from urllib.request import Request
|
||||
from urllib.parse import urlparse, urlunparse, parse_qs
|
||||
|
||||
from .doi import DoiProvider
|
||||
|
@ -56,7 +55,6 @@ class Dataverse(DoiProvider):
|
|||
return
|
||||
|
||||
query_args = parse_qs(parsed_url.query)
|
||||
|
||||
# Corner case handling
|
||||
if parsed_url.path.startswith("/file.xhtml"):
|
||||
# There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
|
||||
|
@ -75,8 +73,7 @@ class Dataverse(DoiProvider):
|
|||
parsed_url._replace(path="/api/search", query=search_query)
|
||||
)
|
||||
self.log.debug("Querying Dataverse: " + search_url)
|
||||
resp = self.urlopen(search_url).read()
|
||||
data = json.loads(resp.decode("utf-8"))["data"]
|
||||
data = self.urlopen(search_url).json()
|
||||
if data["count_in_response"] != 1:
|
||||
self.log.debug(
|
||||
"Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
|
||||
|
@ -101,14 +98,12 @@ class Dataverse(DoiProvider):
|
|||
host = spec["host"]
|
||||
|
||||
yield "Fetching Dataverse record {}.\n".format(record_id)
|
||||
req = Request(
|
||||
"{}/api/datasets/:persistentId?persistentId={}".format(
|
||||
host["url"], record_id
|
||||
),
|
||||
headers={"accept": "application/json"},
|
||||
url = "{}/api/datasets/:persistentId?persistentId={}".format(
|
||||
host["url"], record_id
|
||||
)
|
||||
resp = self.urlopen(req)
|
||||
record = json.loads(resp.read().decode("utf-8"))["data"]
|
||||
|
||||
resp = self.urlopen(url, headers={"accept": "application/json"})
|
||||
record = resp.json()
|
||||
|
||||
for fobj in deep_get(record, "latestVersion.files"):
|
||||
file_url = "{}/api/access/datafile/{}".format(
|
||||
|
|
|
@ -5,8 +5,8 @@ import logging
|
|||
|
||||
from os import makedirs
|
||||
from os import path
|
||||
from urllib import request # urlopen, Request
|
||||
from urllib.error import HTTPError
|
||||
from requests import Session, HTTPError
|
||||
|
||||
from zipfile import ZipFile, is_zipfile
|
||||
|
||||
from .base import ContentProvider
|
||||
|
@ -18,7 +18,21 @@ from .. import __version__
|
|||
class DoiProvider(ContentProvider):
|
||||
"""Provide contents of a repository identified by a DOI and some helper functions."""
|
||||
|
||||
def urlopen(self, req, headers=None):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.session = Session()
|
||||
self.session.headers.update(
|
||||
{
|
||||
"user-agent": "repo2docker {}".format(__version__),
|
||||
}
|
||||
)
|
||||
|
||||
def _request(self, url, **kwargs):
|
||||
return self.session.get(url, **kwargs)
|
||||
|
||||
urlopen = _request
|
||||
|
||||
def _urlopen(self, req, headers=None):
|
||||
"""A urlopen() helper"""
|
||||
# someone passed a string, not a request
|
||||
if not isinstance(req, request.Request):
|
||||
|
@ -38,7 +52,8 @@ class DoiProvider(ContentProvider):
|
|||
doi = normalize_doi(doi)
|
||||
|
||||
try:
|
||||
resp = self.urlopen("https://doi.org/{}".format(doi))
|
||||
resp = self._request("https://doi.org/{}".format(doi))
|
||||
resp.raise_for_status()
|
||||
# If the DOI doesn't resolve, just return URL
|
||||
except HTTPError:
|
||||
return doi
|
||||
|
@ -53,38 +68,42 @@ class DoiProvider(ContentProvider):
|
|||
file_url = deep_get(file_ref, host["download"])
|
||||
fname = deep_get(file_ref, host["filename"])
|
||||
logging.debug("Downloading file {} as {}\n".format(file_url, fname))
|
||||
with self.urlopen(file_url) as src:
|
||||
|
||||
yield "Requesting {}\n".format(file_url)
|
||||
resp = self._request(file_url, stream=True)
|
||||
resp.raise_for_status()
|
||||
|
||||
if path.dirname(fname):
|
||||
sub_dir = path.join(output_dir, path.dirname(fname))
|
||||
if not path.exists(sub_dir):
|
||||
yield "Creating {}\n".format(sub_dir)
|
||||
makedirs(sub_dir, exist_ok=True)
|
||||
|
||||
dst_fname = path.join(output_dir, fname)
|
||||
with open(dst_fname, "wb") as dst:
|
||||
yield "Fetching {}\n".format(fname)
|
||||
for chunk in resp.iter_content(chunk_size=None):
|
||||
dst.write(chunk)
|
||||
|
||||
if unzip and is_zipfile(dst_fname):
|
||||
yield "Extracting {}\n".format(fname)
|
||||
zfile = ZipFile(dst_fname)
|
||||
zfile.extractall(path=output_dir)
|
||||
zfile.close()
|
||||
|
||||
# delete downloaded file ...
|
||||
os.remove(dst_fname)
|
||||
# ... and any directories we might have created,
|
||||
# in which case sub_dir will be defined
|
||||
if path.dirname(fname):
|
||||
sub_dir = path.join(output_dir, path.dirname(fname))
|
||||
if not path.exists(sub_dir):
|
||||
yield "Creating {}\n".format(sub_dir)
|
||||
makedirs(sub_dir, exist_ok=True)
|
||||
shutil.rmtree(sub_dir)
|
||||
|
||||
dst_fname = path.join(output_dir, fname)
|
||||
with open(dst_fname, "wb") as dst:
|
||||
yield "Fetching {}\n".format(fname)
|
||||
shutil.copyfileobj(src, dst)
|
||||
# first close the newly written file, then continue
|
||||
# processing it
|
||||
if unzip and is_zipfile(dst_fname):
|
||||
yield "Extracting {}\n".format(fname)
|
||||
zfile = ZipFile(dst_fname)
|
||||
zfile.extractall(path=output_dir)
|
||||
zfile.close()
|
||||
new_subdirs = os.listdir(output_dir)
|
||||
# if there is only one new subdirectory move its contents
|
||||
# to the top level directory
|
||||
if len(new_subdirs) == 1:
|
||||
d = new_subdirs[0]
|
||||
copytree(path.join(output_dir, d), output_dir)
|
||||
shutil.rmtree(path.join(output_dir, d))
|
||||
|
||||
# delete downloaded file ...
|
||||
os.remove(dst_fname)
|
||||
# ... and any directories we might have created,
|
||||
# in which case sub_dir will be defined
|
||||
if path.dirname(fname):
|
||||
shutil.rmtree(sub_dir)
|
||||
|
||||
new_subdirs = os.listdir(output_dir)
|
||||
# if there is only one new subdirectory move its contents
|
||||
# to the top level directory
|
||||
if len(new_subdirs) == 1:
|
||||
d = new_subdirs[0]
|
||||
copytree(path.join(output_dir, d), output_dir)
|
||||
shutil.rmtree(path.join(output_dir, d))
|
||||
|
||||
yield "Fetched files: {}\n".format(os.listdir(output_dir))
|
||||
yield "Fetched files: {}\n".format(os.listdir(output_dir))
|
||||
|
|
|
@ -25,6 +25,7 @@ class Figshare(DoiProvider):
|
|||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.hosts = [
|
||||
{
|
||||
"hostname": [
|
||||
|
@ -74,13 +75,12 @@ class Figshare(DoiProvider):
|
|||
yield "Fetching Figshare article {} in version {}.\n".format(
|
||||
article_id, article_version
|
||||
)
|
||||
req = Request(
|
||||
resp = self.urlopen(
|
||||
"{}{}/versions/{}".format(host["api"], article_id, article_version),
|
||||
headers={"accept": "application/json"},
|
||||
)
|
||||
resp = self.urlopen(req)
|
||||
|
||||
article = json.loads(resp.read().decode("utf-8"))
|
||||
article = resp.json()
|
||||
|
||||
files = deep_get(article, host["filepath"])
|
||||
# only fetch files where is_link_only: False
|
||||
|
|
|
@ -16,9 +16,7 @@ class Hydroshare(DoiProvider):
|
|||
|
||||
def _fetch_version(self, host):
|
||||
"""Fetch resource modified date and convert to epoch"""
|
||||
json_response = json.loads(
|
||||
self.urlopen(host["version"].format(self.resource_id)).read()
|
||||
)
|
||||
json_response = self.urlopen(host["version"].format(self.resource_id)).json()
|
||||
date = next(
|
||||
item for item in json_response["dates"] if item["type"] == "modified"
|
||||
)["start_date"]
|
||||
|
|
|
@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
|
|||
"""Provide contents of a Zenodo deposit."""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# We need the hostname (url where records are), api url (for metadata),
|
||||
# filepath (path to files in metadata), filename (path to filename in
|
||||
# metadata), download (path to file download URL), and type (path to item type in metadata)
|
||||
|
@ -55,13 +56,12 @@ class Zenodo(DoiProvider):
|
|||
host = spec["host"]
|
||||
|
||||
yield "Fetching Zenodo record {}.\n".format(record_id)
|
||||
req = Request(
|
||||
resp = self.urlopen(
|
||||
"{}{}".format(host["api"], record_id),
|
||||
headers={"accept": "application/json"},
|
||||
)
|
||||
resp = self.urlopen(req)
|
||||
|
||||
record = json.loads(resp.read().decode("utf-8"))
|
||||
record = resp.json()
|
||||
|
||||
is_software = deep_get(record, host["type"]).lower() == "software"
|
||||
files = deep_get(record, host["filepath"])
|
||||
|
|
1
setup.py
1
setup.py
|
@ -52,6 +52,7 @@ setup(
|
|||
"python-json-logger",
|
||||
"escapism",
|
||||
"jinja2",
|
||||
"requests",
|
||||
"ruamel.yaml>=0.15",
|
||||
"toml",
|
||||
"semver",
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import json
|
||||
import os
|
||||
import pytest
|
||||
import re
|
||||
|
||||
from io import BytesIO
|
||||
from tempfile import TemporaryDirectory
|
||||
|
@ -19,7 +20,7 @@ test_hosts = [
|
|||
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
|
||||
"10.7910/DVN/6ZXAGT",
|
||||
"https://dataverse.harvard.edu/api/access/datafile/3323458",
|
||||
"hdl:11529/10016",
|
||||
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
|
||||
],
|
||||
[
|
||||
{"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
|
||||
|
@ -27,57 +28,68 @@ test_hosts = [
|
|||
],
|
||||
)
|
||||
]
|
||||
test_responses = {
|
||||
"doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
|
||||
doi_responses = {
|
||||
"https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
|
||||
"https://dataverse.harvard.edu/file.xhtml"
|
||||
"?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
|
||||
),
|
||||
"doi:10.7910/DVN/6ZXAGT": (
|
||||
"https://doi.org/10.7910/DVN/6ZXAGT": (
|
||||
"https://dataverse.harvard.edu/dataset.xhtml"
|
||||
"?persistentId=doi:10.7910/DVN/6ZXAGT"
|
||||
),
|
||||
"10.7910/DVN/6ZXAGT": (
|
||||
"https://dataverse.harvard.edu/dataset.xhtml"
|
||||
"?persistentId=doi:10.7910/DVN/6ZXAGT"
|
||||
"https://dataverse.harvard.edu/api/access/datafile/3323458": (
|
||||
"https://dataverse.harvard.edu/api/access/datafile/3323458"
|
||||
),
|
||||
"https://doi.org/10.21105/joss.01277": (
|
||||
"https://joss.theoj.org/papers/10.21105/joss.01277"
|
||||
),
|
||||
"https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
|
||||
"hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
|
||||
}
|
||||
test_search = {
|
||||
"data": {
|
||||
"count_in_response": 1,
|
||||
"items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_input, expected", test_hosts)
|
||||
def test_detect_dataverse(test_input, expected):
|
||||
def doi_resolver(url):
|
||||
return test_responses.get(url)
|
||||
def test_detect_dataverse(test_input, expected, requests_mock):
|
||||
def doi_resolver(req, context):
|
||||
resp = doi_responses.get(req.url)
|
||||
# doi responses are redirects
|
||||
if resp is not None:
|
||||
context.status_code = 302
|
||||
context.headers["Location"] = resp
|
||||
return resp
|
||||
|
||||
with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
|
||||
Dataverse, "doi2url", side_effect=doi_resolver
|
||||
) as fake_doi2url:
|
||||
fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
|
||||
# valid Dataverse DOIs trigger this content provider
|
||||
assert Dataverse().detect(test_input[0]) == expected[0]
|
||||
assert fake_doi2url.call_count == 2 # File, then dataset
|
||||
assert Dataverse().detect(test_input[1]) == expected[0]
|
||||
assert Dataverse().detect(test_input[2]) == expected[0]
|
||||
# only two of the three calls above have to resolve a DOI
|
||||
assert fake_urlopen.call_count == 1
|
||||
assert Dataverse().detect(test_input[3]) == expected[1]
|
||||
requests_mock.get(re.compile("https://"), json=doi_resolver)
|
||||
requests_mock.get(
|
||||
"https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
|
||||
json={
|
||||
"count_in_response": 1,
|
||||
"items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
|
||||
},
|
||||
)
|
||||
|
||||
with patch.object(Dataverse, "urlopen") as fake_urlopen:
|
||||
# Don't trigger the Dataverse content provider
|
||||
assert Dataverse().detect("/some/path/here") is None
|
||||
assert Dataverse().detect("https://example.com/path/here") is None
|
||||
# don't handle DOIs that aren't from Dataverse
|
||||
fake_urlopen.return_value.url = (
|
||||
"http://joss.theoj.org/papers/10.21105/joss.01277"
|
||||
)
|
||||
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
|
||||
assert requests_mock.call_count == 0
|
||||
# valid Dataverse DOIs trigger this content provider
|
||||
assert Dataverse().detect(test_input[0]) == expected[0]
|
||||
# 4: doi resolution (302), File, doi resolution (302), then dataset
|
||||
assert requests_mock.call_count == 4
|
||||
requests_mock.reset_mock()
|
||||
|
||||
assert Dataverse().detect(test_input[1]) == expected[0]
|
||||
# 2: doi (302), dataset
|
||||
assert requests_mock.call_count == 2
|
||||
requests_mock.reset_mock()
|
||||
|
||||
assert Dataverse().detect(test_input[2]) == expected[0]
|
||||
# 1: datafile (search dataverse for the file id)
|
||||
assert requests_mock.call_count == 1
|
||||
requests_mock.reset_mock()
|
||||
|
||||
assert Dataverse().detect(test_input[3]) == expected[1]
|
||||
requests_mock.reset_mock()
|
||||
|
||||
# Don't trigger the Dataverse content provider
|
||||
assert Dataverse().detect("/some/path/here") is None
|
||||
assert Dataverse().detect("https://example.com/path/here") is None
|
||||
# don't handle DOIs that aren't from Dataverse
|
||||
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -95,50 +107,50 @@ def dv_files(tmpdir):
|
|||
return [f1, f2, f3]
|
||||
|
||||
|
||||
def test_dataverse_fetch(dv_files):
|
||||
mock_response_ds_query = BytesIO(
|
||||
json.dumps(
|
||||
{
|
||||
"data": {
|
||||
"latestVersion": {
|
||||
"files": [
|
||||
{"dataFile": {"id": 1}, "label": "some-file.txt"},
|
||||
{
|
||||
"dataFile": {"id": 2},
|
||||
"label": "some-other-file.txt",
|
||||
"directoryLabel": "directory",
|
||||
},
|
||||
{
|
||||
"dataFile": {"id": 3},
|
||||
"label": "the-other-file.txt",
|
||||
"directoryLabel": "directory/subdirectory",
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
).encode("utf-8")
|
||||
)
|
||||
def test_dataverse_fetch(dv_files, requests_mock):
|
||||
mock_response = {
|
||||
"latestVersion": {
|
||||
"files": [
|
||||
{"dataFile": {"id": 1}, "label": "some-file.txt"},
|
||||
{
|
||||
"dataFile": {"id": 2},
|
||||
"label": "some-other-file.txt",
|
||||
"directoryLabel": "directory",
|
||||
},
|
||||
{
|
||||
"dataFile": {"id": 3},
|
||||
"label": "the-other-file.txt",
|
||||
"directoryLabel": "directory/subdirectory",
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
|
||||
|
||||
def mock_filecontent(req, context):
|
||||
file_no = int(req.url.split("/")[-1]) - 1
|
||||
return open(dv_files[file_no], "rb").read()
|
||||
|
||||
requests_mock.get(
|
||||
"https://dataverse.harvard.edu/api/datasets/"
|
||||
":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
|
||||
json=mock_response,
|
||||
)
|
||||
requests_mock.get(
|
||||
re.compile("https://dataverse.harvard.edu/api/access/datafile"),
|
||||
content=mock_filecontent,
|
||||
)
|
||||
|
||||
dv = Dataverse()
|
||||
|
||||
def mock_urlopen(self, req):
|
||||
if isinstance(req, Request):
|
||||
return mock_response_ds_query
|
||||
else:
|
||||
file_no = int(req.split("/")[-1]) - 1
|
||||
return urlopen("file://{}".format(dv_files[file_no]))
|
||||
|
||||
with patch.object(Dataverse, "urlopen", new=mock_urlopen):
|
||||
with TemporaryDirectory() as d:
|
||||
output = []
|
||||
for l in dv.fetch(spec, d):
|
||||
output.append(l)
|
||||
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = set(["directory", "some-file.txt"])
|
||||
assert expected == unpacked_files
|
||||
assert os.path.isfile(
|
||||
os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
|
||||
)
|
||||
with TemporaryDirectory() as d:
|
||||
output = []
|
||||
for l in dv.fetch(spec, d):
|
||||
output.append(l)
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = set(["directory", "some-file.txt"])
|
||||
assert expected == unpacked_files
|
||||
assert os.path.isfile(
|
||||
os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
|
||||
)
|
||||
|
|
|
@ -11,6 +11,7 @@ from zipfile import ZipFile
|
|||
|
||||
from repo2docker.contentproviders.doi import DoiProvider
|
||||
from repo2docker.contentproviders.base import ContentProviderException
|
||||
from repo2docker import __version__
|
||||
|
||||
|
||||
def test_content_id():
|
||||
|
@ -18,20 +19,15 @@ def test_content_id():
|
|||
assert doi.content_id is None
|
||||
|
||||
|
||||
def fake_urlopen(req):
|
||||
print(req)
|
||||
return req.headers
|
||||
|
||||
|
||||
@patch("urllib.request.urlopen", fake_urlopen)
|
||||
def test_url_headers():
|
||||
def test_url_headers(requests_mock):
|
||||
requests_mock.get("https://mybinder.org", text="resp")
|
||||
doi = DoiProvider()
|
||||
|
||||
headers = {"test1": "value1", "Test2": "value2"}
|
||||
result = doi.urlopen("https://mybinder.org", headers=headers)
|
||||
assert "Test1" in result
|
||||
assert "Test2" in result
|
||||
assert len(result) is 3 # User-agent is also set
|
||||
assert "test1" in result.request.headers
|
||||
assert "Test2" in result.request.headers
|
||||
assert result.request.headers["User-Agent"] == "repo2docker {}".format(__version__)
|
||||
|
||||
|
||||
def test_unresolving_doi():
|
||||
|
|
|
@ -22,12 +22,17 @@ test_content_ids = [
|
|||
|
||||
|
||||
@pytest.mark.parametrize("link,expected", test_content_ids)
|
||||
def test_content_id(link, expected):
|
||||
with patch.object(Figshare, "urlopen") as fake_urlopen:
|
||||
fake_urlopen.return_value.url = link
|
||||
fig = Figshare()
|
||||
fig.detect("10.6084/m9.figshare.9782777")
|
||||
assert fig.content_id == expected
|
||||
def test_content_id(link, expected, requests_mock):
|
||||
def mocked_get(req, context):
|
||||
if req.url.startswith("https://doi.org"):
|
||||
context.status_code = 302
|
||||
context.headers["Location"] = link
|
||||
return link
|
||||
|
||||
requests_mock.get(re.compile("https://"), text=mocked_get)
|
||||
fig = Figshare()
|
||||
fig.detect("10.6084/m9.figshare.9782777")
|
||||
assert fig.content_id == expected
|
||||
|
||||
|
||||
test_fig = Figshare()
|
||||
|
@ -103,76 +108,73 @@ def figshare_archive(prefix="a_directory"):
|
|||
yield zfile.name
|
||||
|
||||
|
||||
def test_fetch_zip():
|
||||
def test_fetch_zip(requests_mock):
|
||||
# see test_zenodo.py/test_fetch_software
|
||||
with figshare_archive() as fig_path:
|
||||
mock_response = BytesIO(
|
||||
json.dumps(
|
||||
mock_response = {
|
||||
"files": [
|
||||
{
|
||||
"files": [
|
||||
{
|
||||
"name": "afake.zip",
|
||||
"is_link_only": False,
|
||||
"download_url": "file://{}".format(fig_path),
|
||||
}
|
||||
]
|
||||
"name": "afake.zip",
|
||||
"is_link_only": False,
|
||||
"download_url": "file://{}".format(fig_path),
|
||||
}
|
||||
).encode("utf-8")
|
||||
]
|
||||
}
|
||||
requests_mock.get(
|
||||
"https://api.figshare.com/v2/articles/123456/versions/42",
|
||||
json=mock_response,
|
||||
)
|
||||
requests_mock.get(
|
||||
"file://{}".format(fig_path), content=open(fig_path, "rb").read()
|
||||
)
|
||||
|
||||
def mock_urlopen(self, req):
|
||||
if isinstance(req, Request):
|
||||
return mock_response
|
||||
else:
|
||||
return urlopen(req)
|
||||
# with patch.object(Figshare, "urlopen", new=mock_urlopen):
|
||||
with TemporaryDirectory() as d:
|
||||
output = []
|
||||
for l in test_fig.fetch(test_spec, d):
|
||||
output.append(l)
|
||||
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = set(["some-other-file.txt", "some-file.txt"])
|
||||
assert expected == unpacked_files
|
||||
|
||||
|
||||
def test_fetch_data(requests_mock):
|
||||
with figshare_archive() as a_path:
|
||||
with figshare_archive() as b_path:
|
||||
mock_response = {
|
||||
"files": [
|
||||
{
|
||||
"name": "afake.file",
|
||||
"download_url": "file://{}".format(a_path),
|
||||
"is_link_only": False,
|
||||
},
|
||||
{
|
||||
"name": "bfake.data",
|
||||
"download_url": "file://{}".format(b_path),
|
||||
"is_link_only": False,
|
||||
},
|
||||
{"name": "cfake.link", "is_link_only": True},
|
||||
]
|
||||
}
|
||||
|
||||
requests_mock.get(
|
||||
"https://api.figshare.com/v2/articles/123456/versions/42",
|
||||
json=mock_response,
|
||||
)
|
||||
requests_mock.get(
|
||||
"file://{}".format(a_path), content=open(a_path, "rb").read()
|
||||
)
|
||||
requests_mock.get(
|
||||
"file://{}".format(b_path), content=open(b_path, "rb").read()
|
||||
)
|
||||
|
||||
with patch.object(Figshare, "urlopen", new=mock_urlopen):
|
||||
with TemporaryDirectory() as d:
|
||||
output = []
|
||||
for l in test_fig.fetch(test_spec, d):
|
||||
output.append(l)
|
||||
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = set(["some-other-file.txt", "some-file.txt"])
|
||||
# ZIP files shouldn't have been unpacked
|
||||
expected = {"bfake.data", "afake.file"}
|
||||
assert expected == unpacked_files
|
||||
|
||||
|
||||
def test_fetch_data():
|
||||
with figshare_archive() as a_path:
|
||||
with figshare_archive() as b_path:
|
||||
mock_response = BytesIO(
|
||||
json.dumps(
|
||||
{
|
||||
"files": [
|
||||
{
|
||||
"name": "afake.file",
|
||||
"download_url": "file://{}".format(a_path),
|
||||
"is_link_only": False,
|
||||
},
|
||||
{
|
||||
"name": "bfake.data",
|
||||
"download_url": "file://{}".format(b_path),
|
||||
"is_link_only": False,
|
||||
},
|
||||
{"name": "cfake.link", "is_link_only": True},
|
||||
]
|
||||
}
|
||||
).encode("utf-8")
|
||||
)
|
||||
|
||||
def mock_urlopen(self, req):
|
||||
if isinstance(req, Request):
|
||||
return mock_response
|
||||
else:
|
||||
return urlopen(req)
|
||||
|
||||
with patch.object(Figshare, "urlopen", new=mock_urlopen):
|
||||
with TemporaryDirectory() as d:
|
||||
output = []
|
||||
for l in test_fig.fetch(test_spec, d):
|
||||
output.append(l)
|
||||
|
||||
unpacked_files = set(os.listdir(d))
|
||||
# ZIP files shouldn't have been unpacked
|
||||
expected = {"bfake.data", "afake.file"}
|
||||
assert expected == unpacked_files
|
||||
|
|
|
@ -5,87 +5,98 @@ from contextlib import contextmanager
|
|||
from tempfile import TemporaryDirectory, NamedTemporaryFile
|
||||
from unittest.mock import patch
|
||||
from zipfile import ZipFile
|
||||
import re
|
||||
|
||||
from repo2docker.contentproviders import Hydroshare
|
||||
from repo2docker.contentproviders.base import ContentProviderException
|
||||
|
||||
|
||||
def test_content_id():
|
||||
with patch.object(Hydroshare, "urlopen") as fake_urlopen:
|
||||
fake_urlopen.return_value.url = (
|
||||
doi_responses = {
|
||||
"https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": (
|
||||
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
|
||||
),
|
||||
"https://doi.org/10.21105/joss.01277": (
|
||||
"https://joss.theoj.org/papers/10.21105/joss.01277"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def doi_resolver(req, context):
|
||||
resp = doi_responses.get(req.url)
|
||||
# doi responses are redirects
|
||||
if resp is not None:
|
||||
context.status_code = 302
|
||||
context.headers["Location"] = resp
|
||||
return resp
|
||||
|
||||
|
||||
hydroshare_data = {
|
||||
"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]
|
||||
}
|
||||
|
||||
|
||||
def test_content_id(requests_mock):
|
||||
|
||||
requests_mock.get(re.compile("https://"), json=hydroshare_data)
|
||||
requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
|
||||
|
||||
hydro = Hydroshare()
|
||||
|
||||
hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
|
||||
assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
|
||||
|
||||
|
||||
def test_detect_hydroshare(requests_mock):
|
||||
requests_mock.get(re.compile("https://"), json=hydroshare_data)
|
||||
requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
|
||||
|
||||
# valid Hydroshare DOIs trigger this content provider
|
||||
expected = {
|
||||
"host": {
|
||||
"hostname": [
|
||||
"https://www.hydroshare.org/resource/",
|
||||
"http://www.hydroshare.org/resource/",
|
||||
],
|
||||
"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
|
||||
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
|
||||
},
|
||||
"resource": "b8f6eae9d89241cf8b5904033460af61",
|
||||
"version": "1569427757",
|
||||
}
|
||||
|
||||
assert (
|
||||
Hydroshare().detect(
|
||||
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
|
||||
)
|
||||
== expected
|
||||
)
|
||||
# assert a call to urlopen was called to fetch version
|
||||
assert requests_mock.call_count == 1
|
||||
requests_mock.reset_mock()
|
||||
|
||||
def read():
|
||||
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
|
||||
assert (
|
||||
Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
|
||||
)
|
||||
# assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
|
||||
assert requests_mock.call_count == 3
|
||||
requests_mock.reset_mock()
|
||||
|
||||
fake_urlopen.return_value.read = read
|
||||
hydro = Hydroshare()
|
||||
|
||||
hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
|
||||
assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
|
||||
|
||||
|
||||
def test_detect_hydroshare():
|
||||
with patch.object(Hydroshare, "urlopen") as fake_urlopen:
|
||||
fake_urlopen.return_value.url = (
|
||||
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
|
||||
assert (
|
||||
Hydroshare().detect(
|
||||
"https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
|
||||
)
|
||||
== expected
|
||||
)
|
||||
# assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
|
||||
assert requests_mock.call_count == 3
|
||||
requests_mock.reset_mock()
|
||||
|
||||
def read():
|
||||
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
|
||||
# Don't trigger the Hydroshare content provider
|
||||
assert Hydroshare().detect("/some/path/here") is None
|
||||
assert Hydroshare().detect("https://example.com/path/here") is None
|
||||
|
||||
fake_urlopen.return_value.read = read
|
||||
# valid Hydroshare DOIs trigger this content provider
|
||||
expected = {
|
||||
"host": {
|
||||
"hostname": [
|
||||
"https://www.hydroshare.org/resource/",
|
||||
"http://www.hydroshare.org/resource/",
|
||||
],
|
||||
"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
|
||||
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
|
||||
},
|
||||
"resource": "b8f6eae9d89241cf8b5904033460af61",
|
||||
"version": "1569427757",
|
||||
}
|
||||
assert (
|
||||
Hydroshare().detect(
|
||||
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
|
||||
)
|
||||
== expected
|
||||
)
|
||||
# assert a call to urlopen was called to fetch version
|
||||
assert fake_urlopen.call_count == 1
|
||||
assert (
|
||||
Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
|
||||
== expected
|
||||
)
|
||||
# assert 2 more calls were made, one to resolve the DOI and another to fetch the version
|
||||
assert fake_urlopen.call_count == 3
|
||||
assert (
|
||||
Hydroshare().detect(
|
||||
"https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
|
||||
)
|
||||
== expected
|
||||
)
|
||||
# assert 2 more calls were made, one to resolve the DOI and another to fetch the version
|
||||
assert fake_urlopen.call_count == 5
|
||||
|
||||
with patch.object(Hydroshare, "urlopen") as fake_urlopen:
|
||||
# Don't trigger the Hydroshare content provider
|
||||
assert Hydroshare().detect("/some/path/here") is None
|
||||
assert Hydroshare().detect("https://example.com/path/here") is None
|
||||
# don't handle DOIs that aren't from Hydroshare
|
||||
fake_urlopen.return_value.url = (
|
||||
"http://joss.theoj.org/papers/10.21105/joss.01277"
|
||||
)
|
||||
|
||||
def read():
|
||||
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
|
||||
|
||||
fake_urlopen.return_value.read = read
|
||||
assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
|
||||
# don't handle DOIs that aren't from Hydroshare
|
||||
assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import json
|
||||
import os
|
||||
import pytest
|
||||
import re
|
||||
|
||||
from contextlib import contextmanager
|
||||
from io import BytesIO
|
||||
|
@ -11,14 +12,30 @@ from zipfile import ZipFile
|
|||
|
||||
from repo2docker.contentproviders import Zenodo
|
||||
|
||||
doi_responses = {
|
||||
"https://doi.org/10.5281/zenodo.3232985": ("https://zenodo.org/record/3232985"),
|
||||
"https://doi.org/10.22002/d1.1235": ("https://data.caltech.edu/records/1235"),
|
||||
"https://doi.org/10.21105/joss.01277": (
|
||||
"https://joss.theoj.org/papers/10.21105/joss.01277"
|
||||
),
|
||||
}
|
||||
|
||||
def test_content_id():
|
||||
with patch.object(Zenodo, "urlopen") as fake_urlopen:
|
||||
fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
|
||||
zen = Zenodo()
|
||||
|
||||
zen.detect("10.5281/zenodo.3232985")
|
||||
assert zen.content_id == "3232985"
|
||||
def doi_resolver(req, context):
|
||||
resp = doi_responses.get(req.url)
|
||||
# doi responses are redirects
|
||||
if resp is not None:
|
||||
context.status_code = 302
|
||||
context.headers["Location"] = resp
|
||||
return resp
|
||||
|
||||
|
||||
def test_content_id(requests_mock):
|
||||
requests_mock.get(re.compile("https://"), json=doi_resolver)
|
||||
|
||||
zen = Zenodo()
|
||||
zen.detect("10.5281/zenodo.3232985")
|
||||
assert zen.content_id == "3232985"
|
||||
|
||||
|
||||
test_zen = Zenodo()
|
||||
|
@ -43,25 +60,22 @@ test_hosts = [
|
|||
|
||||
|
||||
@pytest.mark.parametrize("test_input,expected", test_hosts)
|
||||
def test_detect_zenodo(test_input, expected):
|
||||
with patch.object(Zenodo, "urlopen") as fake_urlopen:
|
||||
fake_urlopen.return_value.url = test_input[0]
|
||||
# valid Zenodo DOIs trigger this content provider
|
||||
assert Zenodo().detect(test_input[0]) == expected
|
||||
assert Zenodo().detect(test_input[1]) == expected
|
||||
assert Zenodo().detect(test_input[2]) == expected
|
||||
# only two of the three calls above have to resolve a DOI
|
||||
assert fake_urlopen.call_count == 2
|
||||
def test_detect_zenodo(test_input, expected, requests_mock):
|
||||
requests_mock.get(re.compile("https://"), json=doi_resolver)
|
||||
# valid Zenodo DOIs trigger this content provider
|
||||
assert Zenodo().detect(test_input[0]) == expected
|
||||
assert Zenodo().detect(test_input[1]) == expected
|
||||
assert Zenodo().detect(test_input[2]) == expected
|
||||
# only two of the three calls above have to resolve a DOI (2 req per doi resolution)
|
||||
assert requests_mock.call_count == 4
|
||||
requests_mock.reset_mock()
|
||||
|
||||
with patch.object(Zenodo, "urlopen") as fake_urlopen:
|
||||
# Don't trigger the Zenodo content provider
|
||||
assert Zenodo().detect("/some/path/here") is None
|
||||
assert Zenodo().detect("https://example.com/path/here") is None
|
||||
# don't handle DOIs that aren't from Zenodo
|
||||
fake_urlopen.return_value.url = (
|
||||
"http://joss.theoj.org/papers/10.21105/joss.01277"
|
||||
)
|
||||
assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
|
||||
# Don't trigger the Zenodo content provider
|
||||
assert Zenodo().detect("/some/path/here") is None
|
||||
assert Zenodo().detect("https://example.com/path/here") is None
|
||||
|
||||
# don't handle DOIs that aren't from Zenodo
|
||||
assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
@ -74,120 +88,102 @@ def zenodo_archive(prefix="a_directory"):
|
|||
yield zfile.name
|
||||
|
||||
|
||||
def test_fetch_software_from_github_archive():
|
||||
def test_fetch_software_from_github_archive(requests_mock):
|
||||
# we "fetch" a local ZIP file to simulate a Zenodo record created from a
|
||||
# GitHub repository via the Zenodo-GitHub integration
|
||||
with zenodo_archive() as zen_path:
|
||||
mock_response = BytesIO(
|
||||
json.dumps(
|
||||
mock_response = {
|
||||
"files": [
|
||||
{
|
||||
"files": [
|
||||
{
|
||||
"filename": "some_dir/afake.zip",
|
||||
"links": {"download": "file://{}".format(zen_path)},
|
||||
}
|
||||
],
|
||||
"metadata": {"upload_type": "software"},
|
||||
"filename": "some_dir/afake.zip",
|
||||
"links": {"download": "file://{}".format(zen_path)},
|
||||
}
|
||||
).encode("utf-8")
|
||||
],
|
||||
"metadata": {"upload_type": "software"},
|
||||
}
|
||||
requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
|
||||
requests_mock.get(
|
||||
"file://{}".format(zen_path), content=open(zen_path, "rb").read()
|
||||
)
|
||||
|
||||
def mock_urlopen(self, req):
|
||||
if isinstance(req, Request):
|
||||
return mock_response
|
||||
else:
|
||||
return urlopen(req)
|
||||
zen = Zenodo()
|
||||
spec = {"host": test_zen.hosts[0], "record": "1234"}
|
||||
|
||||
with patch.object(Zenodo, "urlopen", new=mock_urlopen):
|
||||
zen = Zenodo()
|
||||
spec = {"host": test_zen.hosts[0], "record": "1234"}
|
||||
with TemporaryDirectory() as d:
|
||||
output = []
|
||||
for l in zen.fetch(spec, d):
|
||||
output.append(l)
|
||||
|
||||
with TemporaryDirectory() as d:
|
||||
output = []
|
||||
for l in zen.fetch(spec, d):
|
||||
output.append(l)
|
||||
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = set(["some-other-file.txt", "some-file.txt"])
|
||||
assert expected == unpacked_files
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = set(["some-other-file.txt", "some-file.txt"])
|
||||
assert expected == unpacked_files
|
||||
|
||||
|
||||
def test_fetch_software():
|
||||
def test_fetch_software(requests_mock):
|
||||
# we "fetch" a local ZIP file to simulate a Zenodo software record with a
|
||||
# ZIP file in it
|
||||
with zenodo_archive() as zen_path:
|
||||
mock_response = BytesIO(
|
||||
json.dumps(
|
||||
mock_response = {
|
||||
"files": [
|
||||
{
|
||||
"files": [
|
||||
{
|
||||
# this is the difference to the GitHub generated one,
|
||||
# the ZIP file isn't in a directory
|
||||
"filename": "afake.zip",
|
||||
"links": {"download": "file://{}".format(zen_path)},
|
||||
}
|
||||
],
|
||||
"metadata": {"upload_type": "software"},
|
||||
# this is the difference to the GitHub generated one,
|
||||
# the ZIP file isn't in a directory
|
||||
"filename": "afake.zip",
|
||||
"links": {"download": "file://{}".format(zen_path)},
|
||||
}
|
||||
).encode("utf-8")
|
||||
],
|
||||
"metadata": {"upload_type": "software"},
|
||||
}
|
||||
requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
|
||||
requests_mock.get(
|
||||
"file://{}".format(zen_path), content=open(zen_path, "rb").read()
|
||||
)
|
||||
|
||||
def mock_urlopen(self, req):
|
||||
if isinstance(req, Request):
|
||||
return mock_response
|
||||
else:
|
||||
return urlopen(req)
|
||||
with TemporaryDirectory() as d:
|
||||
zen = Zenodo()
|
||||
spec = spec = {"host": test_zen.hosts[0], "record": "1234"}
|
||||
output = []
|
||||
for l in zen.fetch(spec, d):
|
||||
output.append(l)
|
||||
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = set(["some-other-file.txt", "some-file.txt"])
|
||||
assert expected == unpacked_files
|
||||
|
||||
|
||||
def test_fetch_data(requests_mock):
|
||||
# we "fetch" a local ZIP file to simulate a Zenodo data record
|
||||
with zenodo_archive() as a_zen_path:
|
||||
with zenodo_archive() as b_zen_path:
|
||||
mock_response = {
|
||||
"files": [
|
||||
{
|
||||
"filename": "afake.zip",
|
||||
"links": {"download": "file://{}".format(a_zen_path)},
|
||||
},
|
||||
{
|
||||
"filename": "bfake.zip",
|
||||
"links": {"download": "file://{}".format(b_zen_path)},
|
||||
},
|
||||
],
|
||||
"metadata": {"upload_type": "data"},
|
||||
}
|
||||
requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
|
||||
requests_mock.get(
|
||||
"file://{}".format(a_zen_path), content=open(a_zen_path, "rb").read()
|
||||
)
|
||||
requests_mock.get(
|
||||
"file://{}".format(b_zen_path), content=open(b_zen_path, "rb").read()
|
||||
)
|
||||
|
||||
with patch.object(Zenodo, "urlopen", new=mock_urlopen):
|
||||
with TemporaryDirectory() as d:
|
||||
zen = Zenodo()
|
||||
spec = spec = {"host": test_zen.hosts[0], "record": "1234"}
|
||||
spec = {"host": test_zen.hosts[0], "record": "1234"}
|
||||
output = []
|
||||
for l in zen.fetch(spec, d):
|
||||
output.append(l)
|
||||
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = set(["some-other-file.txt", "some-file.txt"])
|
||||
# ZIP files shouldn't have been unpacked
|
||||
expected = {"bfake.zip", "afake.zip"}
|
||||
assert expected == unpacked_files
|
||||
|
||||
|
||||
def test_fetch_data():
|
||||
# we "fetch" a local ZIP file to simulate a Zenodo data record
|
||||
with zenodo_archive() as a_zen_path:
|
||||
with zenodo_archive() as b_zen_path:
|
||||
mock_response = BytesIO(
|
||||
json.dumps(
|
||||
{
|
||||
"files": [
|
||||
{
|
||||
"filename": "afake.zip",
|
||||
"links": {"download": "file://{}".format(a_zen_path)},
|
||||
},
|
||||
{
|
||||
"filename": "bfake.zip",
|
||||
"links": {"download": "file://{}".format(b_zen_path)},
|
||||
},
|
||||
],
|
||||
"metadata": {"upload_type": "data"},
|
||||
}
|
||||
).encode("utf-8")
|
||||
)
|
||||
|
||||
def mock_urlopen(self, req):
|
||||
if isinstance(req, Request):
|
||||
return mock_response
|
||||
else:
|
||||
return urlopen(req)
|
||||
|
||||
with patch.object(Zenodo, "urlopen", new=mock_urlopen):
|
||||
with TemporaryDirectory() as d:
|
||||
zen = Zenodo()
|
||||
spec = {"host": test_zen.hosts[0], "record": "1234"}
|
||||
output = []
|
||||
for l in zen.fetch(spec, d):
|
||||
output.append(l)
|
||||
|
||||
unpacked_files = set(os.listdir(d))
|
||||
# ZIP files shouldn't have been unpacked
|
||||
expected = {"bfake.zip", "afake.zip"}
|
||||
assert expected == unpacked_files
|
||||
|
|
Ładowanie…
Reference in New Issue