Replace urllib by requests in contentproviders

requests is globally simpler to use, and more and more people
are more familiar with this later than urllib.
pull/993/head
David Douard 2020-12-10 18:55:14 +01:00
rodzic 560b1d96a0
commit 830a9c89c0
12 zmienionych plików z 427 dodań i 397 usunięć

Wyświetl plik

@ -5,4 +5,4 @@ pytest>=4.6
wheel wheel
pytest-cov pytest-cov
pre-commit pre-commit
requests requests_mock

Wyświetl plik

@ -2,7 +2,6 @@ import os
import json import json
import shutil import shutil
from urllib.request import Request
from urllib.parse import urlparse, urlunparse, parse_qs from urllib.parse import urlparse, urlunparse, parse_qs
from .doi import DoiProvider from .doi import DoiProvider
@ -56,7 +55,6 @@ class Dataverse(DoiProvider):
return return
query_args = parse_qs(parsed_url.query) query_args = parse_qs(parsed_url.query)
# Corner case handling # Corner case handling
if parsed_url.path.startswith("/file.xhtml"): if parsed_url.path.startswith("/file.xhtml"):
# There's no way of getting file information using its persistentId, the only thing we can do is assume that doi # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@ -75,8 +73,7 @@ class Dataverse(DoiProvider):
parsed_url._replace(path="/api/search", query=search_query) parsed_url._replace(path="/api/search", query=search_query)
) )
self.log.debug("Querying Dataverse: " + search_url) self.log.debug("Querying Dataverse: " + search_url)
resp = self.urlopen(search_url).read() data = self.urlopen(search_url).json()
data = json.loads(resp.decode("utf-8"))["data"]
if data["count_in_response"] != 1: if data["count_in_response"] != 1:
self.log.debug( self.log.debug(
"Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format( "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@ -101,14 +98,12 @@ class Dataverse(DoiProvider):
host = spec["host"] host = spec["host"]
yield "Fetching Dataverse record {}.\n".format(record_id) yield "Fetching Dataverse record {}.\n".format(record_id)
req = Request( url = "{}/api/datasets/:persistentId?persistentId={}".format(
"{}/api/datasets/:persistentId?persistentId={}".format( host["url"], record_id
host["url"], record_id
),
headers={"accept": "application/json"},
) )
resp = self.urlopen(req)
record = json.loads(resp.read().decode("utf-8"))["data"] resp = self.urlopen(url, headers={"accept": "application/json"})
record = resp.json()
for fobj in deep_get(record, "latestVersion.files"): for fobj in deep_get(record, "latestVersion.files"):
file_url = "{}/api/access/datafile/{}".format( file_url = "{}/api/access/datafile/{}".format(

Wyświetl plik

@ -5,8 +5,8 @@ import logging
from os import makedirs from os import makedirs
from os import path from os import path
from urllib import request # urlopen, Request from requests import Session, HTTPError
from urllib.error import HTTPError
from zipfile import ZipFile, is_zipfile from zipfile import ZipFile, is_zipfile
from .base import ContentProvider from .base import ContentProvider
@ -18,7 +18,21 @@ from .. import __version__
class DoiProvider(ContentProvider): class DoiProvider(ContentProvider):
"""Provide contents of a repository identified by a DOI and some helper functions.""" """Provide contents of a repository identified by a DOI and some helper functions."""
def urlopen(self, req, headers=None): def __init__(self):
super().__init__()
self.session = Session()
self.session.headers.update(
{
"user-agent": "repo2docker {}".format(__version__),
}
)
def _request(self, url, **kwargs):
return self.session.get(url, **kwargs)
urlopen = _request
def _urlopen(self, req, headers=None):
"""A urlopen() helper""" """A urlopen() helper"""
# someone passed a string, not a request # someone passed a string, not a request
if not isinstance(req, request.Request): if not isinstance(req, request.Request):
@ -38,7 +52,8 @@ class DoiProvider(ContentProvider):
doi = normalize_doi(doi) doi = normalize_doi(doi)
try: try:
resp = self.urlopen("https://doi.org/{}".format(doi)) resp = self._request("https://doi.org/{}".format(doi))
resp.raise_for_status()
# If the DOI doesn't resolve, just return URL # If the DOI doesn't resolve, just return URL
except HTTPError: except HTTPError:
return doi return doi
@ -53,38 +68,42 @@ class DoiProvider(ContentProvider):
file_url = deep_get(file_ref, host["download"]) file_url = deep_get(file_ref, host["download"])
fname = deep_get(file_ref, host["filename"]) fname = deep_get(file_ref, host["filename"])
logging.debug("Downloading file {} as {}\n".format(file_url, fname)) logging.debug("Downloading file {} as {}\n".format(file_url, fname))
with self.urlopen(file_url) as src:
yield "Requesting {}\n".format(file_url)
resp = self._request(file_url, stream=True)
resp.raise_for_status()
if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname))
if not path.exists(sub_dir):
yield "Creating {}\n".format(sub_dir)
makedirs(sub_dir, exist_ok=True)
dst_fname = path.join(output_dir, fname)
with open(dst_fname, "wb") as dst:
yield "Fetching {}\n".format(fname)
for chunk in resp.iter_content(chunk_size=None):
dst.write(chunk)
if unzip and is_zipfile(dst_fname):
yield "Extracting {}\n".format(fname)
zfile = ZipFile(dst_fname)
zfile.extractall(path=output_dir)
zfile.close()
# delete downloaded file ...
os.remove(dst_fname)
# ... and any directories we might have created,
# in which case sub_dir will be defined
if path.dirname(fname): if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname)) shutil.rmtree(sub_dir)
if not path.exists(sub_dir):
yield "Creating {}\n".format(sub_dir)
makedirs(sub_dir, exist_ok=True)
dst_fname = path.join(output_dir, fname) new_subdirs = os.listdir(output_dir)
with open(dst_fname, "wb") as dst: # if there is only one new subdirectory move its contents
yield "Fetching {}\n".format(fname) # to the top level directory
shutil.copyfileobj(src, dst) if len(new_subdirs) == 1:
# first close the newly written file, then continue d = new_subdirs[0]
# processing it copytree(path.join(output_dir, d), output_dir)
if unzip and is_zipfile(dst_fname): shutil.rmtree(path.join(output_dir, d))
yield "Extracting {}\n".format(fname)
zfile = ZipFile(dst_fname)
zfile.extractall(path=output_dir)
zfile.close()
# delete downloaded file ... yield "Fetched files: {}\n".format(os.listdir(output_dir))
os.remove(dst_fname)
# ... and any directories we might have created,
# in which case sub_dir will be defined
if path.dirname(fname):
shutil.rmtree(sub_dir)
new_subdirs = os.listdir(output_dir)
# if there is only one new subdirectory move its contents
# to the top level directory
if len(new_subdirs) == 1:
d = new_subdirs[0]
copytree(path.join(output_dir, d), output_dir)
shutil.rmtree(path.join(output_dir, d))
yield "Fetched files: {}\n".format(os.listdir(output_dir))

Wyświetl plik

@ -25,6 +25,7 @@ class Figshare(DoiProvider):
""" """
def __init__(self): def __init__(self):
super().__init__()
self.hosts = [ self.hosts = [
{ {
"hostname": [ "hostname": [
@ -74,13 +75,12 @@ class Figshare(DoiProvider):
yield "Fetching Figshare article {} in version {}.\n".format( yield "Fetching Figshare article {} in version {}.\n".format(
article_id, article_version article_id, article_version
) )
req = Request( resp = self.urlopen(
"{}{}/versions/{}".format(host["api"], article_id, article_version), "{}{}/versions/{}".format(host["api"], article_id, article_version),
headers={"accept": "application/json"}, headers={"accept": "application/json"},
) )
resp = self.urlopen(req)
article = json.loads(resp.read().decode("utf-8")) article = resp.json()
files = deep_get(article, host["filepath"]) files = deep_get(article, host["filepath"])
# only fetch files where is_link_only: False # only fetch files where is_link_only: False

Wyświetl plik

@ -16,9 +16,7 @@ class Hydroshare(DoiProvider):
def _fetch_version(self, host): def _fetch_version(self, host):
"""Fetch resource modified date and convert to epoch""" """Fetch resource modified date and convert to epoch"""
json_response = json.loads( json_response = self.urlopen(host["version"].format(self.resource_id)).json()
self.urlopen(host["version"].format(self.resource_id)).read()
)
date = next( date = next(
item for item in json_response["dates"] if item["type"] == "modified" item for item in json_response["dates"] if item["type"] == "modified"
)["start_date"] )["start_date"]

Wyświetl plik

@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
"""Provide contents of a Zenodo deposit.""" """Provide contents of a Zenodo deposit."""
def __init__(self): def __init__(self):
super().__init__()
# We need the hostname (url where records are), api url (for metadata), # We need the hostname (url where records are), api url (for metadata),
# filepath (path to files in metadata), filename (path to filename in # filepath (path to files in metadata), filename (path to filename in
# metadata), download (path to file download URL), and type (path to item type in metadata) # metadata), download (path to file download URL), and type (path to item type in metadata)
@ -55,13 +56,12 @@ class Zenodo(DoiProvider):
host = spec["host"] host = spec["host"]
yield "Fetching Zenodo record {}.\n".format(record_id) yield "Fetching Zenodo record {}.\n".format(record_id)
req = Request( resp = self.urlopen(
"{}{}".format(host["api"], record_id), "{}{}".format(host["api"], record_id),
headers={"accept": "application/json"}, headers={"accept": "application/json"},
) )
resp = self.urlopen(req)
record = json.loads(resp.read().decode("utf-8")) record = resp.json()
is_software = deep_get(record, host["type"]).lower() == "software" is_software = deep_get(record, host["type"]).lower() == "software"
files = deep_get(record, host["filepath"]) files = deep_get(record, host["filepath"])

Wyświetl plik

@ -52,6 +52,7 @@ setup(
"python-json-logger", "python-json-logger",
"escapism", "escapism",
"jinja2", "jinja2",
"requests",
"ruamel.yaml>=0.15", "ruamel.yaml>=0.15",
"toml", "toml",
"semver", "semver",

Wyświetl plik

@ -1,6 +1,7 @@
import json import json
import os import os
import pytest import pytest
import re
from io import BytesIO from io import BytesIO
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
@ -19,7 +20,7 @@ test_hosts = [
"doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
"10.7910/DVN/6ZXAGT", "10.7910/DVN/6ZXAGT",
"https://dataverse.harvard.edu/api/access/datafile/3323458", "https://dataverse.harvard.edu/api/access/datafile/3323458",
"hdl:11529/10016", "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
], ],
[ [
{"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}, {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
@ -27,57 +28,68 @@ test_hosts = [
], ],
) )
] ]
test_responses = { doi_responses = {
"doi:10.7910/DVN/6ZXAGT/3YRRYJ": ( "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
"https://dataverse.harvard.edu/file.xhtml" "https://dataverse.harvard.edu/file.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ" "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
), ),
"doi:10.7910/DVN/6ZXAGT": ( "https://doi.org/10.7910/DVN/6ZXAGT": (
"https://dataverse.harvard.edu/dataset.xhtml" "https://dataverse.harvard.edu/dataset.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT" "?persistentId=doi:10.7910/DVN/6ZXAGT"
), ),
"10.7910/DVN/6ZXAGT": ( "https://dataverse.harvard.edu/api/access/datafile/3323458": (
"https://dataverse.harvard.edu/dataset.xhtml" "https://dataverse.harvard.edu/api/access/datafile/3323458"
"?persistentId=doi:10.7910/DVN/6ZXAGT" ),
"https://doi.org/10.21105/joss.01277": (
"https://joss.theoj.org/papers/10.21105/joss.01277"
), ),
"https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
"hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
}
test_search = {
"data": {
"count_in_response": 1,
"items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
}
} }
@pytest.mark.parametrize("test_input, expected", test_hosts) @pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected): def test_detect_dataverse(test_input, expected, requests_mock):
def doi_resolver(url): def doi_resolver(req, context):
return test_responses.get(url) resp = doi_responses.get(req.url)
# doi responses are redirects
if resp is not None:
context.status_code = 302
context.headers["Location"] = resp
return resp
with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object( requests_mock.get(re.compile("https://"), json=doi_resolver)
Dataverse, "doi2url", side_effect=doi_resolver requests_mock.get(
) as fake_doi2url: "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode() json={
# valid Dataverse DOIs trigger this content provider "count_in_response": 1,
assert Dataverse().detect(test_input[0]) == expected[0] "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
assert fake_doi2url.call_count == 2 # File, then dataset },
assert Dataverse().detect(test_input[1]) == expected[0] )
assert Dataverse().detect(test_input[2]) == expected[0]
# only two of the three calls above have to resolve a DOI
assert fake_urlopen.call_count == 1
assert Dataverse().detect(test_input[3]) == expected[1]
with patch.object(Dataverse, "urlopen") as fake_urlopen: assert requests_mock.call_count == 0
# Don't trigger the Dataverse content provider # valid Dataverse DOIs trigger this content provider
assert Dataverse().detect("/some/path/here") is None assert Dataverse().detect(test_input[0]) == expected[0]
assert Dataverse().detect("https://example.com/path/here") is None # 4: doi resolution (302), File, doi resolution (302), then dataset
# don't handle DOIs that aren't from Dataverse assert requests_mock.call_count == 4
fake_urlopen.return_value.url = ( requests_mock.reset_mock()
"http://joss.theoj.org/papers/10.21105/joss.01277"
) assert Dataverse().detect(test_input[1]) == expected[0]
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None # 2: doi (302), dataset
assert requests_mock.call_count == 2
requests_mock.reset_mock()
assert Dataverse().detect(test_input[2]) == expected[0]
# 1: datafile (search dataverse for the file id)
assert requests_mock.call_count == 1
requests_mock.reset_mock()
assert Dataverse().detect(test_input[3]) == expected[1]
requests_mock.reset_mock()
# Don't trigger the Dataverse content provider
assert Dataverse().detect("/some/path/here") is None
assert Dataverse().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Dataverse
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
@pytest.fixture @pytest.fixture
@ -95,50 +107,50 @@ def dv_files(tmpdir):
return [f1, f2, f3] return [f1, f2, f3]
def test_dataverse_fetch(dv_files): def test_dataverse_fetch(dv_files, requests_mock):
mock_response_ds_query = BytesIO( mock_response = {
json.dumps( "latestVersion": {
{ "files": [
"data": { {"dataFile": {"id": 1}, "label": "some-file.txt"},
"latestVersion": { {
"files": [ "dataFile": {"id": 2},
{"dataFile": {"id": 1}, "label": "some-file.txt"}, "label": "some-other-file.txt",
{ "directoryLabel": "directory",
"dataFile": {"id": 2}, },
"label": "some-other-file.txt", {
"directoryLabel": "directory", "dataFile": {"id": 3},
}, "label": "the-other-file.txt",
{ "directoryLabel": "directory/subdirectory",
"dataFile": {"id": 3}, },
"label": "the-other-file.txt", ]
"directoryLabel": "directory/subdirectory", }
}, }
]
}
}
}
).encode("utf-8")
)
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"} spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
def mock_filecontent(req, context):
file_no = int(req.url.split("/")[-1]) - 1
return open(dv_files[file_no], "rb").read()
requests_mock.get(
"https://dataverse.harvard.edu/api/datasets/"
":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
json=mock_response,
)
requests_mock.get(
re.compile("https://dataverse.harvard.edu/api/access/datafile"),
content=mock_filecontent,
)
dv = Dataverse() dv = Dataverse()
def mock_urlopen(self, req): with TemporaryDirectory() as d:
if isinstance(req, Request): output = []
return mock_response_ds_query for l in dv.fetch(spec, d):
else: output.append(l)
file_no = int(req.split("/")[-1]) - 1 unpacked_files = set(os.listdir(d))
return urlopen("file://{}".format(dv_files[file_no])) expected = set(["directory", "some-file.txt"])
assert expected == unpacked_files
with patch.object(Dataverse, "urlopen", new=mock_urlopen): assert os.path.isfile(
with TemporaryDirectory() as d: os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
output = [] )
for l in dv.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["directory", "some-file.txt"])
assert expected == unpacked_files
assert os.path.isfile(
os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
)

Wyświetl plik

@ -11,6 +11,7 @@ from zipfile import ZipFile
from repo2docker.contentproviders.doi import DoiProvider from repo2docker.contentproviders.doi import DoiProvider
from repo2docker.contentproviders.base import ContentProviderException from repo2docker.contentproviders.base import ContentProviderException
from repo2docker import __version__
def test_content_id(): def test_content_id():
@ -18,20 +19,15 @@ def test_content_id():
assert doi.content_id is None assert doi.content_id is None
def fake_urlopen(req): def test_url_headers(requests_mock):
print(req) requests_mock.get("https://mybinder.org", text="resp")
return req.headers
@patch("urllib.request.urlopen", fake_urlopen)
def test_url_headers():
doi = DoiProvider() doi = DoiProvider()
headers = {"test1": "value1", "Test2": "value2"} headers = {"test1": "value1", "Test2": "value2"}
result = doi.urlopen("https://mybinder.org", headers=headers) result = doi.urlopen("https://mybinder.org", headers=headers)
assert "Test1" in result assert "test1" in result.request.headers
assert "Test2" in result assert "Test2" in result.request.headers
assert len(result) is 3 # User-agent is also set assert result.request.headers["User-Agent"] == "repo2docker {}".format(__version__)
def test_unresolving_doi(): def test_unresolving_doi():

Wyświetl plik

@ -22,12 +22,17 @@ test_content_ids = [
@pytest.mark.parametrize("link,expected", test_content_ids) @pytest.mark.parametrize("link,expected", test_content_ids)
def test_content_id(link, expected): def test_content_id(link, expected, requests_mock):
with patch.object(Figshare, "urlopen") as fake_urlopen: def mocked_get(req, context):
fake_urlopen.return_value.url = link if req.url.startswith("https://doi.org"):
fig = Figshare() context.status_code = 302
fig.detect("10.6084/m9.figshare.9782777") context.headers["Location"] = link
assert fig.content_id == expected return link
requests_mock.get(re.compile("https://"), text=mocked_get)
fig = Figshare()
fig.detect("10.6084/m9.figshare.9782777")
assert fig.content_id == expected
test_fig = Figshare() test_fig = Figshare()
@ -103,76 +108,73 @@ def figshare_archive(prefix="a_directory"):
yield zfile.name yield zfile.name
def test_fetch_zip(): def test_fetch_zip(requests_mock):
# see test_zenodo.py/test_fetch_software # see test_zenodo.py/test_fetch_software
with figshare_archive() as fig_path: with figshare_archive() as fig_path:
mock_response = BytesIO( mock_response = {
json.dumps( "files": [
{ {
"files": [ "name": "afake.zip",
{ "is_link_only": False,
"name": "afake.zip", "download_url": "file://{}".format(fig_path),
"is_link_only": False,
"download_url": "file://{}".format(fig_path),
}
]
} }
).encode("utf-8") ]
}
requests_mock.get(
"https://api.figshare.com/v2/articles/123456/versions/42",
json=mock_response,
)
requests_mock.get(
"file://{}".format(fig_path), content=open(fig_path, "rb").read()
) )
def mock_urlopen(self, req): # with patch.object(Figshare, "urlopen", new=mock_urlopen):
if isinstance(req, Request): with TemporaryDirectory() as d:
return mock_response output = []
else: for l in test_fig.fetch(test_spec, d):
return urlopen(req) output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["some-other-file.txt", "some-file.txt"])
assert expected == unpacked_files
def test_fetch_data(requests_mock):
with figshare_archive() as a_path:
with figshare_archive() as b_path:
mock_response = {
"files": [
{
"name": "afake.file",
"download_url": "file://{}".format(a_path),
"is_link_only": False,
},
{
"name": "bfake.data",
"download_url": "file://{}".format(b_path),
"is_link_only": False,
},
{"name": "cfake.link", "is_link_only": True},
]
}
requests_mock.get(
"https://api.figshare.com/v2/articles/123456/versions/42",
json=mock_response,
)
requests_mock.get(
"file://{}".format(a_path), content=open(a_path, "rb").read()
)
requests_mock.get(
"file://{}".format(b_path), content=open(b_path, "rb").read()
)
with patch.object(Figshare, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d: with TemporaryDirectory() as d:
output = [] output = []
for l in test_fig.fetch(test_spec, d): for l in test_fig.fetch(test_spec, d):
output.append(l) output.append(l)
unpacked_files = set(os.listdir(d)) unpacked_files = set(os.listdir(d))
expected = set(["some-other-file.txt", "some-file.txt"]) # ZIP files shouldn't have been unpacked
expected = {"bfake.data", "afake.file"}
assert expected == unpacked_files assert expected == unpacked_files
def test_fetch_data():
with figshare_archive() as a_path:
with figshare_archive() as b_path:
mock_response = BytesIO(
json.dumps(
{
"files": [
{
"name": "afake.file",
"download_url": "file://{}".format(a_path),
"is_link_only": False,
},
{
"name": "bfake.data",
"download_url": "file://{}".format(b_path),
"is_link_only": False,
},
{"name": "cfake.link", "is_link_only": True},
]
}
).encode("utf-8")
)
def mock_urlopen(self, req):
if isinstance(req, Request):
return mock_response
else:
return urlopen(req)
with patch.object(Figshare, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
output = []
for l in test_fig.fetch(test_spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
# ZIP files shouldn't have been unpacked
expected = {"bfake.data", "afake.file"}
assert expected == unpacked_files

Wyświetl plik

@ -5,87 +5,98 @@ from contextlib import contextmanager
from tempfile import TemporaryDirectory, NamedTemporaryFile from tempfile import TemporaryDirectory, NamedTemporaryFile
from unittest.mock import patch from unittest.mock import patch
from zipfile import ZipFile from zipfile import ZipFile
import re
from repo2docker.contentproviders import Hydroshare from repo2docker.contentproviders import Hydroshare
from repo2docker.contentproviders.base import ContentProviderException from repo2docker.contentproviders.base import ContentProviderException
def test_content_id(): doi_responses = {
with patch.object(Hydroshare, "urlopen") as fake_urlopen: "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": (
fake_urlopen.return_value.url = ( "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
),
"https://doi.org/10.21105/joss.01277": (
"https://joss.theoj.org/papers/10.21105/joss.01277"
),
}
def doi_resolver(req, context):
resp = doi_responses.get(req.url)
# doi responses are redirects
if resp is not None:
context.status_code = 302
context.headers["Location"] = resp
return resp
hydroshare_data = {
"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]
}
def test_content_id(requests_mock):
requests_mock.get(re.compile("https://"), json=hydroshare_data)
requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
hydro = Hydroshare()
hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
def test_detect_hydroshare(requests_mock):
requests_mock.get(re.compile("https://"), json=hydroshare_data)
requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
# valid Hydroshare DOIs trigger this content provider
expected = {
"host": {
"hostname": [
"https://www.hydroshare.org/resource/",
"http://www.hydroshare.org/resource/",
],
"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
},
"resource": "b8f6eae9d89241cf8b5904033460af61",
"version": "1569427757",
}
assert (
Hydroshare().detect(
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
) )
== expected
)
# assert a call to urlopen was called to fetch version
assert requests_mock.call_count == 1
requests_mock.reset_mock()
def read(): assert (
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
)
# assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
assert requests_mock.call_count == 3
requests_mock.reset_mock()
fake_urlopen.return_value.read = read assert (
hydro = Hydroshare() Hydroshare().detect(
"https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
def test_detect_hydroshare():
with patch.object(Hydroshare, "urlopen") as fake_urlopen:
fake_urlopen.return_value.url = (
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
) )
== expected
)
# assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
assert requests_mock.call_count == 3
requests_mock.reset_mock()
def read(): # Don't trigger the Hydroshare content provider
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' assert Hydroshare().detect("/some/path/here") is None
assert Hydroshare().detect("https://example.com/path/here") is None
fake_urlopen.return_value.read = read # don't handle DOIs that aren't from Hydroshare
# valid Hydroshare DOIs trigger this content provider assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
expected = {
"host": {
"hostname": [
"https://www.hydroshare.org/resource/",
"http://www.hydroshare.org/resource/",
],
"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
},
"resource": "b8f6eae9d89241cf8b5904033460af61",
"version": "1569427757",
}
assert (
Hydroshare().detect(
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
)
== expected
)
# assert a call to urlopen was called to fetch version
assert fake_urlopen.call_count == 1
assert (
Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
== expected
)
# assert 2 more calls were made, one to resolve the DOI and another to fetch the version
assert fake_urlopen.call_count == 3
assert (
Hydroshare().detect(
"https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
)
== expected
)
# assert 2 more calls were made, one to resolve the DOI and another to fetch the version
assert fake_urlopen.call_count == 5
with patch.object(Hydroshare, "urlopen") as fake_urlopen:
# Don't trigger the Hydroshare content provider
assert Hydroshare().detect("/some/path/here") is None
assert Hydroshare().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Hydroshare
fake_urlopen.return_value.url = (
"http://joss.theoj.org/papers/10.21105/joss.01277"
)
def read():
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
fake_urlopen.return_value.read = read
assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
@contextmanager @contextmanager

Wyświetl plik

@ -1,6 +1,7 @@
import json import json
import os import os
import pytest import pytest
import re
from contextlib import contextmanager from contextlib import contextmanager
from io import BytesIO from io import BytesIO
@ -11,14 +12,30 @@ from zipfile import ZipFile
from repo2docker.contentproviders import Zenodo from repo2docker.contentproviders import Zenodo
doi_responses = {
"https://doi.org/10.5281/zenodo.3232985": ("https://zenodo.org/record/3232985"),
"https://doi.org/10.22002/d1.1235": ("https://data.caltech.edu/records/1235"),
"https://doi.org/10.21105/joss.01277": (
"https://joss.theoj.org/papers/10.21105/joss.01277"
),
}
def test_content_id():
with patch.object(Zenodo, "urlopen") as fake_urlopen:
fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
zen = Zenodo()
zen.detect("10.5281/zenodo.3232985") def doi_resolver(req, context):
assert zen.content_id == "3232985" resp = doi_responses.get(req.url)
# doi responses are redirects
if resp is not None:
context.status_code = 302
context.headers["Location"] = resp
return resp
def test_content_id(requests_mock):
requests_mock.get(re.compile("https://"), json=doi_resolver)
zen = Zenodo()
zen.detect("10.5281/zenodo.3232985")
assert zen.content_id == "3232985"
test_zen = Zenodo() test_zen = Zenodo()
@ -43,25 +60,22 @@ test_hosts = [
@pytest.mark.parametrize("test_input,expected", test_hosts) @pytest.mark.parametrize("test_input,expected", test_hosts)
def test_detect_zenodo(test_input, expected): def test_detect_zenodo(test_input, expected, requests_mock):
with patch.object(Zenodo, "urlopen") as fake_urlopen: requests_mock.get(re.compile("https://"), json=doi_resolver)
fake_urlopen.return_value.url = test_input[0] # valid Zenodo DOIs trigger this content provider
# valid Zenodo DOIs trigger this content provider assert Zenodo().detect(test_input[0]) == expected
assert Zenodo().detect(test_input[0]) == expected assert Zenodo().detect(test_input[1]) == expected
assert Zenodo().detect(test_input[1]) == expected assert Zenodo().detect(test_input[2]) == expected
assert Zenodo().detect(test_input[2]) == expected # only two of the three calls above have to resolve a DOI (2 req per doi resolution)
# only two of the three calls above have to resolve a DOI assert requests_mock.call_count == 4
assert fake_urlopen.call_count == 2 requests_mock.reset_mock()
with patch.object(Zenodo, "urlopen") as fake_urlopen: # Don't trigger the Zenodo content provider
# Don't trigger the Zenodo content provider assert Zenodo().detect("/some/path/here") is None
assert Zenodo().detect("/some/path/here") is None assert Zenodo().detect("https://example.com/path/here") is None
assert Zenodo().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Zenodo # don't handle DOIs that aren't from Zenodo
fake_urlopen.return_value.url = ( assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
"http://joss.theoj.org/papers/10.21105/joss.01277"
)
assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
@contextmanager @contextmanager
@ -74,120 +88,102 @@ def zenodo_archive(prefix="a_directory"):
yield zfile.name yield zfile.name
def test_fetch_software_from_github_archive(requests_mock):
    """Fetch a Zenodo "software" record produced by the Zenodo-GitHub
    integration (the ZIP lives inside a subdirectory) and check that the
    archive contents are unpacked into the target directory.

    ``requests_mock`` is the pytest fixture from the requests-mock package;
    it intercepts HTTP calls made through ``requests``.
    """
    # we "fetch" a local ZIP file to simulate a Zenodo record created from a
    # GitHub repository via the Zenodo-GitHub integration
    with zenodo_archive() as zen_path:
        mock_response = {
            "files": [
                {
                    # GitHub-generated records nest the ZIP in a directory
                    "filename": "some_dir/afake.zip",
                    "links": {"download": "file://{}".format(zen_path)},
                }
            ],
            "metadata": {"upload_type": "software"},
        }
        requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
        # serve the local ZIP as the "download" target; read it eagerly so
        # the file handle is closed before fetch() runs
        with open(zen_path, "rb") as f:
            zip_bytes = f.read()
        requests_mock.get("file://{}".format(zen_path), content=zip_bytes)

        zen = Zenodo()
        spec = {"host": test_zen.hosts[0], "record": "1234"}

        with TemporaryDirectory() as d:
            output = []
            for l in zen.fetch(spec, d):
                output.append(l)

            unpacked_files = set(os.listdir(d))
            # a "software" record gets its ZIP unpacked into the target dir
            expected = set(["some-other-file.txt", "some-file.txt"])
            assert expected == unpacked_files
def test_fetch_software(requests_mock):
    """Fetch a plain Zenodo "software" record (ZIP at the archive root,
    unlike the GitHub-generated layout) and check it is unpacked.

    ``requests_mock`` is the pytest fixture from the requests-mock package.
    """
    # we "fetch" a local ZIP file to simulate a Zenodo software record with a
    # ZIP file in it
    with zenodo_archive() as zen_path:
        mock_response = {
            "files": [
                {
                    # this is the difference to the GitHub generated one,
                    # the ZIP file isn't in a directory
                    "filename": "afake.zip",
                    "links": {"download": "file://{}".format(zen_path)},
                }
            ],
            "metadata": {"upload_type": "software"},
        }
        requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
        # serve the local ZIP as the "download" target; read eagerly so the
        # file handle is closed before fetch() runs
        with open(zen_path, "rb") as f:
            zip_bytes = f.read()
        requests_mock.get("file://{}".format(zen_path), content=zip_bytes)

        with TemporaryDirectory() as d:
            zen = Zenodo()
            # (was a duplicated `spec = spec = ...` assignment)
            spec = {"host": test_zen.hosts[0], "record": "1234"}
            output = []
            for l in zen.fetch(spec, d):
                output.append(l)

            unpacked_files = set(os.listdir(d))
            expected = set(["some-other-file.txt", "some-file.txt"])
            assert expected == unpacked_files
def test_fetch_data(requests_mock):
    """Fetch a Zenodo "data" record with two ZIP files and check that the
    archives are downloaded as-is, NOT unpacked (only "software" records
    get their archives extracted).

    ``requests_mock`` is the pytest fixture from the requests-mock package.
    """
    # we "fetch" two local ZIP files to simulate a Zenodo data record
    with zenodo_archive() as a_zen_path:
        with zenodo_archive() as b_zen_path:
            mock_response = {
                "files": [
                    {
                        "filename": "afake.zip",
                        "links": {"download": "file://{}".format(a_zen_path)},
                    },
                    {
                        "filename": "bfake.zip",
                        "links": {"download": "file://{}".format(b_zen_path)},
                    },
                ],
                "metadata": {"upload_type": "data"},
            }
            requests_mock.get(
                "https://zenodo.org/api/records/1234", json=mock_response
            )
            # serve both local ZIPs as download targets; read eagerly so the
            # file handles are closed before fetch() runs
            with open(a_zen_path, "rb") as f:
                a_bytes = f.read()
            requests_mock.get("file://{}".format(a_zen_path), content=a_bytes)
            with open(b_zen_path, "rb") as f:
                b_bytes = f.read()
            requests_mock.get("file://{}".format(b_zen_path), content=b_bytes)

            with TemporaryDirectory() as d:
                zen = Zenodo()
                spec = {"host": test_zen.hosts[0], "record": "1234"}
                output = []
                for l in zen.fetch(spec, d):
                    output.append(l)

                unpacked_files = set(os.listdir(d))
                # ZIP files shouldn't have been unpacked
                expected = {"bfake.zip", "afake.zip"}
                assert expected == unpacked_files