From 830a9c89c00ab53db3f4ccfae0cef942f4ce367b Mon Sep 17 00:00:00 2001
From: David Douard
Date: Thu, 10 Dec 2020 18:55:14 +0100
Subject: [PATCH] Replace urllib with requests in contentproviders

requests is simpler to use overall, and more and more people are
familiar with it than with urllib.
---
 dev-requirements.txt                           |   2 +-
 repo2docker/contentproviders/dataverse.py      |  17 +-
 repo2docker/contentproviders/doi.py            |  91 ++++---
 repo2docker/contentproviders/figshare.py       |   6 +-
 repo2docker/contentproviders/hydroshare.py     |   4 +-
 repo2docker/contentproviders/zenodo.py         |   6 +-
 setup.py                                       |   1 +
 tests/unit/contentproviders/test_dataverse.py  | 176 +++++++-------
 tests/unit/contentproviders/test_doi.py        |  16 +-
 tests/unit/contentproviders/test_figshare.py   | 132 ++++++-----
 .../unit/contentproviders/test_hydroshare.py   | 149 ++++++------
 tests/unit/contentproviders/test_zenodo.py     | 224 +++++++++---------
 12 files changed, 427 insertions(+), 397 deletions(-)

diff --git a/dev-requirements.txt b/dev-requirements.txt
index bcecb7ba..4b75d1bb 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -5,4 +5,4 @@ pytest>=4.6
 wheel
 pytest-cov
 pre-commit
-requests
+requests_mock

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 7519d700..66632f5e 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -2,7 +2,6 @@ import os
 import json
 import shutil
 
-from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
 
 from .doi import DoiProvider
@@ -56,7 +55,6 @@ class Dataverse(DoiProvider):
             return
 
         query_args = parse_qs(parsed_url.query)
-
         # Corner case handling
         if parsed_url.path.startswith("/file.xhtml"):
             # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@@ -75,8 +73,7 @@ class Dataverse(DoiProvider):
             parsed_url._replace(path="/api/search", query=search_query)
         )
         self.log.debug("Querying Dataverse: " + search_url)
-        resp = self.urlopen(search_url).read()
-        data = json.loads(resp.decode("utf-8"))["data"]
+        data = self.urlopen(search_url).json()["data"]
         if data["count_in_response"] != 1:
             self.log.debug(
                 "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@@ -101,14 +98,12 @@ class Dataverse(DoiProvider):
         host = spec["host"]
 
         yield "Fetching Dataverse record {}.\n".format(record_id)
-        req = Request(
-            "{}/api/datasets/:persistentId?persistentId={}".format(
-                host["url"], record_id
-            ),
-            headers={"accept": "application/json"},
+        url = "{}/api/datasets/:persistentId?persistentId={}".format(
+            host["url"], record_id
         )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))["data"]
+
+        resp = self.urlopen(url, headers={"accept": "application/json"})
+        record = resp.json()["data"]
 
         for fobj in deep_get(record, "latestVersion.files"):
             file_url = "{}/api/access/datafile/{}".format(
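Note: for reviewers less used to requests, the whole patch is variations on the
exchange below. This is an illustrative sketch only, not part of the diff; the
URL is a placeholder:

    import json
    from urllib.request import Request, urlopen

    import requests

    url = "https://example.org/api/record/1"  # placeholder endpoint

    # urllib: build a Request, read bytes, decode, then parse by hand
    req = Request(url, headers={"accept": "application/json"})
    with urlopen(req) as resp:
        record = json.loads(resp.read().decode("utf-8"))

    # requests: one call; .json() does the decoding and parsing
    record = requests.get(url, headers={"accept": "application/json"}).json()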
diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py
index 6ec2c8e1..c1941ba8 100644
--- a/repo2docker/contentproviders/doi.py
+++ b/repo2docker/contentproviders/doi.py
@@ -5,8 +5,8 @@ import logging
 
 from os import makedirs
 from os import path
-from urllib import request  # urlopen, Request
-from urllib.error import HTTPError
+from requests import Session, HTTPError
+
 from zipfile import ZipFile, is_zipfile
 
 from .base import ContentProvider
@@ -18,7 +18,21 @@ from .. import __version__
 
 
 class DoiProvider(ContentProvider):
     """Provide contents of a repository identified by a DOI and some helper functions."""
 
-    def urlopen(self, req, headers=None):
-        """A urlopen() helper"""
-        # someone passed a string, not a request
-        if not isinstance(req, request.Request):
+    def __init__(self):
+        super().__init__()
+        self.session = Session()
+        self.session.headers.update(
+            {
+                "user-agent": "repo2docker {}".format(__version__),
+            }
+        )
+
+    def _request(self, url, **kwargs):
+        return self.session.get(url, **kwargs)
+
+    urlopen = _request
@@ -38,7 +52,8 @@ class DoiProvider(ContentProvider):
         doi = normalize_doi(doi)
 
         try:
-            resp = self.urlopen("https://doi.org/{}".format(doi))
+            resp = self._request("https://doi.org/{}".format(doi))
+            resp.raise_for_status()
         # If the DOI doesn't resolve, just return URL
         except HTTPError:
             return doi
@@ -53,38 +68,42 @@ class DoiProvider(ContentProvider):
             file_url = deep_get(file_ref, host["download"])
             fname = deep_get(file_ref, host["filename"])
             logging.debug("Downloading file {} as {}\n".format(file_url, fname))
-            with self.urlopen(file_url) as src:
-                if path.dirname(fname):
-                    sub_dir = path.join(output_dir, path.dirname(fname))
-                    if not path.exists(sub_dir):
-                        yield "Creating {}\n".format(sub_dir)
-                        makedirs(sub_dir, exist_ok=True)
-
-                dst_fname = path.join(output_dir, fname)
-                with open(dst_fname, "wb") as dst:
-                    yield "Fetching {}\n".format(fname)
-                    shutil.copyfileobj(src, dst)
-            # first close the newly written file, then continue
-            # processing it
-            if unzip and is_zipfile(dst_fname):
-                yield "Extracting {}\n".format(fname)
-                zfile = ZipFile(dst_fname)
-                zfile.extractall(path=output_dir)
-                zfile.close()
-
-                # delete downloaded file ...
-                os.remove(dst_fname)
-                # ... and any directories we might have created,
-                # in which case sub_dir will be defined
-                if path.dirname(fname):
-                    shutil.rmtree(sub_dir)
-
-                new_subdirs = os.listdir(output_dir)
-                # if there is only one new subdirectory move its contents
-                # to the top level directory
-                if len(new_subdirs) == 1:
-                    d = new_subdirs[0]
-                    copytree(path.join(output_dir, d), output_dir)
-                    shutil.rmtree(path.join(output_dir, d))
-
-        yield "Fetched files: {}\n".format(os.listdir(output_dir))
+
+            yield "Requesting {}\n".format(file_url)
+            resp = self._request(file_url, stream=True)
+            resp.raise_for_status()
+
+            if path.dirname(fname):
+                sub_dir = path.join(output_dir, path.dirname(fname))
+                if not path.exists(sub_dir):
+                    yield "Creating {}\n".format(sub_dir)
+                    makedirs(sub_dir, exist_ok=True)
+
+            dst_fname = path.join(output_dir, fname)
+            with open(dst_fname, "wb") as dst:
+                yield "Fetching {}\n".format(fname)
+                for chunk in resp.iter_content(chunk_size=None):
+                    dst.write(chunk)
+
+            if unzip and is_zipfile(dst_fname):
+                yield "Extracting {}\n".format(fname)
+                zfile = ZipFile(dst_fname)
+                zfile.extractall(path=output_dir)
+                zfile.close()
+
+                # delete downloaded file ...
+                os.remove(dst_fname)
+                # ... and any directories we might have created,
+                # in which case sub_dir will be defined
+                if path.dirname(fname):
+                    shutil.rmtree(sub_dir)
+
+                new_subdirs = os.listdir(output_dir)
+                # if there is only one new subdirectory move its contents
+                # to the top level directory
+                if len(new_subdirs) == 1:
+                    d = new_subdirs[0]
+                    copytree(path.join(output_dir, d), output_dir)
+                    shutil.rmtree(path.join(output_dir, d))
+
+        yield "Fetched files: {}\n".format(os.listdir(output_dir))
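Note: the DoiProvider rewrite above combines two requests idioms, a shared
Session carrying default headers and streamed downloads. In isolation the
technique looks like this (a minimal sketch; URL and filename are placeholders):

    from requests import Session

    session = Session()
    # headers set here are sent with every request made through this session
    session.headers.update({"user-agent": "repo2docker dev"})

    # stream=True defers the body download; iter_content(chunk_size=None)
    # then yields chunks as they arrive instead of buffering the whole file
    resp = session.get("https://example.org/files/archive.zip", stream=True)
    resp.raise_for_status()  # turn 4xx/5xx responses into requests.HTTPError
    with open("archive.zip", "wb") as dst:
        for chunk in resp.iter_content(chunk_size=None):
            dst.write(chunk)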
diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py
index 43ec71fa..0fa09719 100644
--- a/repo2docker/contentproviders/figshare.py
+++ b/repo2docker/contentproviders/figshare.py
@@ -25,6 +25,7 @@ class Figshare(DoiProvider):
     """
 
     def __init__(self):
+        super().__init__()
         self.hosts = [
             {
                 "hostname": [
@@ -74,13 +75,12 @@ class Figshare(DoiProvider):
         yield "Fetching Figshare article {} in version {}.\n".format(
             article_id, article_version
         )
-        req = Request(
+        resp = self.urlopen(
             "{}{}/versions/{}".format(host["api"], article_id, article_version),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
-        article = json.loads(resp.read().decode("utf-8"))
+        article = resp.json()
 
         files = deep_get(article, host["filepath"])
         # only fetch files where is_link_only: False

diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py
index 2531aa8b..eac66e21 100755
--- a/repo2docker/contentproviders/hydroshare.py
+++ b/repo2docker/contentproviders/hydroshare.py
@@ -16,9 +16,7 @@ class Hydroshare(DoiProvider):
 
     def _fetch_version(self, host):
         """Fetch resource modified date and convert to epoch"""
-        json_response = json.loads(
-            self.urlopen(host["version"].format(self.resource_id)).read()
-        )
+        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]

diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py
index 87e7967f..68d3494d 100644
--- a/repo2docker/contentproviders/zenodo.py
+++ b/repo2docker/contentproviders/zenodo.py
@@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
     """Provide contents of a Zenodo deposit."""
 
     def __init__(self):
+        super().__init__()
        # We need the hostname (url where records are), api url (for metadata),
        # filepath (path to files in metadata), filename (path to filename in
        # metadata), download (path to file download URL), and type (path to item type in metadata)
@@ -55,13 +56,12 @@ class Zenodo(DoiProvider):
         host = spec["host"]
 
         yield "Fetching Zenodo record {}.\n".format(record_id)
-        req = Request(
+        resp = self.urlopen(
             "{}{}".format(host["api"], record_id),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))
+        record = resp.json()
 
         is_software = deep_get(record, host["type"]).lower() == "software"
         files = deep_get(record, host["filepath"])

diff --git a/setup.py b/setup.py
index 5084b486..8bfe64eb 100644
--- a/setup.py
+++ b/setup.py
@@ -52,6 +52,7 @@ setup(
         "python-json-logger",
         "escapism",
         "jinja2",
+        "requests",
         "ruamel.yaml>=0.15",
         "toml",
         "semver",
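Note: the test rewrites below all follow one recipe, built on the requests_mock
pytest fixture that replaces the hand-rolled urlopen patches (hence the
dev-requirements change). A sketch of the moving parts; example.org is a
placeholder:

    import re
    import requests

    def test_sketch(requests_mock):  # fixture provided by the requests-mock plugin
        # matchers are consulted newest-first, so register catch-alls
        # before more specific URLs
        requests_mock.get(re.compile("https://"), text="fallback")
        requests_mock.get("https://example.org/api/records/1", json={"id": 1})

        assert requests.get("https://example.org/api/records/1").json() == {"id": 1}
        assert requests.get("https://example.org/elsewhere").text == "fallback"
        assert requests_mock.call_count == 2  # intercepted requests are counted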
diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py
index 76d39654..0e53e50a 100644
--- a/tests/unit/contentproviders/test_dataverse.py
+++ b/tests/unit/contentproviders/test_dataverse.py
@@ -1,6 +1,7 @@
 import json
 import os
 import pytest
+import re
 
 from io import BytesIO
 from tempfile import TemporaryDirectory
@@ -19,7 +20,7 @@ test_hosts = [
             "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
             "10.7910/DVN/6ZXAGT",
             "https://dataverse.harvard.edu/api/access/datafile/3323458",
-            "hdl:11529/10016",
+            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
         ],
         [
             {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
         ],
     )
 ]
-test_responses = {
-    "doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
+doi_responses = {
+    "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
         "https://dataverse.harvard.edu/file.xhtml"
         "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
     ),
-    "doi:10.7910/DVN/6ZXAGT": (
+    "https://doi.org/10.7910/DVN/6ZXAGT": (
         "https://dataverse.harvard.edu/dataset.xhtml"
         "?persistentId=doi:10.7910/DVN/6ZXAGT"
     ),
-    "10.7910/DVN/6ZXAGT": (
-        "https://dataverse.harvard.edu/dataset.xhtml"
-        "?persistentId=doi:10.7910/DVN/6ZXAGT"
+    "https://dataverse.harvard.edu/api/access/datafile/3323458": (
+        "https://dataverse.harvard.edu/api/access/datafile/3323458"
+    ),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
     ),
-    "https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
-    "hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
-}
-test_search = {
-    "data": {
-        "count_in_response": 1,
-        "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
-    }
 }
 
 
 @pytest.mark.parametrize("test_input, expected", test_hosts)
-def test_detect_dataverse(test_input, expected):
-    def doi_resolver(url):
-        return test_responses.get(url)
+def test_detect_dataverse(test_input, expected, requests_mock):
+    def doi_resolver(req, context):
+        resp = doi_responses.get(req.url)
+        # doi responses are redirects
+        if resp is not None:
+            context.status_code = 302
+            context.headers["Location"] = resp
+        return resp
 
-    with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
-        Dataverse, "doi2url", side_effect=doi_resolver
-    ) as fake_doi2url:
-        fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
-        # valid Dataverse DOIs trigger this content provider
-        assert Dataverse().detect(test_input[0]) == expected[0]
-        assert fake_doi2url.call_count == 2  # File, then dataset
-        assert Dataverse().detect(test_input[1]) == expected[0]
-        assert Dataverse().detect(test_input[2]) == expected[0]
-        # only two of the three calls above have to resolve a DOI
-        assert fake_urlopen.call_count == 1
-        assert Dataverse().detect(test_input[3]) == expected[1]
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+    requests_mock.get(
+        "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
+        json={
+            "data": {
+                "count_in_response": 1,
+                "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
+            }
+        },
+    )
 
-    with patch.object(Dataverse, "urlopen") as fake_urlopen:
-        # Don't trigger the Dataverse content provider
-        assert Dataverse().detect("/some/path/here") is None
-        assert Dataverse().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Dataverse
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-        assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
+    assert requests_mock.call_count == 0
+    # valid Dataverse DOIs trigger this content provider
+    assert Dataverse().detect(test_input[0]) == expected[0]
+    # 4: doi resolution (302), file, doi resolution (302), then dataset
+    assert requests_mock.call_count == 4
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[1]) == expected[0]
+    # 2: doi resolution (302), then dataset
+    assert requests_mock.call_count == 2
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[2]) == expected[0]
+    # 1: datafile (search dataverse for the file id)
+    assert requests_mock.call_count == 1
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[3]) == expected[1]
+    requests_mock.reset_mock()
+
+    # Don't trigger the Dataverse content provider
+    assert Dataverse().detect("/some/path/here") is None
+    assert Dataverse().detect("https://example.com/path/here") is None
+    # don't handle DOIs that aren't from Dataverse
+    assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
 
 
 @pytest.fixture
@@ -95,50 +107,50 @@ def dv_files(tmpdir):
     return [f1, f2, f3]
 
 
-def test_dataverse_fetch(dv_files):
-    mock_response_ds_query = BytesIO(
-        json.dumps(
-            {
-                "data": {
-                    "latestVersion": {
-                        "files": [
-                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
-                            {
-                                "dataFile": {"id": 2},
-                                "label": "some-other-file.txt",
-                                "directoryLabel": "directory",
-                            },
-                            {
-                                "dataFile": {"id": 3},
-                                "label": "the-other-file.txt",
-                                "directoryLabel": "directory/subdirectory",
-                            },
-                        ]
-                    }
-                }
-            }
-        ).encode("utf-8")
-    )
+def test_dataverse_fetch(dv_files, requests_mock):
+    mock_response = {
+        "data": {
+            "latestVersion": {
+                "files": [
+                    {"dataFile": {"id": 1}, "label": "some-file.txt"},
+                    {
+                        "dataFile": {"id": 2},
+                        "label": "some-other-file.txt",
+                        "directoryLabel": "directory",
+                    },
+                    {
+                        "dataFile": {"id": 3},
+                        "label": "the-other-file.txt",
+                        "directoryLabel": "directory/subdirectory",
+                    },
+                ]
+            }
+        }
+    }
     spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
 
-    def mock_urlopen(self, req):
-        if isinstance(req, Request):
-            return mock_response_ds_query
-        else:
-            file_no = int(req.split("/")[-1]) - 1
-            return urlopen("file://{}".format(dv_files[file_no]))
+    def mock_filecontent(req, context):
+        file_no = int(req.url.split("/")[-1]) - 1
+        return open(dv_files[file_no], "rb").read()
 
-    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
-        with TemporaryDirectory() as d:
-            output = []
-            for l in dv.fetch(spec, d):
-                output.append(l)
-
-            unpacked_files = set(os.listdir(d))
-            expected = set(["directory", "some-file.txt"])
-            assert expected == unpacked_files
-            assert os.path.isfile(
-                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
-            )
+    requests_mock.get(
+        "https://dataverse.harvard.edu/api/datasets/"
+        ":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
+        json=mock_response,
+    )
+    requests_mock.get(
+        re.compile("https://dataverse.harvard.edu/api/access/datafile"),
+        content=mock_filecontent,
+    )
+    dv = Dataverse()
+
+    with TemporaryDirectory() as d:
+        output = []
+        for l in dv.fetch(spec, d):
+            output.append(l)
+
+        unpacked_files = set(os.listdir(d))
+        expected = set(["directory", "some-file.txt"])
+        assert expected == unpacked_files
+        assert os.path.isfile(
+            os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
+        )

diff --git a/tests/unit/contentproviders/test_doi.py b/tests/unit/contentproviders/test_doi.py
index 16d321e6..dbe39160 100644
--- a/tests/unit/contentproviders/test_doi.py
+++ b/tests/unit/contentproviders/test_doi.py
@@ -11,6 +11,7 @@ from zipfile import ZipFile
 
 from repo2docker.contentproviders.doi import DoiProvider
 from repo2docker.contentproviders.base import ContentProviderException
+from repo2docker import __version__
 
 
 def test_content_id():
@@ -18,20 +19,15 @@ def test_content_id():
     assert doi.content_id is None
 
 
-def fake_urlopen(req):
-    print(req)
-    return req.headers
-
-
-@patch("urllib.request.urlopen", fake_urlopen)
-def test_url_headers():
+def test_url_headers(requests_mock):
+    requests_mock.get("https://mybinder.org", text="resp")
     doi = DoiProvider()
 
     headers = {"test1": "value1", "Test2": "value2"}
     result = doi.urlopen("https://mybinder.org", headers=headers)
-    assert "Test1" in result
-    assert "Test2" in result
-    assert len(result) is 3  # User-agent is also set
+    assert "test1" in result.request.headers
+    assert "Test2" in result.request.headers
+    assert result.request.headers["User-Agent"] == "repo2docker {}".format(__version__)
 
 
 def test_unresolving_doi():
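Note: test_url_headers above relies on requests keeping the prepared request on
the response object, which makes assertions about what was actually sent
straightforward. A sketch, with a placeholder URL and header:

    import requests

    def test_sent_headers(requests_mock):
        requests_mock.get("https://example.org", text="ok")
        resp = requests.get("https://example.org", headers={"x-demo": "1"})
        # resp.request is the PreparedRequest; its headers are what went out,
        # matched case-insensitively
        assert resp.request.headers["X-Demo"] == "1"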
diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py
index 0e152ecf..3377d642 100644
--- a/tests/unit/contentproviders/test_figshare.py
+++ b/tests/unit/contentproviders/test_figshare.py
@@ -22,12 +22,17 @@ test_content_ids = [
 
 
 @pytest.mark.parametrize("link,expected", test_content_ids)
-def test_content_id(link, expected):
-    with patch.object(Figshare, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = link
-        fig = Figshare()
-        fig.detect("10.6084/m9.figshare.9782777")
-        assert fig.content_id == expected
+def test_content_id(link, expected, requests_mock):
+    def mocked_get(req, context):
+        if req.url.startswith("https://doi.org"):
+            context.status_code = 302
+            context.headers["Location"] = link
+        return link
+
+    requests_mock.get(re.compile("https://"), text=mocked_get)
+    fig = Figshare()
+    fig.detect("10.6084/m9.figshare.9782777")
+    assert fig.content_id == expected
 
 
 test_fig = Figshare()
@@ -103,76 +108,73 @@ def figshare_archive(prefix="a_directory"):
         yield zfile.name
 
 
-def test_fetch_zip():
+def test_fetch_zip(requests_mock):
     # see test_zenodo.py/test_fetch_software
     with figshare_archive() as fig_path:
-        mock_response = BytesIO(
-            json.dumps(
-                {
-                    "files": [
-                        {
-                            "name": "afake.zip",
-                            "is_link_only": False,
-                            "download_url": "file://{}".format(fig_path),
-                        }
-                    ]
-                }
-            ).encode("utf-8")
-        )
-
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                return mock_response
-            else:
-                return urlopen(req)
-
-        with patch.object(Figshare, "urlopen", new=mock_urlopen):
-            with TemporaryDirectory() as d:
-                output = []
-                for l in test_fig.fetch(test_spec, d):
-                    output.append(l)
-
-                unpacked_files = set(os.listdir(d))
-                expected = set(["some-other-file.txt", "some-file.txt"])
-                assert expected == unpacked_files
+        mock_response = {
+            "files": [
+                {
+                    "name": "afake.zip",
+                    "is_link_only": False,
+                    "download_url": "file://{}".format(fig_path),
+                }
+            ]
+        }
+        requests_mock.get(
+            "https://api.figshare.com/v2/articles/123456/versions/42",
+            json=mock_response,
+        )
+        requests_mock.get(
+            "file://{}".format(fig_path), content=open(fig_path, "rb").read()
+        )
+
+        with TemporaryDirectory() as d:
+            output = []
+            for l in test_fig.fetch(test_spec, d):
+                output.append(l)
+
+            unpacked_files = set(os.listdir(d))
+            expected = set(["some-other-file.txt", "some-file.txt"])
+            assert expected == unpacked_files
 
 
-def test_fetch_data():
+def test_fetch_data(requests_mock):
     with figshare_archive() as a_path:
         with figshare_archive() as b_path:
-            mock_response = BytesIO(
-                json.dumps(
-                    {
-                        "files": [
-                            {
-                                "name": "afake.file",
-                                "download_url": "file://{}".format(a_path),
-                                "is_link_only": False,
-                            },
-                            {
-                                "name": "bfake.data",
-                                "download_url": "file://{}".format(b_path),
-                                "is_link_only": False,
-                            },
-                            {"name": "cfake.link", "is_link_only": True},
-                        ]
-                    }
-                ).encode("utf-8")
-            )
-
-            def mock_urlopen(self, req):
-                if isinstance(req, Request):
-                    return mock_response
-                else:
-                    return urlopen(req)
-
-            with patch.object(Figshare, "urlopen", new=mock_urlopen):
-                with TemporaryDirectory() as d:
-                    output = []
-                    for l in test_fig.fetch(test_spec, d):
-                        output.append(l)
-
-                    unpacked_files = set(os.listdir(d))
-                    # ZIP files shouldn't have been unpacked
-                    expected = {"bfake.data", "afake.file"}
-                    assert expected == unpacked_files
+            mock_response = {
+                "files": [
+                    {
+                        "name": "afake.file",
+                        "download_url": "file://{}".format(a_path),
+                        "is_link_only": False,
+                    },
+                    {
+                        "name": "bfake.data",
+                        "download_url": "file://{}".format(b_path),
+                        "is_link_only": False,
+                    },
+                    {"name": "cfake.link", "is_link_only": True},
+                ]
+            }
+
+            requests_mock.get(
+                "https://api.figshare.com/v2/articles/123456/versions/42",
+                json=mock_response,
+            )
+            requests_mock.get(
+                "file://{}".format(a_path), content=open(a_path, "rb").read()
+            )
+            requests_mock.get(
+                "file://{}".format(b_path), content=open(b_path, "rb").read()
+            )
+
+            with TemporaryDirectory() as d:
+                output = []
+                for l in test_fig.fetch(test_spec, d):
+                    output.append(l)
+
+                unpacked_files = set(os.listdir(d))
+                # ZIP files shouldn't have been unpacked
+                expected = {"bfake.data", "afake.file"}
+                assert expected == unpacked_files
diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py
index d662dbff..4af7d157 100755
--- a/tests/unit/contentproviders/test_hydroshare.py
+++ b/tests/unit/contentproviders/test_hydroshare.py
@@ -5,87 +5,98 @@ from contextlib import contextmanager
 from tempfile import TemporaryDirectory, NamedTemporaryFile
 from unittest.mock import patch
 from zipfile import ZipFile
+import re
 
 from repo2docker.contentproviders import Hydroshare
 from repo2docker.contentproviders.base import ContentProviderException
 
 
-def test_content_id():
-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = (
-            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-        )
-
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
-
-        fake_urlopen.return_value.read = read
-        hydro = Hydroshare()
-
-        hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-        assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
+doi_responses = {
+    "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": (
+        "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
+    ),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
+    ),
+}
+
+
+def doi_resolver(req, context):
+    resp = doi_responses.get(req.url)
+    # doi responses are redirects
+    if resp is not None:
+        context.status_code = 302
+        context.headers["Location"] = resp
+    return resp
+
+
+hydroshare_data = {
+    "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]
+}
+
+
+def test_content_id(requests_mock):
+    requests_mock.get(re.compile("https://"), json=hydroshare_data)
+    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
+
+    hydro = Hydroshare()
+
+    hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
+    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
 
 
-def test_detect_hydroshare():
-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = (
-            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-        )
-
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
-
-        fake_urlopen.return_value.read = read
-        # valid Hydroshare DOIs trigger this content provider
-        expected = {
-            "host": {
-                "hostname": [
-                    "https://www.hydroshare.org/resource/",
-                    "http://www.hydroshare.org/resource/",
-                ],
-                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
-            },
-            "resource": "b8f6eae9d89241cf8b5904033460af61",
-            "version": "1569427757",
-        }
-        assert (
-            Hydroshare().detect(
-                "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-            )
-            == expected
-        )
-        # assert a call to urlopen was called to fetch version
-        assert fake_urlopen.call_count == 1
-        assert (
-            Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-            == expected
-        )
-        # assert 2 more calls were made, one to resolve the DOI and another to fetch the version
-        assert fake_urlopen.call_count == 3
-        assert (
-            Hydroshare().detect(
-                "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
-            )
-            == expected
-        )
-        # assert 2 more calls were made, one to resolve the DOI and another to fetch the version
-        assert fake_urlopen.call_count == 5
-
-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        # Don't trigger the Hydroshare content provider
-        assert Hydroshare().detect("/some/path/here") is None
-        assert Hydroshare().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Hydroshare
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
-
-        fake_urlopen.return_value.read = read
-        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
+def test_detect_hydroshare(requests_mock):
+    requests_mock.get(re.compile("https://"), json=hydroshare_data)
+    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
+
+    # valid Hydroshare DOIs trigger this content provider
+    expected = {
+        "host": {
+            "hostname": [
+                "https://www.hydroshare.org/resource/",
+                "http://www.hydroshare.org/resource/",
+            ],
+            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
+            "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
+        },
+        "resource": "b8f6eae9d89241cf8b5904033460af61",
+        "version": "1569427757",
+    }
+
+    assert (
+        Hydroshare().detect(
+            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
+        )
+        == expected
+    )
+    # assert one request was made to fetch the version
+    assert requests_mock.call_count == 1
+    requests_mock.reset_mock()
+
+    assert (
+        Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
+    )
+    # assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
+    assert requests_mock.call_count == 3
+    requests_mock.reset_mock()
+
+    assert (
+        Hydroshare().detect(
+            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
+        )
+        == expected
+    )
+    # assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
+    assert requests_mock.call_count == 3
+    requests_mock.reset_mock()
+
+    # Don't trigger the Hydroshare content provider
+    assert Hydroshare().detect("/some/path/here") is None
+    assert Hydroshare().detect("https://example.com/path/here") is None
+
+    # don't handle DOIs that aren't from Hydroshare
+    assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
 
 
 @contextmanager

diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py
index 61dcbfbf..fa8b30fa 100644
--- a/tests/unit/contentproviders/test_zenodo.py
+++ b/tests/unit/contentproviders/test_zenodo.py
@@ -1,6 +1,7 @@
 import json
 import os
 import pytest
+import re
 
 from contextlib import contextmanager
 from io import BytesIO
@@ -11,14 +12,30 @@ from zipfile import ZipFile
 
 from repo2docker.contentproviders import Zenodo
 
+doi_responses = {
+    "https://doi.org/10.5281/zenodo.3232985": ("https://zenodo.org/record/3232985"),
+    "https://doi.org/10.22002/d1.1235": ("https://data.caltech.edu/records/1235"),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
+    ),
+}
 
-def test_content_id():
-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
-        zen = Zenodo()
-        zen.detect("10.5281/zenodo.3232985")
-        assert zen.content_id == "3232985"
+
+def doi_resolver(req, context):
+    resp = doi_responses.get(req.url)
+    # doi responses are redirects
+    if resp is not None:
+        context.status_code = 302
+        context.headers["Location"] = resp
+    return resp
+
+
+def test_content_id(requests_mock):
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+
+    zen = Zenodo()
+    zen.detect("10.5281/zenodo.3232985")
+    assert zen.content_id == "3232985"
 
 
 test_zen = Zenodo()
@@ -43,25 +60,22 @@
 
 @pytest.mark.parametrize("test_input,expected", test_hosts)
-def test_detect_zenodo(test_input, expected):
-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = test_input[0]
-        # valid Zenodo DOIs trigger this content provider
-        assert Zenodo().detect(test_input[0]) == expected
-        assert Zenodo().detect(test_input[1]) == expected
-        assert Zenodo().detect(test_input[2]) == expected
-        # only two of the three calls above have to resolve a DOI
-        assert fake_urlopen.call_count == 2
-
-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        # Don't trigger the Zenodo content provider
-        assert Zenodo().detect("/some/path/here") is None
-        assert Zenodo().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Zenodo
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-        assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
+def test_detect_zenodo(test_input, expected, requests_mock):
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+    # valid Zenodo DOIs trigger this content provider
+    assert Zenodo().detect(test_input[0]) == expected
+    assert Zenodo().detect(test_input[1]) == expected
+    assert Zenodo().detect(test_input[2]) == expected
+    # only two of the three calls above have to resolve a DOI (2 requests per doi resolution)
+    assert requests_mock.call_count == 4
+    requests_mock.reset_mock()
+
+    # Don't trigger the Zenodo content provider
+    assert Zenodo().detect("/some/path/here") is None
+    assert Zenodo().detect("https://example.com/path/here") is None
+
+    # don't handle DOIs that aren't from Zenodo
+    assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
 
 
 @contextmanager
@@ -74,120 +88,102 @@ def zenodo_archive(prefix="a_directory"):
         yield zfile.name
 
 
-def test_fetch_software_from_github_archive():
+def test_fetch_software_from_github_archive(requests_mock):
     # we "fetch" a local ZIP file to simulate a Zenodo record created from a
     # GitHub repository via the Zenodo-GitHub integration
     with zenodo_archive() as zen_path:
-        mock_response = BytesIO(
-            json.dumps(
-                {
-                    "files": [
-                        {
-                            "filename": "some_dir/afake.zip",
-                            "links": {"download": "file://{}".format(zen_path)},
-                        }
-                    ],
-                    "metadata": {"upload_type": "software"},
-                }
-            ).encode("utf-8")
-        )
-
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                return mock_response
-            else:
-                return urlopen(req)
-
-        with patch.object(Zenodo, "urlopen", new=mock_urlopen):
-            zen = Zenodo()
-            spec = {"host": test_zen.hosts[0], "record": "1234"}
-
-            with TemporaryDirectory() as d:
-                output = []
-                for l in zen.fetch(spec, d):
-                    output.append(l)
-
-                unpacked_files = set(os.listdir(d))
-                expected = set(["some-other-file.txt", "some-file.txt"])
-                assert expected == unpacked_files
+        mock_response = {
+            "files": [
+                {
+                    "filename": "some_dir/afake.zip",
+                    "links": {"download": "file://{}".format(zen_path)},
+                }
+            ],
+            "metadata": {"upload_type": "software"},
+        }
+        requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
+        requests_mock.get(
+            "file://{}".format(zen_path), content=open(zen_path, "rb").read()
+        )
+
+        zen = Zenodo()
+        spec = {"host": test_zen.hosts[0], "record": "1234"}
+
+        with TemporaryDirectory() as d:
+            output = []
+            for l in zen.fetch(spec, d):
+                output.append(l)
+
+            unpacked_files = set(os.listdir(d))
+            expected = set(["some-other-file.txt", "some-file.txt"])
+            assert expected == unpacked_files
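Note: the fetch tests on either side of this point serve their local fixture
archives by registering the file:// URL with static content= bytes; requests
alone has no file:// adapter, but the mocked adapter intercepts any scheme. A
sketch using the pytest tmp_path fixture and placeholder bytes:

    import requests

    def test_local_fixture(requests_mock, tmp_path):
        fixture = tmp_path / "fixture.zip"
        fixture.write_bytes(b"PK...")  # placeholder bytes
        # content= takes the raw bytes to use as the mocked response body
        requests_mock.get("file://{}".format(fixture), content=fixture.read_bytes())
        assert requests.get("file://{}".format(fixture)).content.startswith(b"PK")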
"files": [ - { - "filename": "some_dir/afake.zip", - "links": {"download": "file://{}".format(zen_path)}, - } - ], - "metadata": {"upload_type": "software"}, + "filename": "some_dir/afake.zip", + "links": {"download": "file://{}".format(zen_path)}, } - ).encode("utf-8") + ], + "metadata": {"upload_type": "software"}, + } + requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response) + requests_mock.get( + "file://{}".format(zen_path), content=open(zen_path, "rb").read() ) - def mock_urlopen(self, req): - if isinstance(req, Request): - return mock_response - else: - return urlopen(req) + zen = Zenodo() + spec = {"host": test_zen.hosts[0], "record": "1234"} - with patch.object(Zenodo, "urlopen", new=mock_urlopen): - zen = Zenodo() - spec = {"host": test_zen.hosts[0], "record": "1234"} + with TemporaryDirectory() as d: + output = [] + for l in zen.fetch(spec, d): + output.append(l) - with TemporaryDirectory() as d: - output = [] - for l in zen.fetch(spec, d): - output.append(l) - - unpacked_files = set(os.listdir(d)) - expected = set(["some-other-file.txt", "some-file.txt"]) - assert expected == unpacked_files + unpacked_files = set(os.listdir(d)) + expected = set(["some-other-file.txt", "some-file.txt"]) + assert expected == unpacked_files -def test_fetch_software(): +def test_fetch_software(requests_mock): # we "fetch" a local ZIP file to simulate a Zenodo software record with a # ZIP file in it with zenodo_archive() as zen_path: - mock_response = BytesIO( - json.dumps( + mock_response = { + "files": [ { - "files": [ - { - # this is the difference to the GitHub generated one, - # the ZIP file isn't in a directory - "filename": "afake.zip", - "links": {"download": "file://{}".format(zen_path)}, - } - ], - "metadata": {"upload_type": "software"}, + # this is the difference to the GitHub generated one, + # the ZIP file isn't in a directory + "filename": "afake.zip", + "links": {"download": "file://{}".format(zen_path)}, } - ).encode("utf-8") + ], + "metadata": {"upload_type": "software"}, + } + requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response) + requests_mock.get( + "file://{}".format(zen_path), content=open(zen_path, "rb").read() ) - def mock_urlopen(self, req): - if isinstance(req, Request): - return mock_response - else: - return urlopen(req) + with TemporaryDirectory() as d: + zen = Zenodo() + spec = spec = {"host": test_zen.hosts[0], "record": "1234"} + output = [] + for l in zen.fetch(spec, d): + output.append(l) + + unpacked_files = set(os.listdir(d)) + expected = set(["some-other-file.txt", "some-file.txt"]) + assert expected == unpacked_files + + +def test_fetch_data(requests_mock): + # we "fetch" a local ZIP file to simulate a Zenodo data record + with zenodo_archive() as a_zen_path: + with zenodo_archive() as b_zen_path: + mock_response = { + "files": [ + { + "filename": "afake.zip", + "links": {"download": "file://{}".format(a_zen_path)}, + }, + { + "filename": "bfake.zip", + "links": {"download": "file://{}".format(b_zen_path)}, + }, + ], + "metadata": {"upload_type": "data"}, + } + requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response) + requests_mock.get( + "file://{}".format(a_zen_path), content=open(a_zen_path, "rb").read() + ) + requests_mock.get( + "file://{}".format(b_zen_path), content=open(b_zen_path, "rb").read() + ) - with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() - spec = spec = {"host": test_zen.hosts[0], "record": "1234"} + spec = {"host": 
test_zen.hosts[0], "record": "1234"} output = [] for l in zen.fetch(spec, d): output.append(l) unpacked_files = set(os.listdir(d)) - expected = set(["some-other-file.txt", "some-file.txt"]) + # ZIP files shouldn't have been unpacked + expected = {"bfake.zip", "afake.zip"} assert expected == unpacked_files - - -def test_fetch_data(): - # we "fetch" a local ZIP file to simulate a Zenodo data record - with zenodo_archive() as a_zen_path: - with zenodo_archive() as b_zen_path: - mock_response = BytesIO( - json.dumps( - { - "files": [ - { - "filename": "afake.zip", - "links": {"download": "file://{}".format(a_zen_path)}, - }, - { - "filename": "bfake.zip", - "links": {"download": "file://{}".format(b_zen_path)}, - }, - ], - "metadata": {"upload_type": "data"}, - } - ).encode("utf-8") - ) - - def mock_urlopen(self, req): - if isinstance(req, Request): - return mock_response - else: - return urlopen(req) - - with patch.object(Zenodo, "urlopen", new=mock_urlopen): - with TemporaryDirectory() as d: - zen = Zenodo() - spec = {"host": test_zen.hosts[0], "record": "1234"} - output = [] - for l in zen.fetch(spec, d): - output.append(l) - - unpacked_files = set(os.listdir(d)) - # ZIP files shouldn't have been unpacked - expected = {"bfake.zip", "afake.zip"} - assert expected == unpacked_files
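Note: the DOI detect tests above all emulate resolver hops the same way; the
bare technique, extracted from context (a sketch; the DOI and target URL are
placeholders):

    import requests

    def test_redirect_sketch(requests_mock):
        # a callback can set status and headers per request; a 302 plus a
        # Location header reproduces a DOI resolver hop
        def resolver(request, context):
            context.status_code = 302
            context.headers["Location"] = "https://example.org/record/1"
            return ""

        requests_mock.get("https://doi.org/10.0000/placeholder", text=resolver)
        requests_mock.get("https://example.org/record/1", json={"id": 1})

        resp = requests.get("https://doi.org/10.0000/placeholder")
        assert resp.url == "https://example.org/record/1"  # redirect was followed
        assert requests_mock.call_count == 2  # one hop, one final GET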