Replace urllib by requests in contentproviders

requests is globally simpler to use, and more and more people
are more familiar with the latter than with urllib.
pull/993/head
David Douard 2020-12-10 18:55:14 +01:00
rodzic 560b1d96a0
commit 830a9c89c0
12 zmienionych plików z 427 dodań i 397 usunięć

Wyświetl plik

@ -5,4 +5,4 @@ pytest>=4.6
wheel
pytest-cov
pre-commit
requests
requests_mock

Wyświetl plik

@ -2,7 +2,6 @@ import os
import json
import shutil
from urllib.request import Request
from urllib.parse import urlparse, urlunparse, parse_qs
from .doi import DoiProvider
@ -56,7 +55,6 @@ class Dataverse(DoiProvider):
return
query_args = parse_qs(parsed_url.query)
# Corner case handling
if parsed_url.path.startswith("/file.xhtml"):
# There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@ -75,8 +73,7 @@ class Dataverse(DoiProvider):
parsed_url._replace(path="/api/search", query=search_query)
)
self.log.debug("Querying Dataverse: " + search_url)
resp = self.urlopen(search_url).read()
data = json.loads(resp.decode("utf-8"))["data"]
data = self.urlopen(search_url).json()
if data["count_in_response"] != 1:
self.log.debug(
"Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@ -101,14 +98,12 @@ class Dataverse(DoiProvider):
host = spec["host"]
yield "Fetching Dataverse record {}.\n".format(record_id)
req = Request(
"{}/api/datasets/:persistentId?persistentId={}".format(
host["url"], record_id
),
headers={"accept": "application/json"},
url = "{}/api/datasets/:persistentId?persistentId={}".format(
host["url"], record_id
)
resp = self.urlopen(req)
record = json.loads(resp.read().decode("utf-8"))["data"]
resp = self.urlopen(url, headers={"accept": "application/json"})
record = resp.json()
for fobj in deep_get(record, "latestVersion.files"):
file_url = "{}/api/access/datafile/{}".format(

Wyświetl plik

@ -5,8 +5,8 @@ import logging
from os import makedirs
from os import path
from urllib import request # urlopen, Request
from urllib.error import HTTPError
from requests import Session, HTTPError
from zipfile import ZipFile, is_zipfile
from .base import ContentProvider
@ -18,7 +18,21 @@ from .. import __version__
class DoiProvider(ContentProvider):
"""Provide contents of a repository identified by a DOI and some helper functions."""
def urlopen(self, req, headers=None):
def __init__(self):
super().__init__()
self.session = Session()
self.session.headers.update(
{
"user-agent": "repo2docker {}".format(__version__),
}
)
def _request(self, url, **kwargs):
return self.session.get(url, **kwargs)
urlopen = _request
def _urlopen(self, req, headers=None):
"""A urlopen() helper"""
# someone passed a string, not a request
if not isinstance(req, request.Request):
@ -38,7 +52,8 @@ class DoiProvider(ContentProvider):
doi = normalize_doi(doi)
try:
resp = self.urlopen("https://doi.org/{}".format(doi))
resp = self._request("https://doi.org/{}".format(doi))
resp.raise_for_status()
# If the DOI doesn't resolve, just return URL
except HTTPError:
return doi
@ -53,38 +68,42 @@ class DoiProvider(ContentProvider):
file_url = deep_get(file_ref, host["download"])
fname = deep_get(file_ref, host["filename"])
logging.debug("Downloading file {} as {}\n".format(file_url, fname))
with self.urlopen(file_url) as src:
yield "Requesting {}\n".format(file_url)
resp = self._request(file_url, stream=True)
resp.raise_for_status()
if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname))
if not path.exists(sub_dir):
yield "Creating {}\n".format(sub_dir)
makedirs(sub_dir, exist_ok=True)
dst_fname = path.join(output_dir, fname)
with open(dst_fname, "wb") as dst:
yield "Fetching {}\n".format(fname)
for chunk in resp.iter_content(chunk_size=None):
dst.write(chunk)
if unzip and is_zipfile(dst_fname):
yield "Extracting {}\n".format(fname)
zfile = ZipFile(dst_fname)
zfile.extractall(path=output_dir)
zfile.close()
# delete downloaded file ...
os.remove(dst_fname)
# ... and any directories we might have created,
# in which case sub_dir will be defined
if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname))
if not path.exists(sub_dir):
yield "Creating {}\n".format(sub_dir)
makedirs(sub_dir, exist_ok=True)
shutil.rmtree(sub_dir)
dst_fname = path.join(output_dir, fname)
with open(dst_fname, "wb") as dst:
yield "Fetching {}\n".format(fname)
shutil.copyfileobj(src, dst)
# first close the newly written file, then continue
# processing it
if unzip and is_zipfile(dst_fname):
yield "Extracting {}\n".format(fname)
zfile = ZipFile(dst_fname)
zfile.extractall(path=output_dir)
zfile.close()
new_subdirs = os.listdir(output_dir)
# if there is only one new subdirectory move its contents
# to the top level directory
if len(new_subdirs) == 1:
d = new_subdirs[0]
copytree(path.join(output_dir, d), output_dir)
shutil.rmtree(path.join(output_dir, d))
# delete downloaded file ...
os.remove(dst_fname)
# ... and any directories we might have created,
# in which case sub_dir will be defined
if path.dirname(fname):
shutil.rmtree(sub_dir)
new_subdirs = os.listdir(output_dir)
# if there is only one new subdirectory move its contents
# to the top level directory
if len(new_subdirs) == 1:
d = new_subdirs[0]
copytree(path.join(output_dir, d), output_dir)
shutil.rmtree(path.join(output_dir, d))
yield "Fetched files: {}\n".format(os.listdir(output_dir))
yield "Fetched files: {}\n".format(os.listdir(output_dir))

Wyświetl plik

@ -25,6 +25,7 @@ class Figshare(DoiProvider):
"""
def __init__(self):
super().__init__()
self.hosts = [
{
"hostname": [
@ -74,13 +75,12 @@ class Figshare(DoiProvider):
yield "Fetching Figshare article {} in version {}.\n".format(
article_id, article_version
)
req = Request(
resp = self.urlopen(
"{}{}/versions/{}".format(host["api"], article_id, article_version),
headers={"accept": "application/json"},
)
resp = self.urlopen(req)
article = json.loads(resp.read().decode("utf-8"))
article = resp.json()
files = deep_get(article, host["filepath"])
# only fetch files where is_link_only: False

Wyświetl plik

@ -16,9 +16,7 @@ class Hydroshare(DoiProvider):
def _fetch_version(self, host):
"""Fetch resource modified date and convert to epoch"""
json_response = json.loads(
self.urlopen(host["version"].format(self.resource_id)).read()
)
json_response = self.urlopen(host["version"].format(self.resource_id)).json()
date = next(
item for item in json_response["dates"] if item["type"] == "modified"
)["start_date"]

Wyświetl plik

@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
"""Provide contents of a Zenodo deposit."""
def __init__(self):
super().__init__()
# We need the hostname (url where records are), api url (for metadata),
# filepath (path to files in metadata), filename (path to filename in
# metadata), download (path to file download URL), and type (path to item type in metadata)
@ -55,13 +56,12 @@ class Zenodo(DoiProvider):
host = spec["host"]
yield "Fetching Zenodo record {}.\n".format(record_id)
req = Request(
resp = self.urlopen(
"{}{}".format(host["api"], record_id),
headers={"accept": "application/json"},
)
resp = self.urlopen(req)
record = json.loads(resp.read().decode("utf-8"))
record = resp.json()
is_software = deep_get(record, host["type"]).lower() == "software"
files = deep_get(record, host["filepath"])

Wyświetl plik

@ -52,6 +52,7 @@ setup(
"python-json-logger",
"escapism",
"jinja2",
"requests",
"ruamel.yaml>=0.15",
"toml",
"semver",

Wyświetl plik

@ -1,6 +1,7 @@
import json
import os
import pytest
import re
from io import BytesIO
from tempfile import TemporaryDirectory
@ -19,7 +20,7 @@ test_hosts = [
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
"10.7910/DVN/6ZXAGT",
"https://dataverse.harvard.edu/api/access/datafile/3323458",
"hdl:11529/10016",
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
],
[
{"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
@ -27,57 +28,68 @@ test_hosts = [
],
)
]
test_responses = {
"doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
doi_responses = {
"https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
"https://dataverse.harvard.edu/file.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
),
"doi:10.7910/DVN/6ZXAGT": (
"https://doi.org/10.7910/DVN/6ZXAGT": (
"https://dataverse.harvard.edu/dataset.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT"
),
"10.7910/DVN/6ZXAGT": (
"https://dataverse.harvard.edu/dataset.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT"
"https://dataverse.harvard.edu/api/access/datafile/3323458": (
"https://dataverse.harvard.edu/api/access/datafile/3323458"
),
"https://doi.org/10.21105/joss.01277": (
"https://joss.theoj.org/papers/10.21105/joss.01277"
),
"https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
"hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
}
# Canned Dataverse search-API payload: exactly one matching file whose
# entry points back at its parent dataset's persistent DOI.
test_search = {
    "data": {
        "count_in_response": 1,
        "items": [
            {"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"},
        ],
    }
}
@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected):
def doi_resolver(url):
    """Map a test DOI/URL to its canned resolution target (None if unknown)."""
    resolved = test_responses.get(url)
    return resolved
def test_detect_dataverse(test_input, expected, requests_mock):
def doi_resolver(req, context):
    """requests_mock callback emulating doi.org: known URLs answer with a 302 redirect."""
    target = doi_responses.get(req.url)
    if target is None:
        # Unknown DOI: no redirect, body stays None.
        return None
    context.status_code = 302
    context.headers["Location"] = target
    return target
with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
Dataverse, "doi2url", side_effect=doi_resolver
) as fake_doi2url:
fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
# valid Dataverse DOIs trigger this content provider
assert Dataverse().detect(test_input[0]) == expected[0]
assert fake_doi2url.call_count == 2 # File, then dataset
assert Dataverse().detect(test_input[1]) == expected[0]
assert Dataverse().detect(test_input[2]) == expected[0]
# only two of the three calls above have to resolve a DOI
assert fake_urlopen.call_count == 1
assert Dataverse().detect(test_input[3]) == expected[1]
requests_mock.get(re.compile("https://"), json=doi_resolver)
requests_mock.get(
"https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
json={
"count_in_response": 1,
"items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
},
)
with patch.object(Dataverse, "urlopen") as fake_urlopen:
# Don't trigger the Dataverse content provider
assert Dataverse().detect("/some/path/here") is None
assert Dataverse().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Dataverse
fake_urlopen.return_value.url = (
"http://joss.theoj.org/papers/10.21105/joss.01277"
)
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
assert requests_mock.call_count == 0
# valid Dataverse DOIs trigger this content provider
assert Dataverse().detect(test_input[0]) == expected[0]
# 4: doi resolution (302), File, doi resolution (302), then dataset
assert requests_mock.call_count == 4
requests_mock.reset_mock()
assert Dataverse().detect(test_input[1]) == expected[0]
# 2: doi (302), dataset
assert requests_mock.call_count == 2
requests_mock.reset_mock()
assert Dataverse().detect(test_input[2]) == expected[0]
# 1: datafile (search dataverse for the file id)
assert requests_mock.call_count == 1
requests_mock.reset_mock()
assert Dataverse().detect(test_input[3]) == expected[1]
requests_mock.reset_mock()
# Don't trigger the Dataverse content provider
assert Dataverse().detect("/some/path/here") is None
assert Dataverse().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Dataverse
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
@pytest.fixture
@ -95,50 +107,50 @@ def dv_files(tmpdir):
return [f1, f2, f3]
def test_dataverse_fetch(dv_files):
mock_response_ds_query = BytesIO(
json.dumps(
{
"data": {
"latestVersion": {
"files": [
{"dataFile": {"id": 1}, "label": "some-file.txt"},
{
"dataFile": {"id": 2},
"label": "some-other-file.txt",
"directoryLabel": "directory",
},
{
"dataFile": {"id": 3},
"label": "the-other-file.txt",
"directoryLabel": "directory/subdirectory",
},
]
}
}
}
).encode("utf-8")
)
def test_dataverse_fetch(dv_files, requests_mock):
mock_response = {
"latestVersion": {
"files": [
{"dataFile": {"id": 1}, "label": "some-file.txt"},
{
"dataFile": {"id": 2},
"label": "some-other-file.txt",
"directoryLabel": "directory",
},
{
"dataFile": {"id": 3},
"label": "the-other-file.txt",
"directoryLabel": "directory/subdirectory",
},
]
}
}
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
def mock_filecontent(req, context):
    """requests_mock callback serving dv_files[i - 1] for a datafile URL ending in /i."""
    file_no = int(req.url.split("/")[-1]) - 1
    # Use a context manager so the file handle is closed deterministically;
    # the original leaked the handle returned by open().
    with open(dv_files[file_no], "rb") as f:
        return f.read()
requests_mock.get(
"https://dataverse.harvard.edu/api/datasets/"
":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
json=mock_response,
)
requests_mock.get(
re.compile("https://dataverse.harvard.edu/api/access/datafile"),
content=mock_filecontent,
)
dv = Dataverse()
def mock_urlopen(self, req):
if isinstance(req, Request):
return mock_response_ds_query
else:
file_no = int(req.split("/")[-1]) - 1
return urlopen("file://{}".format(dv_files[file_no]))
with patch.object(Dataverse, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
output = []
for l in dv.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["directory", "some-file.txt"])
assert expected == unpacked_files
assert os.path.isfile(
os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
)
with TemporaryDirectory() as d:
output = []
for l in dv.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["directory", "some-file.txt"])
assert expected == unpacked_files
assert os.path.isfile(
os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
)

Wyświetl plik

@ -11,6 +11,7 @@ from zipfile import ZipFile
from repo2docker.contentproviders.doi import DoiProvider
from repo2docker.contentproviders.base import ContentProviderException
from repo2docker import __version__
def test_content_id():
@ -18,20 +19,15 @@ def test_content_id():
assert doi.content_id is None
def fake_urlopen(req):
print(req)
return req.headers
@patch("urllib.request.urlopen", fake_urlopen)
def test_url_headers():
def test_url_headers(requests_mock):
requests_mock.get("https://mybinder.org", text="resp")
doi = DoiProvider()
headers = {"test1": "value1", "Test2": "value2"}
result = doi.urlopen("https://mybinder.org", headers=headers)
assert "Test1" in result
assert "Test2" in result
assert len(result) is 3 # User-agent is also set
assert "test1" in result.request.headers
assert "Test2" in result.request.headers
assert result.request.headers["User-Agent"] == "repo2docker {}".format(__version__)
def test_unresolving_doi():

Wyświetl plik

@ -22,12 +22,17 @@ test_content_ids = [
@pytest.mark.parametrize("link,expected", test_content_ids)
def test_content_id(link, expected):
with patch.object(Figshare, "urlopen") as fake_urlopen:
fake_urlopen.return_value.url = link
fig = Figshare()
fig.detect("10.6084/m9.figshare.9782777")
assert fig.content_id == expected
def test_content_id(link, expected, requests_mock):
def mocked_get(req, context):
if req.url.startswith("https://doi.org"):
context.status_code = 302
context.headers["Location"] = link
return link
requests_mock.get(re.compile("https://"), text=mocked_get)
fig = Figshare()
fig.detect("10.6084/m9.figshare.9782777")
assert fig.content_id == expected
test_fig = Figshare()
@ -103,76 +108,73 @@ def figshare_archive(prefix="a_directory"):
yield zfile.name
def test_fetch_zip():
def test_fetch_zip(requests_mock):
# see test_zenodo.py/test_fetch_software
with figshare_archive() as fig_path:
mock_response = BytesIO(
json.dumps(
mock_response = {
"files": [
{
"files": [
{
"name": "afake.zip",
"is_link_only": False,
"download_url": "file://{}".format(fig_path),
}
]
"name": "afake.zip",
"is_link_only": False,
"download_url": "file://{}".format(fig_path),
}
).encode("utf-8")
]
}
requests_mock.get(
"https://api.figshare.com/v2/articles/123456/versions/42",
json=mock_response,
)
requests_mock.get(
"file://{}".format(fig_path), content=open(fig_path, "rb").read()
)
def mock_urlopen(self, req):
if isinstance(req, Request):
return mock_response
else:
return urlopen(req)
# with patch.object(Figshare, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
output = []
for l in test_fig.fetch(test_spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["some-other-file.txt", "some-file.txt"])
assert expected == unpacked_files
def test_fetch_data(requests_mock):
with figshare_archive() as a_path:
with figshare_archive() as b_path:
mock_response = {
"files": [
{
"name": "afake.file",
"download_url": "file://{}".format(a_path),
"is_link_only": False,
},
{
"name": "bfake.data",
"download_url": "file://{}".format(b_path),
"is_link_only": False,
},
{"name": "cfake.link", "is_link_only": True},
]
}
requests_mock.get(
"https://api.figshare.com/v2/articles/123456/versions/42",
json=mock_response,
)
requests_mock.get(
"file://{}".format(a_path), content=open(a_path, "rb").read()
)
requests_mock.get(
"file://{}".format(b_path), content=open(b_path, "rb").read()
)
with patch.object(Figshare, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
output = []
for l in test_fig.fetch(test_spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["some-other-file.txt", "some-file.txt"])
# ZIP files shouldn't have been unpacked
expected = {"bfake.data", "afake.file"}
assert expected == unpacked_files
def test_fetch_data():
with figshare_archive() as a_path:
with figshare_archive() as b_path:
mock_response = BytesIO(
json.dumps(
{
"files": [
{
"name": "afake.file",
"download_url": "file://{}".format(a_path),
"is_link_only": False,
},
{
"name": "bfake.data",
"download_url": "file://{}".format(b_path),
"is_link_only": False,
},
{"name": "cfake.link", "is_link_only": True},
]
}
).encode("utf-8")
)
def mock_urlopen(self, req):
if isinstance(req, Request):
return mock_response
else:
return urlopen(req)
with patch.object(Figshare, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
output = []
for l in test_fig.fetch(test_spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
# ZIP files shouldn't have been unpacked
expected = {"bfake.data", "afake.file"}
assert expected == unpacked_files

Wyświetl plik

@ -5,87 +5,98 @@ from contextlib import contextmanager
from tempfile import TemporaryDirectory, NamedTemporaryFile
from unittest.mock import patch
from zipfile import ZipFile
import re
from repo2docker.contentproviders import Hydroshare
from repo2docker.contentproviders.base import ContentProviderException
def test_content_id():
with patch.object(Hydroshare, "urlopen") as fake_urlopen:
fake_urlopen.return_value.url = (
# doi.org URLs used in the Hydroshare tests, mapped to where they redirect.
doi_responses = {
    "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": (
        "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
    ),
    "https://doi.org/10.21105/joss.01277": (
        "https://joss.theoj.org/papers/10.21105/joss.01277"
    ),
}


def doi_resolver(req, context):
    """requests_mock callback mimicking doi.org: known DOIs get a 302 redirect."""
    target = doi_responses.get(req.url)
    if target is None:
        # Unknown DOI: fall through with no redirect and a None body.
        return None
    context.status_code = 302
    context.headers["Location"] = target
    return target


# Minimal Hydroshare metadata payload carrying only the "modified" date.
hydroshare_data = {
    "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]
}
def test_content_id(requests_mock):
requests_mock.get(re.compile("https://"), json=hydroshare_data)
requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
hydro = Hydroshare()
hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
def test_detect_hydroshare(requests_mock):
requests_mock.get(re.compile("https://"), json=hydroshare_data)
requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
# valid Hydroshare DOIs trigger this content provider
expected = {
"host": {
"hostname": [
"https://www.hydroshare.org/resource/",
"http://www.hydroshare.org/resource/",
],
"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
},
"resource": "b8f6eae9d89241cf8b5904033460af61",
"version": "1569427757",
}
assert (
Hydroshare().detect(
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
)
== expected
)
# assert a call to urlopen was called to fetch version
assert requests_mock.call_count == 1
requests_mock.reset_mock()
def read():
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
assert (
Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
)
# assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
assert requests_mock.call_count == 3
requests_mock.reset_mock()
fake_urlopen.return_value.read = read
hydro = Hydroshare()
hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
def test_detect_hydroshare():
with patch.object(Hydroshare, "urlopen") as fake_urlopen:
fake_urlopen.return_value.url = (
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
assert (
Hydroshare().detect(
"https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
)
== expected
)
# assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
assert requests_mock.call_count == 3
requests_mock.reset_mock()
def read():
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
# Don't trigger the Hydroshare content provider
assert Hydroshare().detect("/some/path/here") is None
assert Hydroshare().detect("https://example.com/path/here") is None
fake_urlopen.return_value.read = read
# valid Hydroshare DOIs trigger this content provider
expected = {
"host": {
"hostname": [
"https://www.hydroshare.org/resource/",
"http://www.hydroshare.org/resource/",
],
"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
},
"resource": "b8f6eae9d89241cf8b5904033460af61",
"version": "1569427757",
}
assert (
Hydroshare().detect(
"https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
)
== expected
)
# assert a call to urlopen was called to fetch version
assert fake_urlopen.call_count == 1
assert (
Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
== expected
)
# assert 2 more calls were made, one to resolve the DOI and another to fetch the version
assert fake_urlopen.call_count == 3
assert (
Hydroshare().detect(
"https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
)
== expected
)
# assert 2 more calls were made, one to resolve the DOI and another to fetch the version
assert fake_urlopen.call_count == 5
with patch.object(Hydroshare, "urlopen") as fake_urlopen:
# Don't trigger the Hydroshare content provider
assert Hydroshare().detect("/some/path/here") is None
assert Hydroshare().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Hydroshare
fake_urlopen.return_value.url = (
"http://joss.theoj.org/papers/10.21105/joss.01277"
)
def read():
return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
fake_urlopen.return_value.read = read
assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
# don't handle DOIs that aren't from Hydroshare
assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
@contextmanager

Wyświetl plik

@ -1,6 +1,7 @@
import json
import os
import pytest
import re
from contextlib import contextmanager
from io import BytesIO
@ -11,14 +12,30 @@ from zipfile import ZipFile
from repo2docker.contentproviders import Zenodo
# doi.org URLs used in the Zenodo tests, mapped to their redirect targets.
doi_responses = {
    "https://doi.org/10.5281/zenodo.3232985": "https://zenodo.org/record/3232985",
    "https://doi.org/10.22002/d1.1235": "https://data.caltech.edu/records/1235",
    "https://doi.org/10.21105/joss.01277": (
        "https://joss.theoj.org/papers/10.21105/joss.01277"
    ),
}
def test_content_id():
with patch.object(Zenodo, "urlopen") as fake_urlopen:
fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
zen = Zenodo()
zen.detect("10.5281/zenodo.3232985")
assert zen.content_id == "3232985"
def doi_resolver(req, context):
    """requests_mock callback: resolve known doi.org URLs via a 302 redirect."""
    location = doi_responses.get(req.url)
    # DOI responses are redirects; unknown URLs fall through untouched.
    if location is not None:
        context.status_code = 302
        context.headers["Location"] = location
    return location
def test_content_id(requests_mock):
requests_mock.get(re.compile("https://"), json=doi_resolver)
zen = Zenodo()
zen.detect("10.5281/zenodo.3232985")
assert zen.content_id == "3232985"
test_zen = Zenodo()
@ -43,25 +60,22 @@ test_hosts = [
@pytest.mark.parametrize("test_input,expected", test_hosts)
def test_detect_zenodo(test_input, expected):
with patch.object(Zenodo, "urlopen") as fake_urlopen:
fake_urlopen.return_value.url = test_input[0]
# valid Zenodo DOIs trigger this content provider
assert Zenodo().detect(test_input[0]) == expected
assert Zenodo().detect(test_input[1]) == expected
assert Zenodo().detect(test_input[2]) == expected
# only two of the three calls above have to resolve a DOI
assert fake_urlopen.call_count == 2
def test_detect_zenodo(test_input, expected, requests_mock):
requests_mock.get(re.compile("https://"), json=doi_resolver)
# valid Zenodo DOIs trigger this content provider
assert Zenodo().detect(test_input[0]) == expected
assert Zenodo().detect(test_input[1]) == expected
assert Zenodo().detect(test_input[2]) == expected
# only two of the three calls above have to resolve a DOI (2 req per doi resolution)
assert requests_mock.call_count == 4
requests_mock.reset_mock()
with patch.object(Zenodo, "urlopen") as fake_urlopen:
# Don't trigger the Zenodo content provider
assert Zenodo().detect("/some/path/here") is None
assert Zenodo().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Zenodo
fake_urlopen.return_value.url = (
"http://joss.theoj.org/papers/10.21105/joss.01277"
)
assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
# Don't trigger the Zenodo content provider
assert Zenodo().detect("/some/path/here") is None
assert Zenodo().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Zenodo
assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
@contextmanager
@ -74,120 +88,102 @@ def zenodo_archive(prefix="a_directory"):
yield zfile.name
def test_fetch_software_from_github_archive():
def test_fetch_software_from_github_archive(requests_mock):
# we "fetch" a local ZIP file to simulate a Zenodo record created from a
# GitHub repository via the Zenodo-GitHub integration
with zenodo_archive() as zen_path:
mock_response = BytesIO(
json.dumps(
mock_response = {
"files": [
{
"files": [
{
"filename": "some_dir/afake.zip",
"links": {"download": "file://{}".format(zen_path)},
}
],
"metadata": {"upload_type": "software"},
"filename": "some_dir/afake.zip",
"links": {"download": "file://{}".format(zen_path)},
}
).encode("utf-8")
],
"metadata": {"upload_type": "software"},
}
requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
requests_mock.get(
"file://{}".format(zen_path), content=open(zen_path, "rb").read()
)
def mock_urlopen(self, req):
if isinstance(req, Request):
return mock_response
else:
return urlopen(req)
zen = Zenodo()
spec = {"host": test_zen.hosts[0], "record": "1234"}
with patch.object(Zenodo, "urlopen", new=mock_urlopen):
zen = Zenodo()
spec = {"host": test_zen.hosts[0], "record": "1234"}
with TemporaryDirectory() as d:
output = []
for l in zen.fetch(spec, d):
output.append(l)
with TemporaryDirectory() as d:
output = []
for l in zen.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["some-other-file.txt", "some-file.txt"])
assert expected == unpacked_files
unpacked_files = set(os.listdir(d))
expected = set(["some-other-file.txt", "some-file.txt"])
assert expected == unpacked_files
def test_fetch_software():
def test_fetch_software(requests_mock):
# we "fetch" a local ZIP file to simulate a Zenodo software record with a
# ZIP file in it
with zenodo_archive() as zen_path:
mock_response = BytesIO(
json.dumps(
mock_response = {
"files": [
{
"files": [
{
# this is the difference to the GitHub generated one,
# the ZIP file isn't in a directory
"filename": "afake.zip",
"links": {"download": "file://{}".format(zen_path)},
}
],
"metadata": {"upload_type": "software"},
# this is the difference to the GitHub generated one,
# the ZIP file isn't in a directory
"filename": "afake.zip",
"links": {"download": "file://{}".format(zen_path)},
}
).encode("utf-8")
],
"metadata": {"upload_type": "software"},
}
requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
requests_mock.get(
"file://{}".format(zen_path), content=open(zen_path, "rb").read()
)
def mock_urlopen(self, req):
if isinstance(req, Request):
return mock_response
else:
return urlopen(req)
with TemporaryDirectory() as d:
zen = Zenodo()
spec = spec = {"host": test_zen.hosts[0], "record": "1234"}
output = []
for l in zen.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["some-other-file.txt", "some-file.txt"])
assert expected == unpacked_files
def test_fetch_data(requests_mock):
# we "fetch" a local ZIP file to simulate a Zenodo data record
with zenodo_archive() as a_zen_path:
with zenodo_archive() as b_zen_path:
mock_response = {
"files": [
{
"filename": "afake.zip",
"links": {"download": "file://{}".format(a_zen_path)},
},
{
"filename": "bfake.zip",
"links": {"download": "file://{}".format(b_zen_path)},
},
],
"metadata": {"upload_type": "data"},
}
requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
requests_mock.get(
"file://{}".format(a_zen_path), content=open(a_zen_path, "rb").read()
)
requests_mock.get(
"file://{}".format(b_zen_path), content=open(b_zen_path, "rb").read()
)
with patch.object(Zenodo, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
zen = Zenodo()
spec = spec = {"host": test_zen.hosts[0], "record": "1234"}
spec = {"host": test_zen.hosts[0], "record": "1234"}
output = []
for l in zen.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = set(["some-other-file.txt", "some-file.txt"])
# ZIP files shouldn't have been unpacked
expected = {"bfake.zip", "afake.zip"}
assert expected == unpacked_files
def test_fetch_data():
# we "fetch" a local ZIP file to simulate a Zenodo data record
with zenodo_archive() as a_zen_path:
with zenodo_archive() as b_zen_path:
mock_response = BytesIO(
json.dumps(
{
"files": [
{
"filename": "afake.zip",
"links": {"download": "file://{}".format(a_zen_path)},
},
{
"filename": "bfake.zip",
"links": {"download": "file://{}".format(b_zen_path)},
},
],
"metadata": {"upload_type": "data"},
}
).encode("utf-8")
)
def mock_urlopen(self, req):
if isinstance(req, Request):
return mock_response
else:
return urlopen(req)
with patch.object(Zenodo, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
zen = Zenodo()
spec = {"host": test_zen.hosts[0], "record": "1234"}
output = []
for l in zen.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
# ZIP files shouldn't have been unpacked
expected = {"bfake.zip", "afake.zip"}
assert expected == unpacked_files