Detect running in docker container in WACZ enricher

pull/74/head
Logan Williams 2023-05-09 12:12:02 +02:00
rodzic 2f7181ced6
commit 0fae7d96fb
2 zmienionych plików z 39 dodań i 18 usunięć

Wyświetl plik

@ -1,6 +1,8 @@
# stage 1 - all dependencies
FROM webrecorder/browsertrix-crawler:latest
ENV RUNNING_IN_DOCKER=1
WORKDIR /app
# TODO: use custom ffmpeg builds instead of apt-get install
@ -28,7 +30,7 @@ COPY ./src/ .
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
# USER archiver
ENTRYPOINT ["python"]
ENTRYPOINT ["python3"]
# ENTRYPOINT ["docker-entrypoint.sh"]
# should be executed with 2 volumes (3 if local_storage)

Wyświetl plik

@ -26,30 +26,49 @@ class WaczEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> bool:
# TODO: figure out support for browsertrix in docker
url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"generating WACZ for {url=}")
collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)
]
if os.getenv('RUNNING_IN_DOCKER'):
logger.debug(f"generating WACZ without Docker for {url=}")
cmd = [
"crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)
]
else:
logger.debug(f"generating WACZ in Docker for {url=}")
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)
]
if self.profile:
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
shutil.copyfile(self.profile, profile_fn)