kopia lustrzana https://github.com/bellingcat/auto-archiver
browsertrix docker updates
rodzic
f4827770e6
commit
9c7824de57
|
@@ -1,4 +1,4 @@
|
||||||
FROM webrecorder/browsertrix-crawler:latest
|
FROM webrecorder/browsertrix-crawler:1.0.4
|
||||||
|
|
||||||
ENV RUNNING_IN_DOCKER=1
|
ENV RUNNING_IN_DOCKER=1
|
||||||
|
|
||||||
|
@@ -19,9 +19,8 @@ RUN pip install --upgrade pip && \
|
||||||
|
|
||||||
COPY Pipfile* ./
|
COPY Pipfile* ./
|
||||||
# install from pipenv, with browsertrix-only requirements
|
# install from pipenv, with browsertrix-only requirements
|
||||||
RUN pipenv install && \
|
RUN pipenv install
|
||||||
pipenv install pywb uwsgi
|
|
||||||
|
|
||||||
# doing this at the end helps during development, builds are quick
|
# doing this at the end helps during development, builds are quick
|
||||||
COPY ./src/ .
|
COPY ./src/ .
|
||||||
|
|
||||||
|
|
|
@@ -75,14 +75,16 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
"--url", url,
|
"--url", url,
|
||||||
"--scopeType", "page",
|
"--scopeType", "page",
|
||||||
"--generateWACZ",
|
"--generateWACZ",
|
||||||
"--text",
|
"--text", "to-pages",
|
||||||
"--screenshot", "fullPage",
|
"--screenshot", "fullPage",
|
||||||
"--collection", collection,
|
"--collection", collection,
|
||||||
"--id", collection,
|
"--id", collection,
|
||||||
"--saveState", "never",
|
"--saveState", "never",
|
||||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||||
"--behaviorTimeout", str(self.timeout),
|
"--behaviorTimeout", str(self.timeout),
|
||||||
"--timeout", str(self.timeout)]
|
"--timeout", str(self.timeout),
|
||||||
|
"--blockAds" # TODO: test
|
||||||
|
]
|
||||||
|
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
cmd.extend(["--cwd", self.cwd_dind])
|
cmd.extend(["--cwd", self.cwd_dind])
|
||||||
|
@@ -110,9 +112,9 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||||
|
my_env = os.environ.copy()
|
||||||
if self.socks_proxy_host and self.socks_proxy_port:
|
if self.socks_proxy_host and self.socks_proxy_port:
|
||||||
logger.debug("Using SOCKS proxy for browsertrix-crawler")
|
logger.debug("Using SOCKS proxy for browsertrix-crawler")
|
||||||
my_env = os.environ.copy()
|
|
||||||
my_env["SOCKS_HOST"] = self.socks_proxy_host
|
my_env["SOCKS_HOST"] = self.socks_proxy_host
|
||||||
my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
|
my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
|
||||||
subprocess.run(cmd, check=True, env=my_env)
|
subprocess.run(cmd, check=True, env=my_env)
|
||||||
|
@@ -161,7 +163,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
"""
|
"""
|
||||||
Receives a .wacz archive, and extracts all relevant media from it, adding them to to_enrich.
|
Receives a .wacz archive, and extracts all relevant media from it, adding them to to_enrich.
|
||||||
"""
|
"""
|
||||||
logger.info(f"WACZ extract_media flag is set, extracting media from {wacz_filename=}")
|
logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
|
||||||
|
|
||||||
# unzipping the .wacz
|
# unzipping the .wacz
|
||||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||||
|
@@ -182,10 +184,11 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
# get media out of .warc
|
# get media out of .warc
|
||||||
counter = 0
|
counter = 0
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
|
import json
|
||||||
with open(warc_filename, 'rb') as warc_stream:
|
with open(warc_filename, 'rb') as warc_stream:
|
||||||
for record in ArchiveIterator(warc_stream):
|
for record in ArchiveIterator(warc_stream):
|
||||||
# only include fetched resources
|
# only include fetched resources
|
||||||
if record.rec_type == "resource" and self.extract_screenshot: # screenshots
|
if record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot: # screenshots
|
||||||
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
|
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
|
||||||
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
||||||
m = Media(filename=fn)
|
m = Media(filename=fn)
|
||||||
|
@@ -231,4 +234,4 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
to_enrich.add_media(m, warc_fn)
|
to_enrich.add_media(m, warc_fn)
|
||||||
counter += 1
|
counter += 1
|
||||||
seen_urls.add(record_url)
|
seen_urls.add(record_url)
|
||||||
logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
|
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
|
||||||
|
|
Ładowanie…
Reference in New Issue