pull/72/head
msramalho 2023-01-21 19:01:02 +00:00
parent ea2c266fa2
commit 753039240f
73 changed files with 404 additions and 689 deletions

4
.gitignore vendored
View file

@ -24,4 +24,6 @@ browsertrix/*
browsertrix-tmp/*
instaloader/*
instaloader.session
orchestration.yaml
orchestration.yaml
auto_archiver.egg-info*
logs*

View file

@ -29,9 +29,11 @@ instaloader = "*"
tqdm = "*"
jinja2 = "*"
cryptography = "==38.0.4"
dataclasses-json = "*"
[requires]
python_version = "3.9"
[dev-packages]
autopep8 = "*"
setuptools-pipfile = "*"

189
Pipfile.lock generated
View file

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "bcc36e9ecdf6d383a1010629484eec271699ac23b40be045d9a9669b4c9fac8c"
"sha256": "e2f5d017d9bc9eef90cced189b6e3017d740c35d204962479417109a4deeb7f4"
},
"pipfile-spec": 6,
"requires": {
@ -57,19 +57,19 @@
},
"boto3": {
"hashes": [
"sha256:96055651f7be882175aa334ad46528e1ad79fb8ca33fa9c3998cc1d985b34eab",
"sha256:e24d65c31780c208768ebcd152d8a0181591c9c8e7d971e23f318d7f41910ba1"
"sha256:4e876ba5d64928cde0c416dd844f04f22d6b73d14002bbc3ca55591f80f49927",
"sha256:c729bb0af76e85a2776b6bd3da8d9fa0f4b91b425eab51612aa53956f644ee23"
],
"index": "pypi",
"version": "==1.26.46"
"version": "==1.26.54"
},
"botocore": {
"hashes": [
"sha256:78bf25933e35eb6354a9e80fe156f86dce4d346a92afe364dfce25c17ab0639f",
"sha256:dbac2fde265f13beb9191ec3ff63b90b515e9ed63875edc3afbd72c5f585e48b"
"sha256:ca3ef7588daa664fe196d3234718db5f6b5dab961507500b4bb921e31133eea1",
"sha256:f2fe17ed6b8e163769a715f81cb6ce3d4628d172918de535256bdf34d29b704f"
],
"markers": "python_version >= '3.7'",
"version": "==1.29.46"
"version": "==1.29.54"
},
"brotli": {
"hashes": [
@ -269,17 +269,10 @@
},
"cloudscraper": {
"hashes": [
"sha256:2776c70f3661c028e59fd306ac2b104882c9b3cb3f798086251e00fc2d72c3a2",
"sha256:3b9753724616ac4d811e7922ddc9dba9b4419749ebaa35b0ba503d442522df2e"
"sha256:401409859697edae9384a7623b450cc97ab14dd0b2c8cdcac62edc2d50b31741",
"sha256:4d02aceffa90abd4dabc75b79bafa31636309baa7c0f2ee665e2d345aadb8863"
],
"version": "==1.2.67"
},
"commonmark": {
"hashes": [
"sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60",
"sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"
],
"version": "==0.9.1"
"version": "==1.2.68"
},
"cryptography": {
"hashes": [
@ -318,16 +311,16 @@
"sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd",
"sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90"
],
"markers": "python_version >= '3.6'",
"index": "pypi",
"version": "==0.5.7"
},
"dateparser": {
"hashes": [
"sha256:107f3cc87a60770e10d111349adc1504224a6b60753a47a64b0ec842ab85b5a9",
"sha256:ceb159f1b4a9df54ed6209e91298097deafde476037f8611b4cb2b1cb8b31c58"
"sha256:c47b6e4b8c4b2b2a21690111b6571b6991295ba327ec6503753abeebf5e80696",
"sha256:e703db1815270c020552f4b3e3a981937b48b2cbcfcef5347071b74788dd9214"
],
"index": "pypi",
"version": "==1.1.5"
"version": "==1.1.6"
},
"exceptiongroup": {
"hashes": [
@ -363,10 +356,10 @@
},
"future": {
"hashes": [
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
"sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.2"
"version": "==0.18.3"
},
"google-api-core": {
"hashes": [
@ -378,11 +371,11 @@
},
"google-api-python-client": {
"hashes": [
"sha256:9412ad3445518fa9d24d02c673a70b07c9d124990f44763cdf4f5304ca5b4d08",
"sha256:a4ea351db2bb2a9b1a7e96d8fa8de0fcbc31d9e237b724f4a07b243c2d63e9a4"
"sha256:7e860e3ec27b504fb797fa23c07c012a874dd736491fddbe50a20d3bdde8ace6",
"sha256:bafce2a02b06ee501df039eba5874afc7d28c9cf5ef92253327776448706556d"
],
"index": "pypi",
"version": "==2.71.0"
"version": "==2.73.0"
},
"google-auth": {
"hashes": [
@ -570,51 +563,69 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.9.2"
},
"markupsafe": {
"markdown-it-py": {
"hashes": [
"sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
"sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
"sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
"sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
"sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
"sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
"sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
"sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
"sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
"sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
"sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
"sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
"sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
"sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
"sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
"sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
"sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
"sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
"sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
"sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
"sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
"sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
"sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
"sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
"sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
"sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
"sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
"sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
"sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
"sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
"sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
"sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
"sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
"sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
"sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
"sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
"sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
"sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27",
"sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.1"
"version": "==2.1.0"
},
"markupsafe": {
"hashes": [
"sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed",
"sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc",
"sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2",
"sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460",
"sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7",
"sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0",
"sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1",
"sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa",
"sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03",
"sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323",
"sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65",
"sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013",
"sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036",
"sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f",
"sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4",
"sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419",
"sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2",
"sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619",
"sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a",
"sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a",
"sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd",
"sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7",
"sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666",
"sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65",
"sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859",
"sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625",
"sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff",
"sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156",
"sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd",
"sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba",
"sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f",
"sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1",
"sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094",
"sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a",
"sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513",
"sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed",
"sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d",
"sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3",
"sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147",
"sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c",
"sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603",
"sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601",
"sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a",
"sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1",
"sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d",
"sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3",
"sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54",
"sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2",
"sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6",
"sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.2"
},
"marshmallow": {
"hashes": [
@ -631,6 +642,14 @@
],
"version": "==1.5.1"
},
"mdurl": {
"hashes": [
"sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8",
"sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"
],
"markers": "python_version >= '3.7'",
"version": "==0.1.2"
},
"mutagen": {
"hashes": [
"sha256:6e5f8ba84836b99fe60be5fb27f84be4ad919bbb6b49caa6ae81e70584b55e58",
@ -837,10 +856,10 @@
},
"pytz": {
"hashes": [
"sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a",
"sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd"
"sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0",
"sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"
],
"version": "==2022.7"
"version": "==2022.7.1"
},
"pytz-deprecation-shim": {
"hashes": [
@ -1016,11 +1035,11 @@
},
"rich": {
"hashes": [
"sha256:25f83363f636995627a99f6e4abc52ed0970ebbd544960cc63cbb43aaac3d6f0",
"sha256:41fe1d05f433b0f4724cda8345219213d2bfa472ef56b2f64f415b5b94d51b04"
"sha256:7c963f0d03819221e9ac561e1bc866e3f95a02248c1234daa48954e6d381c003",
"sha256:f1a00cdd3eebf999a15d85ec498bfe0b1a77efe9b34f645768a54132ef444ac5"
],
"markers": "python_version >= '3.7'",
"version": "==13.0.1"
"version": "==13.2.0"
},
"rsa": {
"hashes": [
@ -1064,11 +1083,11 @@
},
"snscrape": {
"hashes": [
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
"sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2"
"sha256:106bd375d47b683f88e96acbf425747358fd851f5282a91a0fa0c6784f29f2e4",
"sha256:194078946ff53c8b2a79db7695dde351819b7849009aa137e26cda924d3ae702"
],
"index": "pypi",
"version": "==0.4.3.20220106"
"version": "==0.5.0.20230113"
},
"sortedcontainers": {
"hashes": [
@ -1310,6 +1329,22 @@
"markers": "python_version >= '3.6'",
"version": "==2.10.0"
},
"setuptools-pipfile": {
"hashes": [
"sha256:54cb6bf6a662fe74951425d509772a5302d1cf723d9a3654d19c2468d3d80b6b",
"sha256:f6049892af8e8233a438cf00fb4477fe81de3ea0e8e90c1241d196cb40f703b5"
],
"index": "pypi",
"version": "==0.7.0"
},
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
},
"tomli": {
"hashes": [
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",

4
pyproject.toml 100644
View file

@ -0,0 +1,4 @@
[build-system]
requires = ["setuptools", "wheel", "setuptools-pipfile"]
build-backend = "setuptools.build_meta"
[tool.setuptools-pipfile]

49
setup.cfg 100644
View file

@ -0,0 +1,49 @@
[metadata]
name = auto_archiver
version = 2.0.0
author = Bellingcat
author_email = tech@bellingcat.com
description = Easily archive online media content
long_description = file: README.md, LICENSE
keywords = archive, oosi, osint, scraping
license = MIT
classifiers =
Intended Audience :: Developers,
Intended Audience :: Science/Research,
License :: OSI Approved :: MIT License,
Programming Language :: Python :: 3,
[options]
setup_requires =
setuptools-pipfile
zip_safe = False
include_package_data = True
package_dir=
=src
packages=find:
find_packages=true
python_requires = >=3.8
# [options.package_data]
# * = *.txt, *.rst
# hello = *.msg
[options.entry_points]
console_scripts =
auto-archiver = auto_archiver.__main__:main
# [options.extras_require]
# pdf = ReportLab>=1.2; RXP
# rest = docutils>=0.3; pack ==1.1, ==1.3
[options.packages.find]
where=src
# include=auto_archiver*
# exclude =
# examples*
# .eggs*
# build*
# secrets*
# tmp*
# docs*
# src.tests*
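
The [options.entry_points] section above is what creates the auto-archiver command: it points at auto_archiver.__main__:main, so once the package is installed the console script and python -m auto_archiver land in the same function. A minimal sketch of that resolution (the editable-install command and the config filename are illustrative assumptions, not part of this commit):

    # Equivalent invocations once the package is installed (e.g. via pip install -e .):
    #   auto-archiver --config config.yaml
    #   python -m auto_archiver --config config.yaml
    # Both resolve to the entry point declared in setup.cfg:
    from auto_archiver.__main__ import main

    main()  # parses CLI/YAML config via ConfigV2 and runs ArchivingOrchestrator.feed()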

View file

@ -0,0 +1 @@
# from .auto_archiver import *

View file

@ -0,0 +1,7 @@
from . import archivers, databases, enrichers, feeders, formatters, storages, utils, core
# need to manually specify due to cyclical deps
from .core.orchestrator import ArchivingOrchestrator
from .core.v2config import ConfigV2
# making accessible directly
from .core.metadata import Metadata

View file

@ -0,0 +1,12 @@
from . import ConfigV2
from . import ArchivingOrchestrator
def main():
config = ConfigV2()
config.parse()
orchestrator = ArchivingOrchestrator(config)
orchestrator.feed()
if __name__ == "__main__":
main()

View file

@ -1,5 +1,5 @@
# we need to explicitly expose the available imports here
from .base_archiver import Archiver, ArchiveResult
# from .base_archiver import Archiver, ArchiveResult
# from .telegram_archiver import TelegramArchiver
# from .telethon_archiver import TelethonArchiver
# from .tiktok_archiver import TiktokArchiver

View file

@ -2,9 +2,9 @@ from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
import os
from metadata import Metadata
from steps.step import Step
import mimetypes, requests
from ..core import Metadata
from ..core import Step
@dataclass

View file

@ -11,7 +11,7 @@ from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify
from configs import Config
from ..configs import Config
from storages import Storage
from utils import mkdir_if_not_exists

View file

@ -2,10 +2,9 @@ import re, os, shutil, html, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger
from metadata import Metadata
from media import Media
from .archiver import Archiverv2
from . import Archiverv2
from ..core import Metadata
from ..core import Media
class InstagramArchiver(Archiverv2):
"""

View file

@ -4,9 +4,9 @@ import html
from bs4 import BeautifulSoup
from loguru import logger
from metadata import Metadata
from media import Media
from .archiver import Archiverv2
from . import Archiverv2
from ..core import Metadata
from ..core import Media
class TelegramArchiver(Archiverv2):

View file

@ -1,5 +1,4 @@
from archivers import Archiverv2
from metadata import Metadata
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError
from telethon.tl.types import PeerUser, PeerChat, PeerChannel
@ -9,7 +8,9 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os
from media import Media
from . import Archiverv2
from ..core import Metadata
from ..core import Media
class TelethonArchiver(Archiverv2):

View file

@ -5,9 +5,9 @@ import uuid
import tiktok_downloader
from loguru import logger
from metadata import Metadata
from media import Media
from .archiver import Archiverv2
from . import Archiverv2
from ..core import Metadata
from ..core import Media
class TiktokArchiver(Archiverv2):

View file

@ -7,10 +7,10 @@ from loguru import logger
from pytwitter import Api
from slugify import slugify
from metadata import Metadata
from media import Media
from . import Archiverv2
from .twitter_archiverv2 import TwitterArchiver
from .archiver import Archiverv2
from ..core import Metadata
from ..core import Media
class TwitterApiArchiver(TwitterArchiver, Archiverv2):

View file

@ -4,12 +4,12 @@ import json
import os
from datetime import datetime
from loguru import logger
from metadata import Metadata
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from archivers import Archiverv2
from media import Media
from slugify import slugify
from . import Archiverv2
from ..core import Metadata
from ..core import Media
class TwitterArchiver(Archiverv2):
"""

View file

@ -1,10 +1,10 @@
from loguru import logger
from vk_url_scraper import VkScraper
from utils.misc import dump_payload
from metadata import Metadata
from media import Media
from .archiver import Archiverv2
from ..utils.misc import dump_payload
from . import Archiverv2
from ..core import Metadata
from ..core import Media
class VkArchiver(Archiverv2):

View file

@ -4,9 +4,9 @@ import os
import yt_dlp
from loguru import logger
from metadata import Metadata
from media import Media
from .archiver import Archiverv2
from . import Archiverv2
from ..core import Metadata
from ..core import Media
class YoutubeDLArchiver(Archiverv2):

View file

@ -0,0 +1,7 @@
from .media import Media
from .metadata import Metadata
from .step import Step
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator
# from .v2config import ConfigV2

View file

@ -3,9 +3,11 @@ from __future__ import annotations
from ast import List
from typing import Any
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
import mimetypes
# annotation order matters
@dataclass_json
@dataclass
class Media:
filename: str

View file

@ -3,13 +3,15 @@ from __future__ import annotations
from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
import datetime, mimetypes
from urllib.parse import urlparse
from loguru import logger
from dateutil.parser import parse as parse_dt
from media import Media
from .media import Media
# annotation order matters
@dataclass_json
@dataclass
class Metadata:
status: str = "no archiver"
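
Both Media and Metadata are now decorated with @dataclass_json (hence the dataclasses-json dependency added to the Pipfile above), which attaches serialization helpers to the dataclasses. A minimal sketch of what that enables, assuming the fields not shown in these hunks have defaults; the filename value is purely illustrative:

    # dataclasses-json adds to_dict()/to_json()/from_dict()/from_json() to decorated classes
    from auto_archiver.core import Media

    m = Media(filename="archive/page.html")  # illustrative filename
    print(m.to_json())                       # JSON string with the dataclass fields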

View file

@ -3,14 +3,14 @@ from ast import List
from typing import Union, Dict
from dataclasses import dataclass
from archivers import Archiverv2
from feeders import Feeder
from formatters import Formatter
from media import Media
from storages import StorageV2
from enrichers import Enricher
from databases import Database
from metadata import Metadata
from ..archivers import Archiverv2
from ..feeders import Feeder
from ..formatters import Formatter
from ..storages import StorageV2
from ..enrichers import Enricher
from ..databases import Database
from .media import Media
from .metadata import Metadata
import tempfile, time, traceback
from loguru import logger
@ -56,17 +56,6 @@ Cisticola considerations:
class ArchivingOrchestrator:
def __init__(self, config) -> None:
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
# identify each formatter, storage, database, etc
# self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
# Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI
# where does that update/processing happen? in config.py
# reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
# self.archivers = [
# Archiver.init(a, config)
# for a in config.archivers
# ]
self.feeder: Feeder = config.feeder
self.formatter: Formatter = config.formatter
self.enrichers = config.enrichers
@ -76,50 +65,32 @@ class ArchivingOrchestrator:
for a in self.archivers: a.setup()
self.formatters = []
# self.formatters = [
# Formatter.init(f, config)
# for f in config.formatters
# ]
# self.storages = [
# Storage.init(s, config)
# for s in config.storages
# ]
# self.databases = [
# Database.init(f, config)
# for f in config.formatters
# ]
# these rules are checked in config.py
# assert len(archivers) > 1, "there needs to be at least one Archiver"
def feed(self) -> list(Metadata):
def feed(self) -> None:
for item in self.feeder:
print("ARCHIVING", item)
try:
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
item.set_tmp_dir(tmp_dir)
result = self.archive(item)
print(result)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
for d in self.databases: d.aborted(item)
exit()
except Exception as e:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.databases: d.failed(item)
self.feed_item(item)
print("holding on 5s")
time.sleep(5)
def feed_item(self, item:Metadata) -> Metadata:
print("ARCHIVING", item)
try:
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
item.set_tmp_dir(tmp_dir)
result = self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
for d in self.databases: d.aborted(item)
exit()
except Exception as e:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.databases: d.failed(item)
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
# solution: feeders have context: extra metadata that they can read or ignore,
# all of it should have sensible defaults (eg: folder)
# default feeder is a list with 1 element
return result
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
# solution: feeders have context: extra metadata that they can read or ignore,
# all of it should have sensible defaults (eg: folder)
# default feeder is a list with 1 element
def archive(self, result: Metadata) -> Union[Metadata, None]:
url = result.get_url()

View file

@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass, field
from inspect import ClassFoundException
from typing import Type
from metadata import Metadata
from ..core import Metadata
from abc import ABC
# from collections.abc import Iterable

View file

@ -3,15 +3,16 @@
import argparse, yaml
from dataclasses import dataclass, field
from typing import List
from archivers import Archiverv2
from feeders import Feeder
from databases import Database
from formatters import Formatter
from storages import StorageV2
from steps.step import Step
from enrichers import Enricher
from collections import defaultdict
from ..archivers import Archiverv2
from ..feeders import Feeder
from ..databases import Database
from ..formatters import Formatter
from ..storages import StorageV2
from . import Step
from ..enrichers import Enricher
@dataclass
class ConfigV2:
@ -37,17 +38,22 @@ class ConfigV2:
self.defaults = {}
self.cli_ops = {}
self.config = {}
# TODO: make this work for nested props like gsheet_feeder.columns.url = "URL"
def parse(self):
# 1. parse CLI values
parser = argparse.ArgumentParser(
# prog = "auto-archiver",
description="Auto Archiver is a ...!",
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
)
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
def parse(self, use_cli=True, yaml_config_filename: str = None):
"""
if yaml_config_filename is provided, the --config argument is ignored,
useful for library usage when the config values are preloaded
"""
# 1. parse CLI values
if use_cli:
parser = argparse.ArgumentParser(
# prog = "auto-archiver",
description="Auto Archiver is a ...!", # TODO: update
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
)
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
for configurable in self.configurable_parents:
child: Step
@ -57,28 +63,32 @@ class ConfigV2:
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
assert "." not in config, f"config property cannot contain dots('.'): {config}"
config_path = f"{child.name}.{config}"
try:
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
except argparse.ArgumentError:
# captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver
pass
if use_cli:
try:
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
except argparse.ArgumentError:
# captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver
pass
self.defaults[config_path] = details["default"]
if "cli_set" in details:
self.cli_ops[config_path] = details["cli_set"]
args = parser.parse_args()
if use_cli:
args = parser.parse_args()
yaml_config_filename = yaml_config_filename or getattr(args, "config")
else: args = {}
# 2. read YAML config file
with open(args.config, "r", encoding="utf-8") as inf:
self.yaml_config = yaml.safe_load(inf)
# 2. read YAML config file (or use provided value)
self.yaml_config = self.read_yaml(yaml_config_filename)
# print(f"{self.yaml_config.get('configurations', {})=}")
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
self.config = defaultdict(dict)
for config_path, default in self.defaults.items():
child, config = tuple(config_path.split("."))
val = getattr(args, config_path)
val = getattr(args, config_path, None)
if val is not None and config_path in self.cli_ops:
val = self.cli_ops[config_path](val, default)
if val is None:
@ -108,5 +118,6 @@ class ConfigV2:
print("storages", [e for e in self.storages])
print("formatter", self.formatter)
def validate(self):
pass
def read_yaml(self, yaml_filename: str) -> dict:
with open(yaml_filename, "r", encoding="utf-8") as inf:
return yaml.safe_load(inf)
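
With the new use_cli and yaml_config_filename parameters, ConfigV2 can be driven entirely from code without argparse reading sys.argv, which is the "library usage" the docstring refers to. A minimal sketch, assuming the package is importable and a valid configuration file exists at the (illustrative) path:

    from auto_archiver import ConfigV2, ArchivingOrchestrator

    config = ConfigV2()
    config.parse(use_cli=False, yaml_config_filename="orchestration.yaml")  # path is an assumption
    ArchivingOrchestrator(config).feed()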

View file

@ -0,0 +1,3 @@
from .database import Database
from .gsheet_db import GsheetsDb
from .console_db import ConsoleDb

View file

@ -0,0 +1,32 @@
from loguru import logger
from . import Database
from ..core import Metadata
class ConsoleDb(Database):
"""
Outputs results to the console
"""
name = "console_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
@staticmethod
def configs() -> dict:
return {}
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
def failed(self, item: Metadata) -> None:
logger.error(f"FAILED {item}")
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
def done(self, item: Metadata) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item}")
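
ConsoleDb also serves as a template for other Database steps: the orchestrator changes above call aborted() and failed() on every configured database, and the remaining hooks follow the same pattern. A hypothetical subclass following the same contract (the class, its name, and its behavior are illustrative, not part of this commit):

    from loguru import logger
    from . import Database
    from ..core import Metadata

    class QuietDb(Database):  # hypothetical example
        """Only reports finished items; the other lifecycle hooks are no-ops"""
        name = "quiet_db"

        def __init__(self, config: dict) -> None:
            # without this Step.__init__ is not called
            super().__init__(config)

        @staticmethod
        def configs() -> dict:
            return {}

        def started(self, item: Metadata) -> None: pass
        def failed(self, item: Metadata) -> None: pass
        def aborted(self, item: Metadata) -> None: pass

        def done(self, item: Metadata) -> None:
            logger.success(f"DONE {item}")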

View file

@ -2,8 +2,8 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from typing import Union
from metadata import Metadata
from steps.step import Step
from ..core import Metadata
from ..core import Step
@dataclass

View file

@ -5,11 +5,11 @@ import gspread, datetime
from loguru import logger
# from . import Enricher
from databases import Database
from metadata import Metadata
from media import Media
from steps.gsheet import Gsheets
from utils import GWorksheet
from . import Database
from ..core import Metadata
from ..core import Media
from ..utils import Gsheets
from ..utils import GWorksheet
class GsheetsDb(Database):
@ -91,7 +91,7 @@ class GsheetsDb(Database):
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
gw: GWorksheet = item.get("gsheet").get("worksheet")
row: int = item.get("gsheet").get("row")
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
return gw, row

View file

@ -1,8 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from metadata import Metadata
from steps.step import Step
from ..core import Metadata
from ..core import Step
@dataclass
class Enricher(Step, ABC):

View file

@ -1,11 +1,12 @@
import hashlib
from utils import Webdriver
from . import Enricher
from metadata import Metadata
from loguru import logger
from selenium.common.exceptions import TimeoutException
import time, requests
from . import Enricher
from ..utils import Webdriver
from ..core import Metadata
class HashEnricher(Enricher):
"""

View file

@ -1,11 +1,11 @@
from media import Media
from utils import Webdriver
from . import Enricher
from metadata import Metadata
from loguru import logger
import time, uuid, os
from selenium.common.exceptions import TimeoutException
from . import Enricher
from ..utils import Webdriver
from ..core import Media
from ..core import Metadata
class ScreenshotEnricher(Enricher):
name = "screenshot_enricher"

View file

@ -1,10 +1,10 @@
import uuid
from media import Media
from . import Enricher
from metadata import Metadata
from loguru import logger
import ffmpeg, os
from . import Enricher
from ..core import Media
from ..core import Metadata
class ThumbnailEnricher(Enricher):
"""

View file

@ -2,13 +2,13 @@ import os
import shutil
import subprocess
import uuid
from archivers.archiver import Archiverv2
from media import Media
from . import Enricher
from metadata import Metadata
from loguru import logger
import time, requests
from ..core import Media
from ..core import Metadata
from . import Enricher
class WaczEnricher(Enricher):
"""

View file

@ -1,9 +1,9 @@
from archivers.archiver import Archiverv2
from . import Enricher
from metadata import Metadata
from loguru import logger
import time, requests
from . import Enricher
from ..archivers import Archiverv2
from ..core import Metadata
class WaybackArchiverEnricher(Enricher, Archiverv2):
"""

View file

@ -1,8 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from metadata import Metadata
from steps.step import Step
from ..core import Metadata
from ..core import Step
@dataclass

View file

@ -2,13 +2,13 @@ import gspread, os
# from metadata import Metadata
from loguru import logger
from slugify import slugify
# from . import Enricher
from feeders import Feeder
from metadata import Metadata
from steps.gsheet import Gsheets
from utils import GWorksheet
from slugify import slugify
from . import Feeder
from ..core import Metadata
from ..utils import Gsheets
from ..utils import GWorksheet
class GsheetsFeeder(Gsheets, Feeder):
name = "gsheet_feeder"

View file

@ -1,8 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from metadata import Metadata
from steps.step import Step
from ..core import Metadata
from ..core import Step
@dataclass

View file

@ -2,12 +2,13 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
import mimetypes
from metadata import Metadata
from media import Media
from formatters import Formatter
from jinja2 import Environment, FileSystemLoader
import uuid, os, pathlib
from ..core import Metadata
from ..core import Media
from . import Formatter
@dataclass
class HtmlFormatter(Formatter):
@ -72,5 +73,6 @@ def is_audio_jinja(s: str) -> bool:
m = mimetypes.guess_type(s)[0]
return "audio" in (m or "")
def is_media_jinja(v) -> bool:
return isinstance(v, Media)

View file

@ -0,0 +1,9 @@
# we need to explicitly expose the available imports here
from .base_storage import Storage
# from .local_storage import LocalStorage, LocalConfig
# from .s3_storage import S3Config, S3Storage
# from .gd_storage import GDConfig, GDStorage
from .storage import StorageV2
from .s3 import S3StorageV2
from .local import LocalStorageV2

View file

@ -3,12 +3,13 @@ import shutil
from typing import IO, Any
import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from metadata import Metadata
from media import Media
from storages import StorageV2
from loguru import logger
from slugify import slugify
from ..core import Metadata
from ..core import Media
from ..storages import StorageV2
class LocalStorageV2(StorageV2):
name = "local_storage"

View file

@ -2,9 +2,9 @@
from typing import IO, Any
import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from metadata import Metadata
from media import Media
from storages import StorageV2
from ..core import Metadata
from ..core import Media
from ..storages import StorageV2
from loguru import logger
from slugify import slugify

View file

@ -2,9 +2,8 @@ from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from typing import IO, Any
from media import Media
from metadata import Metadata
from steps.step import Step
from ..core import Media, Metadata, Step
from loguru import logger
import os, uuid
from slugify import slugify

View file

@ -2,4 +2,5 @@
from .gworksheet import GWorksheet
from .misc import *
from .util import Util
from .webdriver import Webdriver
from .webdriver import Webdriver
from .gsheet import Gsheets

View file

@ -1,7 +1,7 @@
import json, gspread
from loguru import logger
from steps.step import Step
from ..core import Step
class Gsheets(Step):

View file

@ -1,8 +1,7 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from metadata import Metadata
from steps.step import Step
from ..core import Metadata, Step
#TODO: likely unused
@dataclass

View file

@ -1,7 +0,0 @@
from .config import Config
from .selenium_config import SeleniumConfig
from .telethon_config import TelethonConfig
from .wayback_config import WaybackConfig
from .twitter_api_config import TwitterApiConfig
from .vk_config import VkConfig
from .instagram_config import InstagramConfig

View file

@ -1,7 +0,0 @@
from dataclasses import dataclass
@dataclass
class BrowsertrixConfig:
enabled: bool
profile: str
timeout_seconds: str

View file

@ -1,309 +0,0 @@
import argparse, yaml, json, os
import gspread
from loguru import logger
from selenium import webdriver
from dataclasses import asdict
from selenium.common.exceptions import TimeoutException
from utils import GWorksheet, getattr_or
from .wayback_config import WaybackConfig
from .telethon_config import TelethonConfig
from .selenium_config import SeleniumConfig
from .vk_config import VkConfig
from .twitter_api_config import TwitterApiConfig
from .browsertrix_config import BrowsertrixConfig
from .instagram_config import InstagramConfig
from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
class Config:
"""
Controls the current execution parameters and manages API configurations
Usage:
c = Config() # initializes the argument parser
c.parse() # parses the values and initializes the Services and API clients
# you can then access the Services and APIs like 'c.s3_config'
All the configurations available as cmd line options, when included, will
override the configurations in the config.yaml file.
Configurations are split between:
1. "secrets" containing API keys for generating services - not kept in memory
2. "execution" containing specific execution configurations
"""
AVAILABLE_STORAGES = {"s3", "gd", "local"}
def __init__(self):
self.parser = self.get_argument_parser()
self.folder = ""
self.is_docker = bool(os.environ.get("IS_DOCKER", 0))
def parse(self):
self.args = self.parser.parse_args()
logger.success(f'Command line arguments parsed successfully')
self.config_file = self.args.config
self.read_config_yaml()
logger.info(f'APIs and Services initialized:\n{self}')
def read_config_yaml(self):
with open(self.config_file, "r", encoding="utf-8") as inf:
self.config = yaml.safe_load(inf)
self.url = getattr_or(self.args, "url", '')
# ----------------------EXECUTION - execution configurations
execution = self.config.get("execution", {})
self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
def ensure_set(l):
# always returns a set of strings, can receive a set or a string
l = l if isinstance(l, list) else [l]
return set([x for x in l if isinstance(x, str) and len(x) > 0])
self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
if self.save_logs:
self.set_log_files()
self.check_if_exists = getattr(self.args, "check_if_exists") or execution.get("check_if_exists", False)
# Column names come from config and can be overwritten by CMD
# in the end all are considered as lower case
config_column_names = execution.get("column_names", {})
self.column_names = {}
for k in GWorksheet.COLUMN_NAMES.keys():
self.column_names[k] = getattr_or(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower()
# selenium driver
selenium_configs = execution.get("selenium", {})
self.selenium_config = SeleniumConfig(
timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)),
window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)),
window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height))
)
self.webdriver = "not initialized"
# browsertrix config
browsertrix_configs = execution.get("browsertrix", {})
if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
browsertrix_profile = os.path.abspath(browsertrix_profile)
self.browsertrix_config = BrowsertrixConfig(
enabled=bool(browsertrix_configs.get("enabled", False)),
profile=browsertrix_profile,
timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
)
self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
# ---------------------- SECRETS - APIs and service configurations
secrets = self.config.get("secrets", {})
# assert selected storage credentials exist
for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]:
assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"
# google sheets config
self.gsheets_client = gspread.service_account(
filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
)
# facebook config
self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None)
# s3 config
if "s3" in secrets:
s3 = secrets["s3"]
self.s3_config = S3Config(
bucket=s3["bucket"],
region=s3["region"],
key=s3["key"],
secret=s3["secret"],
endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
cdn_url=s3.get("cdn_url", S3Config.cdn_url),
key_path=s3.get("key_path", S3Config.key_path),
private=getattr_or(self.args, "s3-private", s3.get("private", S3Config.private))
)
# GDrive config
if "google_drive" in secrets:
gd = secrets["google_drive"]
self.gd_config = GDConfig(
root_folder_id=gd.get("root_folder_id"),
oauth_token_filename=gd.get("oauth_token_filename"),
service_account=gd.get("service_account", GDConfig.service_account)
)
if "local" in secrets:
self.local_config = LocalConfig(
save_to=secrets["local"].get("save_to", LocalConfig.save_to),
)
# wayback machine config
if "wayback" in secrets:
self.wayback_config = WaybackConfig(
key=secrets["wayback"]["key"],
secret=secrets["wayback"]["secret"],
)
else:
self.wayback_config = None
logger.debug(f"'wayback' key not present in the {self.config_file=}")
# telethon config
if "telegram" in secrets:
self.telegram_config = TelethonConfig(
api_id=secrets["telegram"]["api_id"],
api_hash=secrets["telegram"]["api_hash"],
bot_token=secrets["telegram"].get("bot_token", None),
session_file=secrets["telegram"].get("session_file", "./anon")
)
else:
self.telegram_config = None
logger.debug(f"'telegram' key not present in the {self.config_file=}")
# twitter config
if "twitter" in secrets:
self.twitter_config = TwitterApiConfig(
bearer_token=secrets["twitter"].get("bearer_token"),
consumer_key=secrets["twitter"].get("consumer_key"),
consumer_secret=secrets["twitter"].get("consumer_secret"),
access_token=secrets["twitter"].get("access_token"),
access_secret=secrets["twitter"].get("access_secret"),
)
else:
self.twitter_config = None
logger.debug(f"'twitter' key not present in the {self.config_file=}")
# vk config
if "vk" in secrets:
self.vk_config = VkConfig(
username=secrets["vk"]["username"],
password=secrets["vk"]["password"],
session_file=secrets["vk"].get("session_file", "./vk_config.v2.json")
)
else:
self.vk_config = None
logger.debug(f"'vk' key not present in the {self.config_file=}")
# instagram config
if "instagram" in secrets:
self.instagram_config = InstagramConfig(
username=secrets["instagram"]["username"],
password=secrets["instagram"]["password"],
session_file=secrets["instagram"].get("session_file", "instaloader.session")
)
else:
self.instagram_config = None
logger.debug(f"'instagram' key not present in the {self.config_file=}")
del self.config["secrets"] # delete to prevent leaks
def set_log_files(self):
# called only when config.execution.save_logs=true
logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
logger.add("logs/3success.log", level="SUCCESS")
logger.add("logs/4warning.log", level="WARNING")
logger.add("logs/5error.log", level="ERROR")
def get_argument_parser(self):
"""
Creates the CMD line arguments. 'python auto_archive.py --help'
"""
parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')
parser.add_argument('--url', action='store', dest='url', help='single URL to archive - to use only via cli.py and not google sheets interaction')
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]')
parser.add_argument('--check-if-exists', action='store_true', dest='check_if_exists', help='when possible checks if the URL has been archived before and does not archive the same URL twice [exceution.check_if_exists]')
parser.add_argument('--save-logs', action='store_true', dest='save_logs', help='creates or appends execution logs to files logs/LEVEL.log [exceution.save_logs]')
parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]')
for k, v in GWorksheet.COLUMN_NAMES.items():
help = f"the name of the column to FILL WITH {k} (default='{v}')"
if k in ["url", "folder"]:
help = f"the name of the column to READ {k} FROM (default='{v}')"
parser.add_argument(f'--col-{k}', action='store', dest=k, help=help)
return parser
def set_folder(self, folder):
"""
update the folder in each of the storages
"""
self.folder = folder
logger.info(f"setting folder to {folder}")
# s3
if hasattr(self, "s3_config"): self.s3_config.folder = folder
if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
# gdrive
if hasattr(self, "gd_config"): self.gd_config.folder = folder
if hasattr(self, "gd_storage"): self.gd_storage.folder = folder
# local
if hasattr(self, "local_config"): self.local_config.folder = folder
if hasattr(self, "local_storage"): self.local_storage.folder = folder
def get_storage(self):
"""
returns the configured type of storage, creating if needed
"""
if self.storage == "s3":
self.s3_storage = getattr_or(self, "s3_storage", S3Storage(self.s3_config))
return self.s3_storage
elif self.storage == "gd":
self.gd_storage = getattr_or(self, "gd_storage", GDStorage(self.gd_config))
return self.gd_storage
elif self.storage == "local":
self.local_storage = getattr_or(self, "local_storage", LocalStorage(self.local_config))
return self.local_storage
raise f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}"
def destroy_webdriver(self):
if self.webdriver is not None and type(self.webdriver) != str:
self.webdriver.close()
self.webdriver.quit()
del self.webdriver
def recreate_webdriver(self):
options = webdriver.FirefoxOptions()
options.headless = True
options.set_preference('network.protocol-handler.external.tg', False)
try:
new_webdriver = webdriver.Firefox(options=options)
# only destroy if creation is successful
self.destroy_webdriver()
self.webdriver = new_webdriver
self.webdriver.set_window_size(self.selenium_config.window_width,
self.selenium_config.window_height)
self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
except TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
def __str__(self) -> str:
return json.dumps({
"config_file": self.config_file,
"sheet": self.sheet,
"worksheet_allow": list(self.worksheet_allow),
"worksheet_block": list(self.worksheet_block),
"storage": self.storage,
"header": self.header,
"check_if_exists": self.check_if_exists,
"hash_algorithm": self.hash_algorithm,
"browsertrix_config": asdict(self.browsertrix_config),
"save_logs": self.save_logs,
"selenium_config": asdict(self.selenium_config),
"selenium_webdriver": self.webdriver != None,
"s3_config": hasattr(self, "s3_config"),
"s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None),
"gd_config": hasattr(self, "gd_config"),
"local_config": hasattr(self, "local_config"),
"wayback_config": self.wayback_config != None,
"telegram_config": self.telegram_config != None,
"twitter_config": self.twitter_config != None,
"vk_config": self.vk_config != None,
"gsheets_client": self.gsheets_client != None,
"column_names": self.column_names,
}, ensure_ascii=False, indent=4)

View file

@ -1,9 +0,0 @@
from dataclasses import dataclass
@dataclass
class InstagramConfig:
username: str
password: str
session_file: str

View file

@ -1,8 +0,0 @@
from dataclasses import dataclass
@dataclass
class SeleniumConfig:
timeout_seconds: int = 120
window_width: int = 1400
window_height: int = 2000

View file

@ -1,10 +0,0 @@
from dataclasses import dataclass
@dataclass
class TelethonConfig:
api_id: str
api_hash: str
bot_token: str
session_file: str

View file

@ -1,11 +0,0 @@
from dataclasses import dataclass
@dataclass
class TwitterApiConfig:
bearer_token: str
consumer_key: str
consumer_secret: str
access_token: str
access_secret: str

View file

@ -1,9 +0,0 @@
from dataclasses import dataclass
@dataclass
class VkConfig:
username: str
password: str
session_file: str

View file

@ -1,8 +0,0 @@
from dataclasses import dataclass
@dataclass
class WaybackConfig:
key: str
secret: str

View file

@ -1,2 +0,0 @@
from .database import Database
from .gsheet_db import GsheetsDb

View file

@ -1,9 +0,0 @@
# we need to explicitly expose the available imports here
from .base_storage import Storage
from .local_storage import LocalStorage, LocalConfig
from .s3_storage import S3Config, S3Storage
from .gd_storage import GDConfig, GDStorage
from .storage import StorageV2
from .s3 import S3StorageV2
from .local import LocalStorageV2

View file

@ -1,36 +0,0 @@
import os
from dataclasses import dataclass
from loguru import logger
from .base_storage import Storage
from utils import mkdir_if_not_exists
@dataclass
class LocalConfig:
folder: str = ""
save_to: str = "./"
class LocalStorage(Storage):
def __init__(self, config:LocalConfig):
self.folder = config.folder
self.save_to = config.save_to
mkdir_if_not_exists(self.save_to)
def get_cdn_url(self, key):
key = self.clean_key(key)
logger.info(f"{key=}")
full_path = os.path.join(self.save_to, self.folder, key)
logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}")
os.makedirs(os.path.dirname(full_path), exist_ok=True)
# mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
return os.path.abspath(full_path)
def exists(self, key):
return os.path.isfile(self.get_cdn_url(key))
def uploadf(self, file, key, **kwargs):
path = self.get_cdn_url(key)
with open(path, "wb") as outf:
outf.write(file.read())

View file

@ -1,12 +0,0 @@
from abc import ABC
from configs.v2config import ConfigV2
from orchestrator import ArchivingOrchestrator
config = ConfigV2()
config.parse()
orchestrator = ArchivingOrchestrator(config)
orchestrator.feed()