kopia lustrzana https://github.com/bellingcat/auto-archiver
pyproject
rodzic
ea2c266fa2
commit
753039240f
|
@ -24,4 +24,6 @@ browsertrix/*
|
|||
browsertrix-tmp/*
|
||||
instaloader/*
|
||||
instaloader.session
|
||||
orchestration.yaml
|
||||
orchestration.yaml
|
||||
auto_archiver.egg-info*
|
||||
logs*
|
2
Pipfile
2
Pipfile
|
@ -29,9 +29,11 @@ instaloader = "*"
|
|||
tqdm = "*"
|
||||
jinja2 = "*"
|
||||
cryptography = "==38.0.4"
|
||||
dataclasses-json = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
|
||||
[dev-packages]
|
||||
autopep8 = "*"
|
||||
setuptools-pipfile = "*"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "bcc36e9ecdf6d383a1010629484eec271699ac23b40be045d9a9669b4c9fac8c"
|
||||
"sha256": "e2f5d017d9bc9eef90cced189b6e3017d740c35d204962479417109a4deeb7f4"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
|
@ -57,19 +57,19 @@
|
|||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:96055651f7be882175aa334ad46528e1ad79fb8ca33fa9c3998cc1d985b34eab",
|
||||
"sha256:e24d65c31780c208768ebcd152d8a0181591c9c8e7d971e23f318d7f41910ba1"
|
||||
"sha256:4e876ba5d64928cde0c416dd844f04f22d6b73d14002bbc3ca55591f80f49927",
|
||||
"sha256:c729bb0af76e85a2776b6bd3da8d9fa0f4b91b425eab51612aa53956f644ee23"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.26.46"
|
||||
"version": "==1.26.54"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:78bf25933e35eb6354a9e80fe156f86dce4d346a92afe364dfce25c17ab0639f",
|
||||
"sha256:dbac2fde265f13beb9191ec3ff63b90b515e9ed63875edc3afbd72c5f585e48b"
|
||||
"sha256:ca3ef7588daa664fe196d3234718db5f6b5dab961507500b4bb921e31133eea1",
|
||||
"sha256:f2fe17ed6b8e163769a715f81cb6ce3d4628d172918de535256bdf34d29b704f"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.29.46"
|
||||
"version": "==1.29.54"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
|
@ -269,17 +269,10 @@
|
|||
},
|
||||
"cloudscraper": {
|
||||
"hashes": [
|
||||
"sha256:2776c70f3661c028e59fd306ac2b104882c9b3cb3f798086251e00fc2d72c3a2",
|
||||
"sha256:3b9753724616ac4d811e7922ddc9dba9b4419749ebaa35b0ba503d442522df2e"
|
||||
"sha256:401409859697edae9384a7623b450cc97ab14dd0b2c8cdcac62edc2d50b31741",
|
||||
"sha256:4d02aceffa90abd4dabc75b79bafa31636309baa7c0f2ee665e2d345aadb8863"
|
||||
],
|
||||
"version": "==1.2.67"
|
||||
},
|
||||
"commonmark": {
|
||||
"hashes": [
|
||||
"sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60",
|
||||
"sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"
|
||||
],
|
||||
"version": "==0.9.1"
|
||||
"version": "==1.2.68"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
|
@ -318,16 +311,16 @@
|
|||
"sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd",
|
||||
"sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"index": "pypi",
|
||||
"version": "==0.5.7"
|
||||
},
|
||||
"dateparser": {
|
||||
"hashes": [
|
||||
"sha256:107f3cc87a60770e10d111349adc1504224a6b60753a47a64b0ec842ab85b5a9",
|
||||
"sha256:ceb159f1b4a9df54ed6209e91298097deafde476037f8611b4cb2b1cb8b31c58"
|
||||
"sha256:c47b6e4b8c4b2b2a21690111b6571b6991295ba327ec6503753abeebf5e80696",
|
||||
"sha256:e703db1815270c020552f4b3e3a981937b48b2cbcfcef5347071b74788dd9214"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.1.5"
|
||||
"version": "==1.1.6"
|
||||
},
|
||||
"exceptiongroup": {
|
||||
"hashes": [
|
||||
|
@ -363,10 +356,10 @@
|
|||
},
|
||||
"future": {
|
||||
"hashes": [
|
||||
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
|
||||
"sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.18.2"
|
||||
"version": "==0.18.3"
|
||||
},
|
||||
"google-api-core": {
|
||||
"hashes": [
|
||||
|
@ -378,11 +371,11 @@
|
|||
},
|
||||
"google-api-python-client": {
|
||||
"hashes": [
|
||||
"sha256:9412ad3445518fa9d24d02c673a70b07c9d124990f44763cdf4f5304ca5b4d08",
|
||||
"sha256:a4ea351db2bb2a9b1a7e96d8fa8de0fcbc31d9e237b724f4a07b243c2d63e9a4"
|
||||
"sha256:7e860e3ec27b504fb797fa23c07c012a874dd736491fddbe50a20d3bdde8ace6",
|
||||
"sha256:bafce2a02b06ee501df039eba5874afc7d28c9cf5ef92253327776448706556d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.71.0"
|
||||
"version": "==2.73.0"
|
||||
},
|
||||
"google-auth": {
|
||||
"hashes": [
|
||||
|
@ -570,51 +563,69 @@
|
|||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==4.9.2"
|
||||
},
|
||||
"markupsafe": {
|
||||
"markdown-it-py": {
|
||||
"hashes": [
|
||||
"sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
|
||||
"sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
|
||||
"sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
|
||||
"sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
|
||||
"sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
|
||||
"sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
|
||||
"sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
|
||||
"sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
|
||||
"sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
|
||||
"sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
|
||||
"sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
|
||||
"sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
|
||||
"sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
|
||||
"sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
|
||||
"sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
|
||||
"sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
|
||||
"sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
|
||||
"sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
|
||||
"sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
|
||||
"sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
|
||||
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
|
||||
"sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
|
||||
"sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
|
||||
"sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
|
||||
"sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
|
||||
"sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
|
||||
"sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
|
||||
"sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
|
||||
"sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
|
||||
"sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
|
||||
"sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
|
||||
"sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
|
||||
"sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
|
||||
"sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
|
||||
"sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
|
||||
"sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
|
||||
"sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
|
||||
"sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
|
||||
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
|
||||
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
|
||||
"sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27",
|
||||
"sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.1"
|
||||
"version": "==2.1.0"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
"sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed",
|
||||
"sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc",
|
||||
"sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2",
|
||||
"sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460",
|
||||
"sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7",
|
||||
"sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0",
|
||||
"sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1",
|
||||
"sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa",
|
||||
"sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03",
|
||||
"sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323",
|
||||
"sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65",
|
||||
"sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013",
|
||||
"sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036",
|
||||
"sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f",
|
||||
"sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4",
|
||||
"sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419",
|
||||
"sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2",
|
||||
"sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619",
|
||||
"sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a",
|
||||
"sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a",
|
||||
"sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd",
|
||||
"sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7",
|
||||
"sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666",
|
||||
"sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65",
|
||||
"sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859",
|
||||
"sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625",
|
||||
"sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff",
|
||||
"sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156",
|
||||
"sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd",
|
||||
"sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba",
|
||||
"sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f",
|
||||
"sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1",
|
||||
"sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094",
|
||||
"sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a",
|
||||
"sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513",
|
||||
"sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed",
|
||||
"sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d",
|
||||
"sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3",
|
||||
"sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147",
|
||||
"sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c",
|
||||
"sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603",
|
||||
"sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601",
|
||||
"sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a",
|
||||
"sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1",
|
||||
"sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d",
|
||||
"sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3",
|
||||
"sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54",
|
||||
"sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2",
|
||||
"sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6",
|
||||
"sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.2"
|
||||
},
|
||||
"marshmallow": {
|
||||
"hashes": [
|
||||
|
@ -631,6 +642,14 @@
|
|||
],
|
||||
"version": "==1.5.1"
|
||||
},
|
||||
"mdurl": {
|
||||
"hashes": [
|
||||
"sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8",
|
||||
"sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==0.1.2"
|
||||
},
|
||||
"mutagen": {
|
||||
"hashes": [
|
||||
"sha256:6e5f8ba84836b99fe60be5fb27f84be4ad919bbb6b49caa6ae81e70584b55e58",
|
||||
|
@ -837,10 +856,10 @@
|
|||
},
|
||||
"pytz": {
|
||||
"hashes": [
|
||||
"sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a",
|
||||
"sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd"
|
||||
"sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0",
|
||||
"sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"
|
||||
],
|
||||
"version": "==2022.7"
|
||||
"version": "==2022.7.1"
|
||||
},
|
||||
"pytz-deprecation-shim": {
|
||||
"hashes": [
|
||||
|
@ -1016,11 +1035,11 @@
|
|||
},
|
||||
"rich": {
|
||||
"hashes": [
|
||||
"sha256:25f83363f636995627a99f6e4abc52ed0970ebbd544960cc63cbb43aaac3d6f0",
|
||||
"sha256:41fe1d05f433b0f4724cda8345219213d2bfa472ef56b2f64f415b5b94d51b04"
|
||||
"sha256:7c963f0d03819221e9ac561e1bc866e3f95a02248c1234daa48954e6d381c003",
|
||||
"sha256:f1a00cdd3eebf999a15d85ec498bfe0b1a77efe9b34f645768a54132ef444ac5"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==13.0.1"
|
||||
"version": "==13.2.0"
|
||||
},
|
||||
"rsa": {
|
||||
"hashes": [
|
||||
|
@ -1064,11 +1083,11 @@
|
|||
},
|
||||
"snscrape": {
|
||||
"hashes": [
|
||||
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
|
||||
"sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2"
|
||||
"sha256:106bd375d47b683f88e96acbf425747358fd851f5282a91a0fa0c6784f29f2e4",
|
||||
"sha256:194078946ff53c8b2a79db7695dde351819b7849009aa137e26cda924d3ae702"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.4.3.20220106"
|
||||
"version": "==0.5.0.20230113"
|
||||
},
|
||||
"sortedcontainers": {
|
||||
"hashes": [
|
||||
|
@ -1310,6 +1329,22 @@
|
|||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.10.0"
|
||||
},
|
||||
"setuptools-pipfile": {
|
||||
"hashes": [
|
||||
"sha256:54cb6bf6a662fe74951425d509772a5302d1cf723d9a3654d19c2468d3d80b6b",
|
||||
"sha256:f6049892af8e8233a438cf00fb4477fe81de3ea0e8e90c1241d196cb40f703b5"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.7.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"tomli": {
|
||||
"hashes": [
|
||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
[build-system]
|
||||
requires = ["setuptools", "wheel", "setuptools-pipfile"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
[tool.setuptools-pipfile]
|
|
@ -0,0 +1,49 @@
|
|||
[metadata]
|
||||
name = auto_archiver
|
||||
version = 2.0.0
|
||||
author = Bellingcat
|
||||
author_email = tech@bellingcat.com
|
||||
description = Easily archive online media content
|
||||
long_description = file: README.md, LICENSE
|
||||
keywords = archive, oosi, osint, scraping
|
||||
license = MIT
|
||||
classifiers =
|
||||
Intended Audience :: Developers,
|
||||
Intended Audience :: Science/Research,
|
||||
License :: OSI Approved :: MIT License,
|
||||
Programming Language :: Python :: 3,
|
||||
|
||||
[options]
|
||||
setup_requires =
|
||||
setuptools-pipfile
|
||||
zip_safe = False
|
||||
include_package_data = True
|
||||
package_dir=
|
||||
=src
|
||||
packages=find:
|
||||
find_packages=true
|
||||
python_requires = >=3.8
|
||||
|
||||
# [options.package_data]
|
||||
# * = *.txt, *.rst
|
||||
# hello = *.msg
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
auto-archiver = auto_archiver.__main__:main
|
||||
|
||||
# [options.extras_require]
|
||||
# pdf = ReportLab>=1.2; RXP
|
||||
# rest = docutils>=0.3; pack ==1.1, ==1.3
|
||||
|
||||
[options.packages.find]
|
||||
where=src
|
||||
# include=auto_archiver*
|
||||
# exclude =
|
||||
# examples*
|
||||
# .eggs*
|
||||
# build*
|
||||
# secrets*
|
||||
# tmp*
|
||||
# docs*
|
||||
# src.tests*
|
|
@ -0,0 +1 @@
|
|||
# from .auto_archiver import *
|
|
@ -0,0 +1,7 @@
|
|||
from . import archivers, databases, enrichers, feeders, formatters, storages, utils, core
|
||||
|
||||
# need to manually specify due to cyclical deps
|
||||
from .core.orchestrator import ArchivingOrchestrator
|
||||
from .core.v2config import ConfigV2
|
||||
# making accessible directly
|
||||
from .core.metadata import Metadata
|
|
@ -0,0 +1,12 @@
|
|||
from . import ConfigV2
|
||||
from . import ArchivingOrchestrator
|
||||
|
||||
def main():
    """Command-line entry point for the auto-archiver package.

    Creates a ``ConfigV2``, calls its ``parse()`` method, hands the parsed
    configuration to an ``ArchivingOrchestrator`` and calls ``feed()`` on it.
    """
    cfg = ConfigV2()
    cfg.parse()
    ArchivingOrchestrator(cfg).feed()


# Run the entry point only when executed as a script/module, not on import.
if __name__ == "__main__":
    main()
|
|
@ -1,5 +1,5 @@
|
|||
# we need to explicitly expose the available imports here
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
# from .base_archiver import Archiver, ArchiveResult
|
||||
# from .telegram_archiver import TelegramArchiver
|
||||
# from .telethon_archiver import TelethonArchiver
|
||||
# from .tiktok_archiver import TiktokArchiver
|
|
@ -2,9 +2,9 @@ from __future__ import annotations
|
|||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
import os
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
import mimetypes, requests
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
|
||||
|
||||
@dataclass
|
|
@ -11,7 +11,7 @@ from selenium.common.exceptions import TimeoutException
|
|||
from selenium.webdriver.common.by import By
|
||||
from slugify import slugify
|
||||
|
||||
from configs import Config
|
||||
from ..configs import Config
|
||||
from storages import Storage
|
||||
from utils import mkdir_if_not_exists
|
||||
|
|
@ -2,10 +2,9 @@ import re, os, shutil, html, traceback
|
|||
import instaloader # https://instaloader.github.io/as-module.html
|
||||
from loguru import logger
|
||||
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from .archiver import Archiverv2
|
||||
|
||||
from . import Archiverv2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
class InstagramArchiver(Archiverv2):
|
||||
"""
|
|
@ -4,9 +4,9 @@ import html
|
|||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from .archiver import Archiverv2
|
||||
from . import Archiverv2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
|
||||
class TelegramArchiver(Archiverv2):
|
|
@ -1,5 +1,4 @@
|
|||
from archivers import Archiverv2
|
||||
from metadata import Metadata
|
||||
|
||||
from telethon.sync import TelegramClient
|
||||
from telethon.errors import ChannelInvalidError
|
||||
from telethon.tl.types import PeerUser, PeerChat, PeerChannel
|
||||
|
@ -9,7 +8,9 @@ from loguru import logger
|
|||
from tqdm import tqdm
|
||||
import re, time, json, os
|
||||
|
||||
from media import Media
|
||||
from . import Archiverv2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
|
||||
class TelethonArchiver(Archiverv2):
|
|
@ -5,9 +5,9 @@ import uuid
|
|||
import tiktok_downloader
|
||||
from loguru import logger
|
||||
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from .archiver import Archiverv2
|
||||
from . import Archiverv2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
|
||||
class TiktokArchiver(Archiverv2):
|
|
@ -7,10 +7,10 @@ from loguru import logger
|
|||
from pytwitter import Api
|
||||
from slugify import slugify
|
||||
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from . import Archiverv2
|
||||
from .twitter_archiverv2 import TwitterArchiver
|
||||
from .archiver import Archiverv2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
|
||||
class TwitterApiArchiver(TwitterArchiver, Archiverv2):
|
|
@ -4,12 +4,12 @@ import json
|
|||
import os
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from metadata import Metadata
|
||||
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
||||
from archivers import Archiverv2
|
||||
from media import Media
|
||||
from slugify import slugify
|
||||
|
||||
from . import Archiverv2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
class TwitterArchiver(Archiverv2):
|
||||
"""
|
|
@ -1,10 +1,10 @@
|
|||
from loguru import logger
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
from utils.misc import dump_payload
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from .archiver import Archiverv2
|
||||
from ..utils.misc import dump_payload
|
||||
from . import Archiverv2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
|
||||
class VkArchiver(Archiverv2):
|
|
@ -4,9 +4,9 @@ import os
|
|||
import yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from .archiver import Archiverv2
|
||||
from . import Archiverv2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
|
||||
class YoutubeDLArchiver(Archiverv2):
|
|
@ -0,0 +1,7 @@
|
|||
from .media import Media
|
||||
from .metadata import Metadata
|
||||
from .step import Step
|
||||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
# from .v2config import ConfigV2
|
|
@ -3,9 +3,11 @@ from __future__ import annotations
|
|||
from ast import List
|
||||
from typing import Any
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
import mimetypes
|
||||
|
||||
|
||||
# annotation order matters
|
||||
@dataclass_json
|
||||
@dataclass
|
||||
class Media:
|
||||
filename: str
|
|
@ -3,13 +3,15 @@ from __future__ import annotations
|
|||
from ast import List, Set
|
||||
from typing import Any, Union, Dict
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
import datetime, mimetypes
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
from dateutil.parser import parse as parse_dt
|
||||
from media import Media
|
||||
|
||||
from .media import Media
|
||||
|
||||
# annotation order matters
|
||||
@dataclass_json
|
||||
@dataclass
|
||||
class Metadata:
|
||||
status: str = "no archiver"
|
|
@ -3,14 +3,14 @@ from ast import List
|
|||
from typing import Union, Dict
|
||||
from dataclasses import dataclass
|
||||
|
||||
from archivers import Archiverv2
|
||||
from feeders import Feeder
|
||||
from formatters import Formatter
|
||||
from media import Media
|
||||
from storages import StorageV2
|
||||
from enrichers import Enricher
|
||||
from databases import Database
|
||||
from metadata import Metadata
|
||||
from ..archivers import Archiverv2
|
||||
from ..feeders import Feeder
|
||||
from ..formatters import Formatter
|
||||
from ..storages import StorageV2
|
||||
from ..enrichers import Enricher
|
||||
from ..databases import Database
|
||||
from .media import Media
|
||||
from .metadata import Metadata
|
||||
|
||||
import tempfile, time, traceback
|
||||
from loguru import logger
|
||||
|
@ -56,17 +56,6 @@ Cisticola considerations:
|
|||
|
||||
class ArchivingOrchestrator:
|
||||
def __init__(self, config) -> None:
|
||||
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
|
||||
# identify each formatter, storage, database, etc
|
||||
# self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
|
||||
|
||||
# Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI
|
||||
# where does that update/processing happen? in config.py
|
||||
# reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
|
||||
# self.archivers = [
|
||||
# Archiver.init(a, config)
|
||||
# for a in config.archivers
|
||||
# ]
|
||||
self.feeder: Feeder = config.feeder
|
||||
self.formatter: Formatter = config.formatter
|
||||
self.enrichers = config.enrichers
|
||||
|
@ -76,50 +65,32 @@ class ArchivingOrchestrator:
|
|||
|
||||
for a in self.archivers: a.setup()
|
||||
|
||||
self.formatters = []
|
||||
# self.formatters = [
|
||||
# Formatter.init(f, config)
|
||||
# for f in config.formatters
|
||||
# ]
|
||||
|
||||
# self.storages = [
|
||||
# Storage.init(s, config)
|
||||
# for s in config.storages
|
||||
# ]
|
||||
|
||||
# self.databases = [
|
||||
# Database.init(f, config)
|
||||
# for f in config.formatters
|
||||
# ]
|
||||
|
||||
# these rules are checked in config.py
|
||||
# assert len(archivers) > 1, "there needs to be at least one Archiver"
|
||||
|
||||
def feed(self) -> list(Metadata):
|
||||
def feed(self) -> None:
|
||||
for item in self.feeder:
|
||||
print("ARCHIVING", item)
|
||||
try:
|
||||
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
|
||||
item.set_tmp_dir(tmp_dir)
|
||||
result = self.archive(item)
|
||||
print(result)
|
||||
except KeyboardInterrupt:
|
||||
# catches keyboard interruptions to do a clean exit
|
||||
logger.warning(f"caught interrupt on {item=}")
|
||||
for d in self.databases: d.aborted(item)
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
||||
for d in self.databases: d.failed(item)
|
||||
self.feed_item(item)
|
||||
|
||||
print("holding on 5s")
|
||||
time.sleep(5)
|
||||
def feed_item(self, item: Metadata) -> Metadata:
    """Archive a single item inside a temporary working directory.

    A temporary directory is created under the current directory, registered
    on ``item`` via ``set_tmp_dir`` and removed again once ``self.archive``
    has finished.

    Args:
        item: the entry to archive.

    Returns:
        Whatever ``self.archive(item)`` returns, or ``None`` when an
        unexpected exception was caught (in which case every database in
        ``self.databases`` has been notified through ``failed``).

    On ``KeyboardInterrupt`` every database is notified through ``aborted``
    and the process exits.
    """
    print("ARCHIVING", item)
    # Bind up-front: if the `except Exception` branch handles a failure the
    # assignment inside `try` never happens, and the final `return` would
    # otherwise raise UnboundLocalError.
    result = None
    try:
        with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
            item.set_tmp_dir(tmp_dir)
            result = self.archive(item)
    except KeyboardInterrupt:
        # catches keyboard interruptions to do a clean exit
        logger.warning(f"caught interrupt on {item=}")
        for d in self.databases:
            d.aborted(item)
        exit()
    except Exception as e:
        logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
        for d in self.databases:
            d.failed(item)

    # how does this handle the parameters like folder which can be different for each archiver?
    # the storage needs to know where to archive!!
    # solution: feeders have context: extra metadata that they can read or ignore,
    # all of it should have sensible defaults (eg: folder)
    # default feeder is a list with 1 element
    return result
|
||||
|
||||
# how does this handle the parameters like folder which can be different for each archiver?
|
||||
# the storage needs to know where to archive!!
|
||||
# solution: feeders have context: extra metadata that they can read or ignore,
|
||||
# all of it should have sensible defaults (eg: folder)
|
||||
# default feeder is a list with 1 element
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
url = result.get_url()
|
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||
from dataclasses import dataclass, field
|
||||
from inspect import ClassFoundException
|
||||
from typing import Type
|
||||
from metadata import Metadata
|
||||
from ..core import Metadata
|
||||
from abc import ABC
|
||||
# from collections.abc import Iterable
|
||||
|
|
@ -3,15 +3,16 @@
|
|||
import argparse, yaml
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
from archivers import Archiverv2
|
||||
from feeders import Feeder
|
||||
from databases import Database
|
||||
from formatters import Formatter
|
||||
from storages import StorageV2
|
||||
from steps.step import Step
|
||||
from enrichers import Enricher
|
||||
from collections import defaultdict
|
||||
|
||||
from ..archivers import Archiverv2
|
||||
from ..feeders import Feeder
|
||||
from ..databases import Database
|
||||
from ..formatters import Formatter
|
||||
from ..storages import StorageV2
|
||||
from . import Step
|
||||
from ..enrichers import Enricher
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConfigV2:
|
||||
|
@ -37,17 +38,22 @@ class ConfigV2:
|
|||
self.defaults = {}
|
||||
self.cli_ops = {}
|
||||
self.config = {}
|
||||
|
||||
# TODO: make this work for nested props like gsheet_feeder.columns.url = "URL"
|
||||
def parse(self):
|
||||
# 1. parse CLI values
|
||||
parser = argparse.ArgumentParser(
|
||||
# prog = "auto-archiver",
|
||||
description="Auto Archiver is a ...!",
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
)
|
||||
|
||||
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
|
||||
def parse(self, use_cli=True, yaml_config_filename: str = None):
|
||||
"""
|
||||
if yaml_config_filename is provided, the --config argument is ignored,
|
||||
useful for library usage when the config values are preloaded
|
||||
"""
|
||||
# 1. parse CLI values
|
||||
if use_cli:
|
||||
parser = argparse.ArgumentParser(
|
||||
# prog = "auto-archiver",
|
||||
description="Auto Archiver is a ...!", # TODO: update
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
)
|
||||
|
||||
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
|
||||
|
||||
for configurable in self.configurable_parents:
|
||||
child: Step
|
||||
|
@ -57,28 +63,32 @@ class ConfigV2:
|
|||
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||
assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||
config_path = f"{child.name}.{config}"
|
||||
try:
|
||||
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
|
||||
except argparse.ArgumentError:
|
||||
# captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver
|
||||
pass
|
||||
|
||||
if use_cli:
|
||||
try:
|
||||
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
|
||||
except argparse.ArgumentError:
|
||||
# captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver
|
||||
pass
|
||||
|
||||
self.defaults[config_path] = details["default"]
|
||||
if "cli_set" in details:
|
||||
self.cli_ops[config_path] = details["cli_set"]
|
||||
|
||||
args = parser.parse_args()
|
||||
if use_cli:
|
||||
args = parser.parse_args()
|
||||
yaml_config_filename = yaml_config_filename or getattr(args, "config")
|
||||
else: args = {}
|
||||
|
||||
# 2. read YAML config file
|
||||
with open(args.config, "r", encoding="utf-8") as inf:
|
||||
self.yaml_config = yaml.safe_load(inf)
|
||||
# 2. read YAML config file (or use provided value)
|
||||
self.yaml_config = self.read_yaml(yaml_config_filename)
|
||||
|
||||
# print(f"{self.yaml_config.get('configurations', {})=}")
|
||||
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
|
||||
self.config = defaultdict(dict)
|
||||
for config_path, default in self.defaults.items():
|
||||
child, config = tuple(config_path.split("."))
|
||||
val = getattr(args, config_path)
|
||||
val = getattr(args, config_path, None)
|
||||
if val is not None and config_path in self.cli_ops:
|
||||
val = self.cli_ops[config_path](val, default)
|
||||
if val is None:
|
||||
|
@ -108,5 +118,6 @@ class ConfigV2:
|
|||
print("storages", [e for e in self.storages])
|
||||
print("formatter", self.formatter)
|
||||
|
||||
def validate(self):
    """Placeholder validation hook; currently performs no checks and returns None."""
    pass
|
||||
def read_yaml(self, yaml_filename: str) -> dict:
    """Load a YAML configuration file and return its contents.

    Args:
        yaml_filename: path of the YAML file, read as UTF-8.

    Returns:
        The parsed document. An empty (or otherwise falsy) document yields
        an empty dict, so callers can use ``.get(...)`` on the result.

    Raises:
        OSError: if the file cannot be opened.
        yaml.YAMLError: if the content is not valid YAML.
    """
    with open(yaml_filename, "r", encoding="utf-8") as inf:
        # safe_load returns None for an empty file; normalise to a dict
        return yaml.safe_load(inf) or {}
|
|
@ -0,0 +1,3 @@
|
|||
from .database import Database
|
||||
from .gsheet_db import GsheetsDb
|
||||
from .console_db import ConsoleDb
|
|
@ -0,0 +1,32 @@
|
|||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
|
||||
|
||||
class ConsoleDb(Database):
    """
    Outputs results to the console

    Every lifecycle callback logs the received item through loguru at a
    level matching the event (warning / error / success).
    """
    name = "console_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This step exposes no configuration options."""
        return {}

    def started(self, item: Metadata) -> None:
        """Log that archiving of `item` has started."""
        logger.warning(f"STARTED {item}")

    def failed(self, item: Metadata) -> None:
        """Log that archiving of `item` failed."""
        logger.error(f"FAILED {item}")

    def aborted(self, item: Metadata) -> None:
        """Log that archiving of `item` was aborted."""
        logger.warning(f"ABORTED {item}")

    def done(self, item: Metadata) -> None:
        """archival result ready - should be saved to DB"""
        logger.success(f"DONE {item}")
|
|
@ -2,8 +2,8 @@ from __future__ import annotations
|
|||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from typing import Union
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
|
||||
|
||||
@dataclass
|
|
@ -5,11 +5,11 @@ import gspread, datetime
|
|||
from loguru import logger
|
||||
|
||||
# from . import Enricher
|
||||
from databases import Database
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from steps.gsheet import Gsheets
|
||||
from utils import GWorksheet
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..utils import Gsheets
|
||||
from ..utils import GWorksheet
|
||||
|
||||
|
||||
class GsheetsDb(Database):
|
||||
|
@ -91,7 +91,7 @@ class GsheetsDb(Database):
|
|||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
    """Return the worksheet and row stored under the item's "gsheet" entry.

    Reads ``item.get("gsheet")`` and returns its "worksheet" and "row"
    values as a ``(worksheet, row)`` tuple.
    """
    gw: GWorksheet = item.get("gsheet").get("worksheet")
    row: int = item.get("gsheet").get("row")
    # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
    return gw, row
|
|
@ -1,8 +1,8 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
|
||||
@dataclass
|
||||
class Enricher(Step, ABC):
|
|
@ -1,11 +1,12 @@
|
|||
import hashlib
|
||||
from utils import Webdriver
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
import time, requests
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver
|
||||
from ..core import Metadata
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
|
||||
"""
|
|
@ -1,11 +1,11 @@
|
|||
from media import Media
|
||||
from utils import Webdriver
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
import time, uuid, os
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver
|
||||
from ..core import Media
|
||||
from ..core import Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
name = "screenshot_enricher"
|
|
@ -1,10 +1,10 @@
|
|||
import uuid
|
||||
from media import Media
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
import ffmpeg, os
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Media
|
||||
from ..core import Metadata
|
||||
|
||||
class ThumbnailEnricher(Enricher):
|
||||
"""
|
|
@ -2,13 +2,13 @@ import os
|
|||
import shutil
|
||||
import subprocess
|
||||
import uuid
|
||||
from archivers.archiver import Archiverv2
|
||||
from media import Media
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
from ..core import Media
|
||||
from ..core import Metadata
|
||||
from . import Enricher
|
||||
|
||||
|
||||
class WaczEnricher(Enricher):
|
||||
"""
|
|
@ -1,9 +1,9 @@
|
|||
from archivers.archiver import Archiverv2
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
from . import Enricher
|
||||
from ..archivers import Archiverv2
|
||||
from ..core import Metadata
|
||||
|
||||
class WaybackArchiverEnricher(Enricher, Archiverv2):
|
||||
"""
|
|
@ -1,8 +1,8 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
|
||||
|
||||
@dataclass
|
|
@ -2,13 +2,13 @@ import gspread, os
|
|||
|
||||
# from metadata import Metadata
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
# from . import Enricher
|
||||
from feeders import Feeder
|
||||
from metadata import Metadata
|
||||
from steps.gsheet import Gsheets
|
||||
from utils import GWorksheet
|
||||
from slugify import slugify
|
||||
from . import Feeder
|
||||
from ..core import Metadata
|
||||
from ..utils import Gsheets
|
||||
from ..utils import GWorksheet
|
||||
|
||||
class GsheetsFeeder(Gsheets, Feeder):
|
||||
name = "gsheet_feeder"
|
|
@ -1,8 +1,8 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
|
||||
|
||||
@dataclass
|
|
@ -2,12 +2,13 @@ from __future__ import annotations
|
|||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
import mimetypes
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from formatters import Formatter
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
import uuid, os, pathlib
|
||||
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from . import Formatter
|
||||
|
||||
|
||||
@dataclass
|
||||
class HtmlFormatter(Formatter):
|
||||
|
@ -72,5 +73,6 @@ def is_audio_jinja(s: str) -> bool:
|
|||
m = mimetypes.guess_type(s)[0]
|
||||
return "audio" in (m or "")
|
||||
|
||||
|
||||
def is_media_jinja(v) -> bool:
    """Return True when `v` is a `Media` instance (helper for Jinja templates)."""
    return isinstance(v, Media)
|
|
@ -0,0 +1,9 @@
|
|||
# we need to explicitly expose the available imports here
|
||||
from .base_storage import Storage
|
||||
# from .local_storage import LocalStorage, LocalConfig
|
||||
# from .s3_storage import S3Config, S3Storage
|
||||
# from .gd_storage import GDConfig, GDStorage
|
||||
|
||||
from .storage import StorageV2
|
||||
from .s3 import S3StorageV2
|
||||
from .local import LocalStorageV2
|
|
@ -3,12 +3,13 @@ import shutil
|
|||
from typing import IO, Any
|
||||
import boto3, uuid, os, mimetypes
|
||||
from botocore.errorfactory import ClientError
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from storages import StorageV2
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..storages import StorageV2
|
||||
|
||||
|
||||
class LocalStorageV2(StorageV2):
|
||||
name = "local_storage"
|
|
@ -2,9 +2,9 @@
|
|||
from typing import IO, Any
|
||||
import boto3, uuid, os, mimetypes
|
||||
from botocore.errorfactory import ClientError
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from storages import StorageV2
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..storages import StorageV2
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
|
@ -2,9 +2,8 @@ from __future__ import annotations
|
|||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import IO, Any
|
||||
from media import Media
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
|
||||
from ..core import Media, Metadata, Step
|
||||
from loguru import logger
|
||||
import os, uuid
|
||||
from slugify import slugify
|
|
@ -2,4 +2,5 @@
|
|||
from .gworksheet import GWorksheet
|
||||
from .misc import *
|
||||
from .util import Util
|
||||
from .webdriver import Webdriver
|
||||
from .webdriver import Webdriver
|
||||
from .gsheet import Gsheets
|
|
@ -1,7 +1,7 @@
|
|||
import json, gspread
|
||||
|
||||
from loguru import logger
|
||||
from steps.step import Step
|
||||
from ..core import Step
|
||||
|
||||
|
||||
class Gsheets(Step):
|
|
@ -1,8 +1,7 @@
|
|||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
from ..core import Metadata, Step
|
||||
|
||||
#TODO: likely unused
|
||||
@dataclass
|
|
@ -1,7 +0,0 @@
|
|||
from .config import Config
|
||||
from .selenium_config import SeleniumConfig
|
||||
from .telethon_config import TelethonConfig
|
||||
from .wayback_config import WaybackConfig
|
||||
from .twitter_api_config import TwitterApiConfig
|
||||
from .vk_config import VkConfig
|
||||
from .instagram_config import InstagramConfig
|
|
@ -1,7 +0,0 @@
|
|||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class BrowsertrixConfig:
    """Settings for the browsertrix integration."""
    # whether browsertrix should be used at all
    enabled: bool
    # path to the browsertrix profile
    profile: str
    # timeout in seconds, kept as a string
    timeout_seconds: str
|
|
@ -1,309 +0,0 @@
|
|||
import argparse, yaml, json, os
|
||||
import gspread
|
||||
from loguru import logger
|
||||
from selenium import webdriver
|
||||
from dataclasses import asdict
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
from utils import GWorksheet, getattr_or
|
||||
from .wayback_config import WaybackConfig
|
||||
from .telethon_config import TelethonConfig
|
||||
from .selenium_config import SeleniumConfig
|
||||
from .vk_config import VkConfig
|
||||
from .twitter_api_config import TwitterApiConfig
|
||||
from .browsertrix_config import BrowsertrixConfig
|
||||
from .instagram_config import InstagramConfig
|
||||
from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
|
||||
|
||||
|
||||
class Config:
    """
    Controls the current execution parameters and manages API configurations
    Usage:
      c = Config() # initializes the argument parser
      c.parse() # parses the values and initializes the Services and API clients
      # you can then access the Services and APIs like 'c.s3_config'
    All the configurations available as cmd line options, when included, will
    override the configurations in the config.yaml file.
    Configurations are split between:
    1. "secrets" containing API keys for generating services - not kept in memory
    2. "execution" containing specific execution configurations
    """
    AVAILABLE_STORAGES = {"s3", "gd", "local"}

    def __init__(self):
        """Build the argument parser; nothing is parsed until `parse()` is called."""
        self.parser = self.get_argument_parser()
        self.folder = ""
        # NOTE(review): any non-empty IS_DOCKER value (including "0") is truthy here - confirm intended
        self.is_docker = bool(os.environ.get("IS_DOCKER", 0))

    def parse(self):
        """Parse the command line, then load and apply the YAML configuration file."""
        self.args = self.parser.parse_args()
        logger.success('Command line arguments parsed successfully')
        self.config_file = self.args.config
        self.read_config_yaml()
        logger.info(f'APIs and Services initialized:\n{self}')

    def read_config_yaml(self):
        """Read `self.config_file` and populate execution settings and service configs.

        Command line values (from `self.args`) take precedence over the YAML
        "execution" section. The "secrets" section is consumed to build the
        per-service config objects and then removed from `self.config`.

        Raises:
            AssertionError: if no sheet is configured or the selected storage
                has no matching entry under "secrets".
        """
        with open(self.config_file, "r", encoding="utf-8") as inf:
            # an empty YAML file parses to None; normalise so the .get() calls below work
            self.config = yaml.safe_load(inf) or {}

        self.url = getattr_or(self.args, "url", '')

        # ----------------------EXECUTION - execution configurations
        execution = self.config.get("execution", {})

        self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
        assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"

        def ensure_set(values):
            # always returns a set of non-empty strings, can receive a list or a single value
            values = values if isinstance(values, list) else [values]
            return {x for x in values if isinstance(x, str) and len(x) > 0}

        self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
        self.worksheet_block = ensure_set(execution.get("worksheet_block", []))

        self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
        self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
        self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
        if self.save_logs:
            self.set_log_files()
        self.check_if_exists = getattr(self.args, "check_if_exists") or execution.get("check_if_exists", False)

        # Column names come from config and can be overwritten by CMD
        # in the end all are considered as lower case
        config_column_names = execution.get("column_names", {})
        self.column_names = {}
        for k in GWorksheet.COLUMN_NAMES.keys():
            self.column_names[k] = getattr_or(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower()

        # selenium driver
        selenium_configs = execution.get("selenium", {})
        self.selenium_config = SeleniumConfig(
            timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)),
            window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)),
            window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height))
        )
        # string placeholder until recreate_webdriver() creates a real driver
        self.webdriver = "not initialized"

        # browsertrix config
        browsertrix_configs = execution.get("browsertrix", {})
        if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
            browsertrix_profile = os.path.abspath(browsertrix_profile)
        self.browsertrix_config = BrowsertrixConfig(
            enabled=bool(browsertrix_configs.get("enabled", False)),
            profile=browsertrix_profile,
            timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
        )

        self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")

        # ---------------------- SECRETS - APIs and service configurations
        secrets = self.config.get("secrets", {})

        # assert selected storage credentials exist
        for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]:
            assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"

        # google sheets config
        self.gsheets_client = gspread.service_account(
            filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
        )

        # facebook config
        self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None)

        # s3 config
        if "s3" in secrets:
            s3 = secrets["s3"]
            self.s3_config = S3Config(
                bucket=s3["bucket"],
                region=s3["region"],
                key=s3["key"],
                secret=s3["secret"],
                endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
                cdn_url=s3.get("cdn_url", S3Config.cdn_url),
                key_path=s3.get("key_path", S3Config.key_path),
                # argparse stores '--s3-private' under the dest 's3_private'; like the
                # other store_true flags, the CLI value wins only when it is set
                private=getattr(self.args, "s3_private", False) or s3.get("private", S3Config.private)
            )

        # GDrive config
        if "google_drive" in secrets:
            gd = secrets["google_drive"]
            self.gd_config = GDConfig(
                root_folder_id=gd.get("root_folder_id"),
                oauth_token_filename=gd.get("oauth_token_filename"),
                service_account=gd.get("service_account", GDConfig.service_account)
            )

        if "local" in secrets:
            self.local_config = LocalConfig(
                save_to=secrets["local"].get("save_to", LocalConfig.save_to),
            )

        # wayback machine config
        if "wayback" in secrets:
            self.wayback_config = WaybackConfig(
                key=secrets["wayback"]["key"],
                secret=secrets["wayback"]["secret"],
            )
        else:
            self.wayback_config = None
            logger.debug(f"'wayback' key not present in the {self.config_file=}")

        # telethon config
        if "telegram" in secrets:
            self.telegram_config = TelethonConfig(
                api_id=secrets["telegram"]["api_id"],
                api_hash=secrets["telegram"]["api_hash"],
                bot_token=secrets["telegram"].get("bot_token", None),
                session_file=secrets["telegram"].get("session_file", "./anon")
            )
        else:
            self.telegram_config = None
            logger.debug(f"'telegram' key not present in the {self.config_file=}")

        # twitter config
        if "twitter" in secrets:
            self.twitter_config = TwitterApiConfig(
                bearer_token=secrets["twitter"].get("bearer_token"),
                consumer_key=secrets["twitter"].get("consumer_key"),
                consumer_secret=secrets["twitter"].get("consumer_secret"),
                access_token=secrets["twitter"].get("access_token"),
                access_secret=secrets["twitter"].get("access_secret"),
            )
        else:
            self.twitter_config = None
            logger.debug(f"'twitter' key not present in the {self.config_file=}")

        # vk config
        if "vk" in secrets:
            self.vk_config = VkConfig(
                username=secrets["vk"]["username"],
                password=secrets["vk"]["password"],
                session_file=secrets["vk"].get("session_file", "./vk_config.v2.json")
            )
        else:
            self.vk_config = None
            logger.debug(f"'vk' key not present in the {self.config_file=}")

        # instagram config
        if "instagram" in secrets:
            self.instagram_config = InstagramConfig(
                username=secrets["instagram"]["username"],
                password=secrets["instagram"]["password"],
                session_file=secrets["instagram"].get("session_file", "instaloader.session")
            )
        else:
            self.instagram_config = None
            logger.debug(f"'instagram' key not present in the {self.config_file=}")

        # delete to prevent leaks; pop() tolerates a config file without a "secrets" section
        self.config.pop("secrets", None)

    def set_log_files(self):
        """Register one loguru file sink per level under logs/."""
        # called only when config.execution.save_logs=true
        logger.add("logs/1trace.log", level="TRACE")
        logger.add("logs/2info.log", level="INFO")
        logger.add("logs/3success.log", level="SUCCESS")
        logger.add("logs/4warning.log", level="WARNING")
        logger.add("logs/5error.log", level="ERROR")

    def get_argument_parser(self):
        """
        Creates the CMD line arguments. 'python auto_archive.py --help'
        """
        parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')

        parser.add_argument('--url', action='store', dest='url', help='single URL to archive - to use only via cli.py and not google sheets interaction')
        parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
        parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
        parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
        parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]')
        parser.add_argument('--check-if-exists', action='store_true', dest='check_if_exists', help='when possible checks if the URL has been archived before and does not archive the same URL twice [exceution.check_if_exists]')
        parser.add_argument('--save-logs', action='store_true', dest='save_logs', help='creates or appends execution logs to files logs/LEVEL.log [exceution.save_logs]')
        parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]')

        # one '--col-<name>' option per known worksheet column
        for k, v in GWorksheet.COLUMN_NAMES.items():
            help_text = f"the name of the column to FILL WITH {k} (default='{v}')"
            if k in ["url", "folder"]:
                help_text = f"the name of the column to READ {k} FROM (default='{v}')"
            parser.add_argument(f'--col-{k}', action='store', dest=k, help=help_text)

        return parser

    def set_folder(self, folder):
        """
        update the folder in each of the storages
        """
        self.folder = folder
        logger.info(f"setting folder to {folder}")
        # s3
        if hasattr(self, "s3_config"): self.s3_config.folder = folder
        if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
        # gdrive
        if hasattr(self, "gd_config"): self.gd_config.folder = folder
        if hasattr(self, "gd_storage"): self.gd_storage.folder = folder
        # local
        if hasattr(self, "local_config"): self.local_config.folder = folder
        if hasattr(self, "local_storage"): self.local_storage.folder = folder

    def get_storage(self):
        """
        returns the configured type of storage, creating if needed

        Raises:
            ValueError: if `self.storage` is not one of AVAILABLE_STORAGES.
        """
        # each storage is built only on first use and then reused
        if self.storage == "s3":
            if getattr(self, "s3_storage", None) is None:
                self.s3_storage = S3Storage(self.s3_config)
            return self.s3_storage
        elif self.storage == "gd":
            if getattr(self, "gd_storage", None) is None:
                self.gd_storage = GDStorage(self.gd_config)
            return self.gd_storage
        elif self.storage == "local":
            if getattr(self, "local_storage", None) is None:
                self.local_storage = LocalStorage(self.local_config)
            return self.local_storage
        # a plain string cannot be raised in Python 3; raise a real exception
        raise ValueError(f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}")

    def destroy_webdriver(self):
        """Close and quit the current webdriver, if a real one exists, and drop the attribute."""
        # the attribute may be missing (already destroyed) or still the string placeholder
        current = getattr(self, "webdriver", None)
        if current is not None and not isinstance(current, str):
            current.close()
            current.quit()
            del self.webdriver

    def recreate_webdriver(self):
        """Create a new headless Firefox webdriver and replace the current one.

        A TimeoutException during creation or setup is logged, not raised.
        """
        options = webdriver.FirefoxOptions()
        options.headless = True
        options.set_preference('network.protocol-handler.external.tg', False)
        try:
            new_webdriver = webdriver.Firefox(options=options)
            # only destroy if creation is successful
            self.destroy_webdriver()
            self.webdriver = new_webdriver
            self.webdriver.set_window_size(self.selenium_config.window_width,
                                           self.selenium_config.window_height)
            self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
        except TimeoutException as e:
            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")

    def __str__(self) -> str:
        """Return a JSON summary of the configuration; services are reported as present/absent."""
        return json.dumps({
            "config_file": self.config_file,
            "sheet": self.sheet,
            "worksheet_allow": list(self.worksheet_allow),
            "worksheet_block": list(self.worksheet_block),
            "storage": self.storage,
            "header": self.header,
            "check_if_exists": self.check_if_exists,
            "hash_algorithm": self.hash_algorithm,
            "browsertrix_config": asdict(self.browsertrix_config),
            "save_logs": self.save_logs,
            "selenium_config": asdict(self.selenium_config),
            # getattr: the attribute is removed by destroy_webdriver()
            "selenium_webdriver": getattr(self, "webdriver", None) is not None,
            "s3_config": hasattr(self, "s3_config"),
            "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None),
            "gd_config": hasattr(self, "gd_config"),
            "local_config": hasattr(self, "local_config"),
            "wayback_config": self.wayback_config is not None,
            "telegram_config": self.telegram_config is not None,
            "twitter_config": self.twitter_config is not None,
            "vk_config": self.vk_config is not None,
            "gsheets_client": self.gsheets_client is not None,
            "column_names": self.column_names,
        }, ensure_ascii=False, indent=4)
|
|
@ -1,9 +0,0 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class InstagramConfig:
    """Instagram credentials and session file location."""
    username: str
    password: str
    # path of the session file
    session_file: str
|
|
@ -1,8 +0,0 @@
|
|||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class SeleniumConfig:
    """Selenium webdriver settings; the defaults are also readable as class attributes."""
    # timeout, in seconds
    timeout_seconds: int = 120
    # browser window size
    window_width: int = 1400
    window_height: int = 2000
|
|
@ -1,10 +0,0 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class TelethonConfig:
    """Telegram (Telethon) API credentials and session file location."""
    api_id: str
    api_hash: str
    # NOTE(review): annotated str, but may be None when no bot token is configured - confirm
    bot_token: str
    # path of the session file
    session_file: str
|
|
@ -1,11 +0,0 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class TwitterApiConfig:
    """Twitter API credentials: a bearer token and/or the consumer/access key pairs."""
    bearer_token: str
    consumer_key: str
    consumer_secret: str
    access_token: str
    access_secret: str
|
|
@ -1,9 +0,0 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class VkConfig:
    """VK credentials and session file location."""
    username: str
    password: str
    # path of the session file
    session_file: str
|
|
@ -1,8 +0,0 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class WaybackConfig:
    """Wayback Machine API key pair."""
    key: str
    secret: str
|
|
@ -1,2 +0,0 @@
|
|||
from .database import Database
|
||||
from .gsheet_db import GsheetsDb
|
|
@ -1,9 +0,0 @@
|
|||
# we need to explicitly expose the available imports here
|
||||
from .base_storage import Storage
|
||||
from .local_storage import LocalStorage, LocalConfig
|
||||
from .s3_storage import S3Config, S3Storage
|
||||
from .gd_storage import GDConfig, GDStorage
|
||||
|
||||
from .storage import StorageV2
|
||||
from .s3 import S3StorageV2
|
||||
from .local import LocalStorageV2
|
|
@ -1,36 +0,0 @@
|
|||
import os
|
||||
|
||||
from dataclasses import dataclass
|
||||
from loguru import logger
|
||||
|
||||
from .base_storage import Storage
|
||||
from utils import mkdir_if_not_exists
|
||||
|
||||
|
||||
@dataclass
class LocalConfig:
    """Settings for local filesystem storage."""
    # sub-folder, inside save_to, that files are stored under
    folder: str = ""
    # base directory for saved files
    save_to: str = "./"
|
||||
|
||||
class LocalStorage(Storage):
    """Storage backend that writes files under a local directory."""

    def __init__(self, config:LocalConfig):
        """Remember the configured folder/base path and create the base directory."""
        self.folder = config.folder
        self.save_to = config.save_to
        mkdir_if_not_exists(self.save_to)

    def get_cdn_url(self, key):
        """Return the absolute local path for `key`.

        Side effect: creates the parent directory of that path if missing.
        """
        key = self.clean_key(key)
        logger.info(f"{key=}")
        full_path = os.path.join(self.save_to, self.folder, key)
        logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}")
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        return os.path.abspath(full_path)

    def exists(self, key):
        """Return True when a file already exists at the path for `key`."""
        return os.path.isfile(self.get_cdn_url(key))

    def uploadf(self, file, key, **kwargs):
        """Write the full content of the binary file object `file` to the path for `key`."""
        path = self.get_cdn_url(key)
        with open(path, "wb") as outf:
            outf.write(file.read())
|
12
src/v2.py
12
src/v2.py
|
@ -1,12 +0,0 @@
|
|||
|
||||
|
||||
from abc import ABC
|
||||
from configs.v2config import ConfigV2
|
||||
from orchestrator import ArchivingOrchestrator
|
||||
|
||||
# Build and parse the configuration, then hand it to the orchestrator and start feeding items.
config = ConfigV2()
config.parse()

orchestrator = ArchivingOrchestrator(config)

orchestrator.feed()
|
Ładowanie…
Reference in New Issue