kopia lustrzana https://github.com/bellingcat/auto-archiver
split into multiple files MVP
rodzic
009c0dd8ca
commit
f3ce226665
|
@ -1,7 +1,8 @@
|
|||
tmp/
|
||||
.env
|
||||
.env*
|
||||
.DS_Store
|
||||
expmt/
|
||||
service_account.json
|
||||
__pycache__/
|
||||
._*
|
||||
anu.html
|
1
Pipfile
1
Pipfile
|
@ -10,7 +10,6 @@ python-dotenv = "*"
|
|||
youtube_dl = "*"
|
||||
argparse = "*"
|
||||
beautifulsoup4 = "*"
|
||||
nordvpn-switcher = "*"
|
||||
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
|
||||
bs4 = "*"
|
||||
loguru = "*"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060"
|
||||
"sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
|
@ -93,6 +93,14 @@
|
|||
],
|
||||
"version": "==1.2.58"
|
||||
},
|
||||
"faker": {
|
||||
"hashes": [
|
||||
"sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b",
|
||||
"sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==13.0.0"
|
||||
},
|
||||
"ffmpeg-python": {
|
||||
"hashes": [
|
||||
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
|
||||
|
@ -180,73 +188,6 @@
|
|||
"index": "pypi",
|
||||
"version": "==0.6.0"
|
||||
},
|
||||
"lxml": {
|
||||
"hashes": [
|
||||
"sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
|
||||
"sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
|
||||
"sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
|
||||
"sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
|
||||
"sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
|
||||
"sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
|
||||
"sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
|
||||
"sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
|
||||
"sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
|
||||
"sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
|
||||
"sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
|
||||
"sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
|
||||
"sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
|
||||
"sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
|
||||
"sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
|
||||
"sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
|
||||
"sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
|
||||
"sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
|
||||
"sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
|
||||
"sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
|
||||
"sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
|
||||
"sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
|
||||
"sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
|
||||
"sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
|
||||
"sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
|
||||
"sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
|
||||
"sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
|
||||
"sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
|
||||
"sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
|
||||
"sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
|
||||
"sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
|
||||
"sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
|
||||
"sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
|
||||
"sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
|
||||
"sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
|
||||
"sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
|
||||
"sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
|
||||
"sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
|
||||
"sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
|
||||
"sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
|
||||
"sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
|
||||
"sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
|
||||
"sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
|
||||
"sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
|
||||
"sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
|
||||
"sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
|
||||
"sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
|
||||
"sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
|
||||
"sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
|
||||
"sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
|
||||
"sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
|
||||
"sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
|
||||
"sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
|
||||
"sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
|
||||
"sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
|
||||
"sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
|
||||
"sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
|
||||
"sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
|
||||
"sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
|
||||
"sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
|
||||
"sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==4.8.0"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
|
||||
|
@ -293,14 +234,6 @@
|
|||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.0"
|
||||
},
|
||||
"nordvpn-switcher": {
|
||||
"hashes": [
|
||||
"sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de",
|
||||
"sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.2.9"
|
||||
},
|
||||
"oauthlib": {
|
||||
"hashes": [
|
||||
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
|
||||
|
@ -309,59 +242,6 @@
|
|||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.2.0"
|
||||
},
|
||||
"pathlib": {
|
||||
"hashes": [
|
||||
"sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f"
|
||||
],
|
||||
"version": "==1.0.1"
|
||||
},
|
||||
"psutil": {
|
||||
"hashes": [
|
||||
"sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5",
|
||||
"sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a",
|
||||
"sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4",
|
||||
"sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841",
|
||||
"sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d",
|
||||
"sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d",
|
||||
"sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0",
|
||||
"sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845",
|
||||
"sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf",
|
||||
"sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b",
|
||||
"sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07",
|
||||
"sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618",
|
||||
"sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2",
|
||||
"sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd",
|
||||
"sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666",
|
||||
"sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce",
|
||||
"sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3",
|
||||
"sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d",
|
||||
"sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25",
|
||||
"sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492",
|
||||
"sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b",
|
||||
"sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d",
|
||||
"sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2",
|
||||
"sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203",
|
||||
"sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2",
|
||||
"sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94",
|
||||
"sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9",
|
||||
"sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64",
|
||||
"sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56",
|
||||
"sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3",
|
||||
"sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c",
|
||||
"sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==5.9.0"
|
||||
},
|
||||
"py-mini-racer": {
|
||||
"hashes": [
|
||||
"sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57",
|
||||
"sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2",
|
||||
"sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab",
|
||||
"sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11"
|
||||
],
|
||||
"version": "==0.6.0"
|
||||
},
|
||||
"pyasn1": {
|
||||
"hashes": [
|
||||
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
|
||||
|
@ -422,13 +302,6 @@
|
|||
"index": "pypi",
|
||||
"version": "==0.19.2"
|
||||
},
|
||||
"random-user-agent": {
|
||||
"hashes": [
|
||||
"sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac",
|
||||
"sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012"
|
||||
],
|
||||
"version": "==1.0.1"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
|
|
|
@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
|
|||
|
||||
[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
|
||||
|
||||
[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
|
||||
|
||||
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
|
||||
|
||||
```
|
||||
|
|
390
archivers.py
390
archivers.py
|
@ -1,390 +0,0 @@
|
|||
from dataclasses import dataclass
|
||||
import youtube_dl
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import tiktok_downloader
|
||||
from loguru import logger
|
||||
import os
|
||||
import datetime
|
||||
import ffmpeg
|
||||
from botocore.errorfactory import ClientError
|
||||
import time
|
||||
import traceback
|
||||
|
||||
# TODO There should be a better way of generating keys, that adds the following info:
|
||||
# - name of sheet that it is being archived from
|
||||
# (this means we might archive the same media twice on different sheets, but that's OK I think)
|
||||
# - name of archiver/platform that the video comes from
|
||||
# This should make it easier to maintain and clean the archive later
|
||||
|
||||
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
|
||||
# cleaned up? The difficulty is that we don't know the filename until the archivers start working.
|
||||
|
||||
|
||||
def get_cdn_url(key):
    """Return the public CDN URL for ``key`` in the Digital Ocean space.

    The bucket and region are read from the ``DO_BUCKET`` and
    ``DO_SPACES_REGION`` environment variables each time this is called.
    """
    bucket = os.getenv('DO_BUCKET')
    region = os.getenv('DO_SPACES_REGION')
    return f'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}'
|
||||
|
||||
|
||||
def do_s3_upload(s3_client, f, key):
    """Upload the file object ``f`` to the ``DO_BUCKET`` bucket under ``key``.

    The object is stored with a ``public-read`` ACL.
    """
    bucket = os.getenv('DO_BUCKET')
    extra_args = {'ACL': 'public-read'}
    s3_client.upload_fileobj(f, Bucket=bucket, Key=key, ExtraArgs=extra_args)
|
||||
|
||||
|
||||
def get_key(filename):
    """Derive the storage key from a local path shaped like ``<dir>/<name>``.

    The key is the second ``/``-separated component of ``filename``, with any
    ``unknown_video`` marker replaced by ``jpg``.
    """
    path_parts = filename.split('/')
    # str.replace leaves the name untouched when the marker is absent.
    return path_parts[1].replace('unknown_video', 'jpg')
|
||||
|
||||
|
||||
def get_thumbnails(filename, s3_client, duration=None):
    """Generate JPEG thumbnails for a video with ffmpeg and upload them.

    Frames are sampled from ``filename``, scaled to a width of 512, written
    as ``out%d.jpg`` into a directory named after ``filename`` (everything
    before its first ``.``), uploaded one by one and removed locally. An
    ``index.html`` page embedding every uploaded frame is uploaded as well.

    Args:
        filename: local video path; must look like ``<dir>/<name>.<ext>``
            because upload keys are built from its second ``/``-separated
            component.
        s3_client: client object providing ``upload_fileobj``.
        duration: video length in seconds (any value ``float()`` accepts),
            or ``None`` when unknown.

    Returns:
        A ``(thumbnail_cdn_url, index_page_cdn_url)`` tuple, or
        ``('None', 'None')`` when no ``.jpg`` frame was produced.
    """
    out_dir = filename.split('.')[0]
    key_prefix = filename.split('/')[1].split('.')[0]

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # Default sampling rate, used when the duration is unknown or unusable.
    fps = 0.5
    if duration is not None:
        duration = float(duration)
        # Aim for roughly 10/20/40 frames depending on the length. A
        # non-positive duration would divide by zero (or give a negative
        # rate), so the default is kept in that case.
        if duration > 0:
            if duration < 60:
                fps = 10.0 / duration
            elif duration < 120:
                fps = 20.0 / duration
            else:
                fps = 40.0 / duration

    stream = ffmpeg.input(filename)
    stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
    stream.output(out_dir + '/out%d.jpg').run()

    cdn_urls = []
    for fname in os.listdir(out_dir + '/'):
        if fname[-3:] != 'jpg':
            continue

        thumbnail_filename = out_dir + '/' + fname
        key = key_prefix + '/' + fname

        with open(thumbnail_filename, 'rb') as f:
            do_s3_upload(s3_client, f, key)

        cdn_urls.append(get_cdn_url(key))
        os.remove(thumbnail_filename)

    if not cdn_urls:
        return ('None', 'None')

    # Use the frame one tenth of the way into the listing as the preview.
    key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

    images = ''.join(f'<img src="{t}" />' for t in cdn_urls)
    index_page = (
        f'<html><head><title>{filename}</title></head>\n<body>'
        + images
        + '</body></html>'
    )

    index_fname = out_dir + '/index.html'
    with open(index_fname, 'w') as f:
        f.write(index_page)

    thumb_index = key_prefix + '/index.html'

    # Open through a context manager so the handle is closed after the upload.
    with open(index_fname, 'rb') as f:
        s3_client.upload_fileobj(
            f,
            Bucket=os.getenv('DO_BUCKET'),
            Key=thumb_index,
            ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'},
        )

    return (key_thumb, get_cdn_url(thumb_index))
|
||||
|
||||
|
||||
@dataclass
class ArchiveResult:
    """Outcome of one archiving attempt; only ``status`` is required."""

    # Outcome label, e.g. 'success' or 'already archived'.
    status: str
    # URL of the archived copy.
    cdn_url: str = None
    # URL of the thumbnail chosen as preview.
    thumbnail: str = None
    # URL of the page listing all thumbnails.
    thumbnail_index: str = None
    # Media length in seconds.
    duration: float = None
    title: str = None
    # NOTE(review): the archivers in this file pass strings or epoch numbers
    # here rather than datetime objects — confirm the intended type.
    timestamp: datetime.datetime = None
|
||||
|
||||
|
||||
class Archiver:
    """Common base of the archivers; keeps the S3 client they upload with."""

    def __init__(self, s3_client):
        """Store ``s3_client`` as ``self.s3``."""
        self.s3 = s3_client

    def download(self, url):
        """Placeholder to be overridden; does nothing and returns ``None``."""
        return None
|
||||
|
||||
|
||||
class TelegramArchiver(Archiver):
    """Archive the video attached to a Telegram (``t.me``) post.

    The post's embed page (``?embed=1``) is fetched and scraped for a
    ``<video>`` tag; the video is downloaded to ``tmp/``, uploaded, and
    thumbnails are generated from it.
    """

    @staticmethod
    def _parse_duration(text):
        """Convert ``"12"``, ``"1:05"`` or ``"1:02:03"`` style text to seconds."""
        seconds = 0.0
        # Fold left to right so every extra leading field multiplies by 60.
        for part in text.split(':'):
            seconds = seconds * 60 + float(part)
        return seconds

    def download(self, url, check_if_exists=False):
        """Archive the video of the post at ``url``.

        Args:
            url: address of the Telegram post.
            check_if_exists: when true, skip the upload if an object with the
                same key is already in the bucket.

        Returns:
            ``False`` when ``url`` is not a ``t.me`` link or no downloadable
            video is found, otherwise an ``ArchiveResult``.
        """
        # detect URLs that we definitely cannot handle
        if 'http://t.me/' not in url and 'https://t.me/' not in url:
            return False

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        status = "success"

        original_url = url
        if not url.endswith("?embed=1"):
            url += "?embed=1"

        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        video = soup.find("video")

        if video is None:
            return False  # could not find video

        video_url = video.get('src')
        if not video_url:
            return False  # a <video> tag without a source cannot be fetched

        key = video_url.split('/')[-1].split('?')[0]
        filename = 'tmp/' + key
        cdn_url = get_cdn_url(key)

        if check_if_exists:
            try:
                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
                # file exists
                status = 'already archived'
            except ClientError:
                pass

        # The video is fetched even when already archived, because the
        # thumbnails below are generated from the local copy.
        response = requests.get(video_url, headers=headers)
        with open(filename, 'wb') as f:
            f.write(response.content)

        if status != 'already archived':
            with open(filename, 'rb') as f:
                do_s3_upload(self.s3, f, key)

        # extract duration and post time from HTML; tolerate pages lacking them
        time_tags = soup.find_all('time')
        duration = None
        if time_tags and time_tags[0].contents:
            duration = self._parse_duration(time_tags[0].contents[0])
        timestamp = time_tags[1].get('datetime') if len(time_tags) > 1 else None

        # process thumbnails
        key_thumb, thumb_index = get_thumbnails(
            filename, self.s3, duration=duration)
        os.remove(filename)

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=original_url, timestamp=timestamp)
|
||||
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
    """Archive a single video from any page youtube_dl can extract."""

    # URL prefixes for which the optional FB_COOKIE header is sent.
    _FACEBOOK_PREFIXES = ('https://facebook.com/', 'https://www.facebook.com/')

    @staticmethod
    def _single_video(info):
        """Return the info dict of the one video in ``info``, or ``None``.

        ``info`` is returned unchanged when it has no ``entries``; a result
        with several (or zero) entries cannot be archived and gives ``None``.
        """
        if 'entries' not in info:
            return info

        entries = info['entries']
        if len(entries) > 1:
            logger.warning(
                'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
            return None
        if not entries:
            logger.warning('YoutubeDLArchiver found no video entries')
            return None
        return entries[0]

    def download(self, url, check_if_exists=False):
        """Download, upload and thumbnail the video at ``url``.

        Args:
            url: page to hand to youtube_dl.
            check_if_exists: when true, skip the upload if an object with the
                expected key is already in the bucket.

        Returns:
            ``False`` when youtube_dl finds nothing to download or the page
            does not hold exactly one video; otherwise an ``ArchiveResult``
            (with status ``"Streaming media"`` for live streams).
        """
        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}

        fb_cookie = os.getenv('FB_COOKIE')
        if url.startswith(self._FACEBOOK_PREFIXES) and fb_cookie:
            logger.info('Using Facebook cookie')
            youtube_dl.utils.std_headers['cookie'] = fb_cookie

        ydl = youtube_dl.YoutubeDL(ydl_opts)
        cdn_url = None
        status = 'success'

        try:
            info = ydl.extract_info(url, download=False)
        except youtube_dl.utils.DownloadError:
            # no video here
            return False

        if info.get('is_live'):
            logger.warning("Live streaming media, not archiving now")
            return ArchiveResult(status="Streaming media")

        if check_if_exists:
            video = self._single_video(info)
            if video is None:
                return False

            key = get_key(ydl.prepare_filename(video))
            try:
                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
                # file exists
                cdn_url = get_cdn_url(key)
                status = 'already archived'
            except ClientError:
                pass

        # sometimes this results in a different filename, so do this again
        info = self._single_video(ydl.extract_info(url, download=True))
        if info is None:
            return False

        filename = ydl.prepare_filename(info)
        if not os.path.exists(filename):
            # fall back to an .mkv name when the predicted file is missing
            filename = filename.split('.')[0] + '.mkv'

        if status != 'already archived':
            key = get_key(filename)
            cdn_url = get_cdn_url(key)
            with open(filename, 'rb') as f:
                do_s3_upload(self.s3, f, key)

        # get duration
        duration = info.get('duration')

        # get thumbnails
        key_thumb, thumb_index = get_thumbnails(
            filename, self.s3, duration=duration)
        os.remove(filename)

        # Prefer the explicit timestamp; fall back to the upload date when
        # the timestamp is absent or None.
        timestamp = info.get('timestamp')
        if timestamp is None and info.get('upload_date'):
            timestamp = datetime.datetime.strptime(
                info['upload_date'], '%Y%m%d').timestamp()

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=info.get('title'), timestamp=timestamp)
|
||||
|
||||
|
||||
class WaybackArchiver(Archiver):
    """Fallback archiver that asks the Internet Archive to save a page."""

    def __init__(self, s3_client):
        """Keep the S3 client and start an empty cache of processed URLs."""
        super().__init__(s3_client)
        self.seen_urls = {}

    def download(self, url, check_if_exists=False):
        """Submit ``url`` to the Wayback Machine and wait for the job.

        Args:
            url: page to archive.
            check_if_exists: when true, return the cached result for a URL
                this instance has already archived successfully.

        Returns:
            An ``ArchiveResult``. On success ``cdn_url`` holds the
            ``web.archive.org`` address and ``title`` the page title.
        """
        if check_if_exists and url in self.seen_urls:
            return self.seen_urls[url]

        ia_headers = {
            "Accept": "application/json",
            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
        }

        r = requests.post(
            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})

        if r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")

        job_id = r.json()['job_id']
        status_url = 'https://web.archive.org/save/status/' + job_id
        status_r = requests.get(status_url, headers=ia_headers)

        retries = 0

        # wait 90-120 seconds for the archive job to finish
        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
            time.sleep(3)

            try:
                status_r = requests.get(status_url, headers=ia_headers)
            except requests.RequestException:
                # transient network problem: back off a little and poll again
                time.sleep(1)

            retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")

        status_json = status_r.json()

        if status_json['status'] != 'success':
            # A job still pending after the retries may carry no message.
            return ArchiveResult(status='Internet Archive failed: ' + str(status_json.get('message', '')))

        archive_url = 'https://web.archive.org/web/' + \
            status_json['timestamp'] + '/' + status_json['original_url']

        # The title is best-effort: any failure falls back to a placeholder.
        try:
            r = requests.get(archive_url)
            parsed = BeautifulSoup(r.content, 'html.parser')
            title = parsed.find_all('title')[0].text
        except Exception:
            title = "Could not get title"

        result = ArchiveResult(
            status='Internet Archive fallback', cdn_url=archive_url, title=title)
        self.seen_urls[url] = result
        return result
|
||||
|
||||
|
||||
class TiktokArchiver(Archiver):
    """Archive TikTok videos through the ``tiktok_downloader`` package."""

    def download(self, url, check_if_exists=False):
        """Archive the TikTok post at ``url``.

        Args:
            url: address of the post.
            check_if_exists: when true, skip download and upload if the
                object ``tiktok_<id>.mp4`` is already in the bucket.

        Returns:
            ``False`` when ``url`` does not contain ``tiktok.com``; otherwise
            an ``ArchiveResult`` whose ``status`` describes the outcome
            (including ``'Invalid URL'`` and ``'Other Tiktok error: ...'``).
        """
        if 'tiktok.com' not in url:
            return False

        status = 'success'
        # Stays None when nothing could be downloaded, instead of being unbound.
        cdn_url = None

        try:
            info = tiktok_downloader.info_post(url)
            key = 'tiktok_' + str(info.id) + '.mp4'
            filename = 'tmp/' + key

            if check_if_exists:
                try:
                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
                    # file exists
                    cdn_url = get_cdn_url(key)
                    status = 'already archived'
                except ClientError:
                    pass

            if status != 'already archived':
                media = tiktok_downloader.snaptik(url).get_media()
                if len(media) > 0:
                    media[0].download(filename)
                    with open(filename, 'rb') as f:
                        do_s3_upload(self.s3, f, key)

                    cdn_url = get_cdn_url(key)
                else:
                    status = 'could not download media'

            # Thumbnails are best-effort; a failure must not lose the result.
            try:
                key_thumb, thumb_index = get_thumbnails(
                    filename, self.s3, duration=info.duration)
            except Exception:
                key_thumb = ''
                thumb_index = 'error creating thumbnails'

            # No local file exists when the post was already archived or the
            # media could not be downloaded.
            if os.path.exists(filename):
                os.remove(filename)

            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())

        except tiktok_downloader.Except.InvalidUrl:
            return ArchiveResult(status='Invalid URL')

        except Exception:
            error = traceback.format_exc()
            return ArchiveResult(status='Other Tiktok error: ' + str(error))
|
|
@ -0,0 +1,6 @@
|
|||
# we need to explicitly expose the available imports here
|
||||
from .base_archiver import *
|
||||
from .telegram_archiver import *
|
||||
from .tiktok_archiver import *
|
||||
from .wayback_archiver import *
|
||||
from .youtubedl_archiver import *
|
|
@ -0,0 +1,115 @@
|
|||
import os
|
||||
import ffmpeg
|
||||
from dataclasses import dataclass
|
||||
import datetime
|
||||
from loguru import logger
|
||||
|
||||
# TODO There should be a better way of generating keys, that adds the following info:
|
||||
# - name of sheet that it is being archived from
|
||||
# (this means we might archive the same media twice on different sheets, but that's OK I think)
|
||||
# - name of archiver/platform that the video comes from
|
||||
# This should make it easier to maintain and clean the archive later
|
||||
|
||||
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
|
||||
# cleaned up? The difficulty is that we don't know the filename until the archivers start working.
|
||||
|
||||
|
||||
@dataclass
class ArchiveResult:
    """Outcome of one archiving attempt; only ``status`` is required."""

    # Outcome label, e.g. 'success' or 'already archived'.
    status: str
    # URL of the archived copy.
    cdn_url: str = None
    # URL of the thumbnail chosen as preview.
    thumbnail: str = None
    # URL of the page listing all thumbnails.
    thumbnail_index: str = None
    # Media length in seconds.
    duration: float = None
    title: str = None
    # NOTE(review): declared as datetime, but callers may pass other
    # representations (e.g. ISO strings) — confirm the intended type.
    timestamp: datetime.datetime = None
|
||||
|
||||
|
||||
class Archiver:
    """Base class of the archivers: S3 client plus shared upload helpers."""

    name = "default"

    def __init__(self, s3_client):
        """Store ``s3_client`` as ``self.s3``."""
        self.s3 = s3_client

    def __str__(self):
        return self.__class__.__name__

    def download(self, url, check_if_exists=False):
        """Placeholder to be overridden; logs an error and returns ``None``."""
        logger.error("method 'download' not implemented")

    def get_cdn_url(self, key):
        """Return the public CDN URL for ``key``.

        The bucket and region come from the ``DO_BUCKET`` and
        ``DO_SPACES_REGION`` environment variables.
        """
        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)

    def do_s3_upload(self, f, key):
        """Upload file object ``f`` to ``DO_BUCKET`` under ``key`` (public-read)."""
        self.s3.upload_fileobj(f, Bucket=os.getenv(
            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})

    def get_key(self, filename):
        """Derive the storage key from a path shaped like ``<dir>/<name>``.

        The key is the second ``/``-separated component of ``filename``, with
        any ``unknown_video`` marker replaced by ``jpg``.
        """
        # Diagnostic only: goes through the logger rather than stdout.
        logger.debug(f"key base implementation: {self.name}")
        # TODO: refactor to be more manageable
        key = filename.split('/')[1]
        if 'unknown_video' in key:
            key = key.replace('unknown_video', 'jpg')
        return key

    def get_thumbnails(self, filename, duration=None):
        """Generate JPEG thumbnails for a video with ffmpeg and upload them.

        Frames are sampled from ``filename``, scaled to a width of 512,
        written as ``out%d.jpg`` into a directory named after ``filename``
        (everything before its first ``.``), uploaded one by one and removed
        locally. An ``index.html`` page embedding every uploaded frame is
        uploaded as well.

        Args:
            filename: local video path; must look like ``<dir>/<name>.<ext>``
                because upload keys use its second ``/``-separated component.
            duration: video length in seconds (any value ``float()``
                accepts), or ``None`` when unknown.

        Returns:
            A ``(thumbnail_cdn_url, index_page_cdn_url)`` tuple, or
            ``('None', 'None')`` when no ``.jpg`` frame was produced.
        """
        out_dir = filename.split('.')[0]
        key_prefix = filename.split('/')[1].split('.')[0]

        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        # Default sampling rate, used when the duration is unknown or unusable.
        fps = 0.5
        if duration is not None:
            duration = float(duration)
            # Aim for roughly 10/20/40 frames depending on the length. A
            # non-positive duration would divide by zero (or give a negative
            # rate), so the default is kept in that case.
            if duration > 0:
                if duration < 60:
                    fps = 10.0 / duration
                elif duration < 120:
                    fps = 20.0 / duration
                else:
                    fps = 40.0 / duration

        stream = ffmpeg.input(filename)
        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
        stream.output(out_dir + '/out%d.jpg').run()

        cdn_urls = []
        for fname in os.listdir(out_dir + '/'):
            if fname[-3:] != 'jpg':
                continue

            thumbnail_filename = out_dir + '/' + fname
            key = key_prefix + '/' + fname

            with open(thumbnail_filename, 'rb') as f:
                self.do_s3_upload(f, key)

            cdn_urls.append(self.get_cdn_url(key))
            os.remove(thumbnail_filename)

        if not cdn_urls:
            return ('None', 'None')

        # Use the frame one tenth of the way into the listing as the preview.
        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

        images = ''.join(f'<img src="{t}" />' for t in cdn_urls)
        index_page = (
            f'<html><head><title>{filename}</title></head>\n<body>'
            + images
            + '</body></html>'
        )

        index_fname = out_dir + '/index.html'
        with open(index_fname, 'w') as f:
            f.write(index_page)

        thumb_index = key_prefix + '/index.html'

        # Open through a context manager so the handle is closed after upload.
        with open(index_fname, 'rb') as f:
            self.s3.upload_fileobj(
                f,
                Bucket=os.getenv('DO_BUCKET'),
                Key=thumb_index,
                ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'},
            )

        return (key_thumb, self.get_cdn_url(thumb_index))
|
|
@ -0,0 +1,76 @@
|
|||
import os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from botocore.errorfactory import ClientError
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
# TODO: get_cdn_url, get_thumbnails, do_s3_upload
|
||||
|
||||
|
||||
class TelegramArchiver(Archiver):
    """Archive the video attached to a Telegram (``t.me``) post.

    The post's embed page (``?embed=1``) is fetched and scraped for a
    ``<video>`` tag; the video is downloaded to ``tmp/``, uploaded, and
    thumbnails are generated from it.
    """

    name = "telegram"

    @staticmethod
    def _parse_duration(text):
        """Convert ``"12"``, ``"1:05"`` or ``"1:02:03"`` style text to seconds."""
        seconds = 0.0
        # Fold left to right so every extra leading field multiplies by 60.
        for part in text.split(':'):
            seconds = seconds * 60 + float(part)
        return seconds

    def download(self, url, check_if_exists=False):
        """Archive the video of the post at ``url``.

        Args:
            url: address of the Telegram post.
            check_if_exists: when true, skip the upload if an object with the
                same key is already in the bucket.

        Returns:
            ``False`` when ``url`` is not a ``t.me`` link or no downloadable
            video is found, otherwise an ``ArchiveResult``.
        """
        # detect URLs that we definitely cannot handle
        if 'http://t.me/' not in url and 'https://t.me/' not in url:
            return False

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        status = "success"

        original_url = url

        # TODO: make this more resilient to user input (e.g. existing query strings)
        if not url.endswith("?embed=1"):
            url += "?embed=1"

        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        video = soup.find("video")

        if video is None:
            return False  # could not find video

        video_url = video.get('src')
        if not video_url:
            return False  # a <video> tag without a source cannot be fetched

        key = video_url.split('/')[-1].split('?')[0]
        filename = 'tmp/' + key
        cdn_url = self.get_cdn_url(key)

        if check_if_exists:
            try:
                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
                # file exists
                status = 'already archived'
            except ClientError:
                pass

        # The video is fetched even when already archived, because the
        # thumbnails below are generated from the local copy.
        response = requests.get(video_url, headers=headers)
        with open(filename, 'wb') as f:
            f.write(response.content)

        if status != 'already archived':
            with open(filename, 'rb') as f:
                self.do_s3_upload(f, key)

        # extract duration and post time from HTML; tolerate pages lacking them
        time_tags = soup.find_all('time')
        duration = None
        if time_tags and time_tags[0].contents:
            duration = self._parse_duration(time_tags[0].contents[0])
        timestamp = time_tags[1].get('datetime') if len(time_tags) > 1 else None

        # process thumbnails
        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
        os.remove(filename)

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=original_url, timestamp=timestamp)
|
|
@ -0,0 +1,68 @@
|
|||
import os, traceback
|
||||
from botocore.errorfactory import ClientError
|
||||
import tiktok_downloader
|
||||
from loguru import logger
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
# TODO: get_cdn_url, do_s3_upload, get_thumbnails
|
||||
|
||||
|
||||
class TiktokArchiver(Archiver):
    """Archive TikTok videos through the ``tiktok_downloader`` package."""

    name = "tiktok"

    def download(self, url, check_if_exists=False):
        """Archive the TikTok post at ``url``.

        Args:
            url: address of the post.
            check_if_exists: when true, skip download and upload if the
                object ``tiktok_<id>.mp4`` is already in the bucket.

        Returns:
            ``False`` when ``url`` does not contain ``tiktok.com``; otherwise
            an ``ArchiveResult`` whose ``status`` describes the outcome
            (including ``'Invalid URL'`` and ``'Other Tiktok error: ...'``).
        """
        if 'tiktok.com' not in url:
            return False

        status = 'success'
        # Stays None when nothing could be downloaded, instead of being unbound.
        cdn_url = None

        try:
            info = tiktok_downloader.info_post(url)
            key = 'tiktok_' + str(info.id) + '.mp4'
            filename = 'tmp/' + key

            if check_if_exists:
                try:
                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
                    # file exists
                    cdn_url = self.get_cdn_url(key)
                    status = 'already archived'
                except ClientError:
                    pass

            if status != 'already archived':
                media = tiktok_downloader.snaptik(url).get_media()
                if len(media) > 0:
                    media[0].download(filename)
                    with open(filename, 'rb') as f:
                        self.do_s3_upload(f, key)

                    cdn_url = self.get_cdn_url(key)
                else:
                    status = 'could not download media'

            # Thumbnails are best-effort; a failure must not lose the result.
            try:
                key_thumb, thumb_index = self.get_thumbnails(
                    filename, duration=info.duration)
            except Exception:
                key_thumb = ''
                thumb_index = 'error creating thumbnails'

            # No local file exists when the post was already archived or the
            # media could not be downloaded.
            try:
                os.remove(filename)
            except FileNotFoundError:
                logger.info(f'tmp file not found thus not deleted {filename}')

            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())

        except tiktok_downloader.Except.InvalidUrl:
            return ArchiveResult(status='Invalid URL')

        except Exception:
            error = traceback.format_exc()
            return ArchiveResult(status='Other Tiktok error: ' + str(error))
|
|
@ -0,0 +1,73 @@
|
|||
import time, requests, os
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
|
||||
class WaybackArchiver(Archiver):
    """Fallback archiver that asks web.archive.org to save a URL."""

    name = "wayback"

    def __init__(self, s3_client):
        """Store the S3 client and start with an empty per-instance result cache."""
        self.s3 = s3_client
        # url -> ArchiveResult of successful saves made through this instance
        self.seen_urls = {}

    def download(self, url, check_if_exists=False):
        """Submit `url` to the web.archive.org save endpoint and wait for the job.

        Args:
            url: the page to archive.
            check_if_exists: when true, a URL already saved successfully by
                this instance returns its cached result without a new request.

        Returns:
            An ArchiveResult. On success `cdn_url` is the web.archive.org
            snapshot URL and `status` is 'Internet Archive fallback';
            otherwise `status` describes the failure.
        """
        if check_if_exists and url in self.seen_urls:
            return self.seen_urls[url]

        ia_headers = {
            "Accept": "application/json",
            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
        }

        r = requests.post(
            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})

        if r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")

        # A 200 response without a job id cannot be polled; report a failure
        # instead of raising KeyError.
        job_id = r.json().get('job_id')
        if not job_id:
            return ArchiveResult(status="Internet archive failed")

        status_url = 'https://web.archive.org/save/status/' + job_id
        status_r = requests.get(status_url, headers=ia_headers)

        retries = 0

        # wait 90-120 seconds for the archive job to finish:
        # 30 polls * 3s, plus 1s extra for every poll whose request fails
        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
            time.sleep(3)

            try:
                status_r = requests.get(status_url, headers=ia_headers)
            except requests.exceptions.RequestException:
                # transient network error: keep the previous response and retry
                time.sleep(1)

            retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")

        status_json = status_r.json()
        job_status = status_json.get('status')

        if job_status != 'success':
            # A job still pending after the retry budget may carry no
            # 'message'; fall back to the raw status rather than KeyError.
            reason = status_json.get('message', job_status)
            return ArchiveResult(status='Internet Archive failed: ' + str(reason))

        archive_url = 'https://web.archive.org/web/' + \
            status_json['timestamp'] + '/' + status_json['original_url']

        # The title is cosmetic, so any failure fetching or parsing the
        # snapshot degrades to a placeholder instead of failing the archive.
        try:
            r = requests.get(archive_url)
            parsed = BeautifulSoup(r.content, 'html.parser')
            title = parsed.find_all('title')[0].text
        except Exception:
            title = "Could not get title"

        result = ArchiveResult(
            status='Internet Archive fallback', cdn_url=archive_url, title=title)
        self.seen_urls[url] = result
        return result
|
|
@ -0,0 +1,88 @@
|
|||
|
||||
import os
|
||||
import datetime
|
||||
import youtube_dl
|
||||
from loguru import logger
|
||||
from botocore.errorfactory import ClientError
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
    """Archiver that downloads a single video through youtube_dl."""

    name = "yotube_dl"

    @staticmethod
    def _single_video_info(info):
        """Return the info dict of the one video in `info`, or None.

        `info` is returned unchanged when it has no 'entries'. A page with
        several entries (logged) or with none yields None.
        """
        if 'entries' not in info:
            return info

        entries = info['entries']
        if len(entries) > 1:
            logger.warning(
                'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
            return None
        if not entries:
            # nothing to archive; avoids IndexError on entries[0]
            return None
        return entries[0]

    def download(self, url, check_if_exists=False):
        """Download the video at `url`, upload it to S3 and build thumbnails.

        Args:
            url: page to hand to youtube_dl.
            check_if_exists: when true, look the expected key up in the S3
                bucket first; if present the upload is skipped (the media is
                still downloaded locally for thumbnail generation).

        Returns:
            False when youtube_dl finds no video or the page does not hold
            exactly one video, otherwise an ArchiveResult.
        """
        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}

        fb_cookie = os.getenv('FB_COOKIE')
        # The original compared a 25-char slice with 'https://wwww.facebook.com/'
        # (26 chars, four w's), so www URLs never got the cookie.
        if url.startswith(('https://facebook.com/', 'https://www.facebook.com/')) and fb_cookie:
            logger.info('Using Facebook cookie')
            youtube_dl.utils.std_headers['cookie'] = fb_cookie

        ydl = youtube_dl.YoutubeDL(ydl_opts)
        cdn_url = None
        status = 'success'

        try:
            info = ydl.extract_info(url, download=False)
        except youtube_dl.utils.DownloadError:
            # no video here
            return False

        if 'is_live' in info and info['is_live']:
            logger.warning("Live streaming media, not archiving now")
            return ArchiveResult(status="Streaming media")

        if check_if_exists:
            video_info = self._single_video_info(info)
            if video_info is None:
                return False

            filename = ydl.prepare_filename(video_info)
            key = self.get_key(filename)

            try:
                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)

                # file exists
                cdn_url = self.get_cdn_url(key)
                status = 'already archived'
            except ClientError:
                # head_object failed: treat the key as not archived yet
                pass

        # sometimes this results in a different filename, so do this again
        info = self._single_video_info(ydl.extract_info(url, download=True))
        if info is None:
            return False

        filename = ydl.prepare_filename(info)

        if not os.path.exists(filename):
            # Fall back to an .mkv with the same stem. splitext strips only
            # the final extension; split('.')[0] cut at the first dot.
            filename = os.path.splitext(filename)[0] + '.mkv'

        if status != 'already archived':
            key = self.get_key(filename)
            cdn_url = self.get_cdn_url(key)

            with open(filename, 'rb') as f:
                self.do_s3_upload(f, key)

        # get duration
        duration = info.get('duration')

        # get thumbnails
        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
        os.remove(filename)

        # Prefer the exact timestamp and fall back to the upload date. Either
        # key may be present with a None value, so test values, not keys.
        timestamp = info.get('timestamp')
        if timestamp is None and info.get('upload_date'):
            timestamp = datetime.datetime.strptime(
                info['upload_date'], '%Y%m%d').timestamp()

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                             thumbnail_index=thumb_index, duration=duration,
                             title=info.get('title'), timestamp=timestamp)
|
|
@ -1,14 +1,12 @@
|
|||
from dataclasses import dataclass
|
||||
import gspread
|
||||
from pathlib import Path
|
||||
import datetime
|
||||
import boto3
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
import datetime
|
||||
import argparse
|
||||
import math
|
||||
import threading
|
||||
import gspread
|
||||
import boto3
|
||||
from loguru import logger
|
||||
from dotenv import load_dotenv
|
||||
|
||||
import archivers
|
||||
|
||||
load_dotenv()
|
||||
|
@ -156,6 +154,7 @@ def process_sheet(sheet):
|
|||
'duration')) if 'duration' in headers else None
|
||||
|
||||
|
||||
# order matters, first to succeed excludes remaining
|
||||
active_archivers = [
|
||||
archivers.TelegramArchiver(s3_client),
|
||||
archivers.TiktokArchiver(s3_client),
|
||||
|
@ -198,7 +197,7 @@ def process_sheet(sheet):
|
|||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Automatically use youtube-dl to download media from a Google Sheet")
|
||||
description="Automatically archive social media videos from a Google Sheet")
|
||||
parser.add_argument("--sheet", action="store", dest="sheet")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
Ładowanie…
Reference in New Issue