split into multiple files MVP

pull/13/head
msramalho 2022-02-21 14:19:09 +01:00
parent 009c0dd8ca
commit f3ce226665
12 changed files with 446 additions and 536 deletions

.gitignore vendored
View file

@@ -1,7 +1,8 @@
tmp/
.env
.env*
.DS_Store
expmt/
service_account.json
__pycache__/
._*
anu.html

View file

@@ -10,7 +10,6 @@ python-dotenv = "*"
youtube_dl = "*"
argparse = "*"
beautifulsoup4 = "*"
nordvpn-switcher = "*"
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
bs4 = "*"
loguru = "*"

Pipfile.lock generated
View file

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060"
"sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
},
"pipfile-spec": 6,
"requires": {
@@ -93,6 +93,14 @@
],
"version": "==1.2.58"
},
"faker": {
"hashes": [
"sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b",
"sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe"
],
"markers": "python_version >= '3.6'",
"version": "==13.0.0"
},
"ffmpeg-python": {
"hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@@ -180,73 +188,6 @@
"index": "pypi",
"version": "==0.6.0"
},
"lxml": {
"hashes": [
"sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
"sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
"sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
"sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
"sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
"sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
"sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
"sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
"sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
"sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
"sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
"sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
"sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
"sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
"sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
"sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
"sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
"sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
"sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
"sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
"sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
"sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
"sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
"sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
"sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
"sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
"sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
"sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
"sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
"sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
"sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
"sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
"sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
"sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
"sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
"sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
"sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
"sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
"sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
"sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
"sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
"sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
"sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
"sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
"sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
"sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
"sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
"sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
"sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
"sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
"sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
"sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
"sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
"sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
"sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
"sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
"sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
"sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
"sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
"sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
"sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.8.0"
},
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
@@ -293,14 +234,6 @@
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"nordvpn-switcher": {
"hashes": [
"sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de",
"sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b"
],
"index": "pypi",
"version": "==0.2.9"
},
"oauthlib": {
"hashes": [
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
@@ -309,59 +242,6 @@
"markers": "python_version >= '3.6'",
"version": "==3.2.0"
},
"pathlib": {
"hashes": [
"sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f"
],
"version": "==1.0.1"
},
"psutil": {
"hashes": [
"sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5",
"sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a",
"sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4",
"sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841",
"sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d",
"sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d",
"sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0",
"sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845",
"sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf",
"sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b",
"sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07",
"sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618",
"sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2",
"sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd",
"sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666",
"sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce",
"sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3",
"sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d",
"sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25",
"sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492",
"sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b",
"sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d",
"sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2",
"sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203",
"sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2",
"sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94",
"sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9",
"sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64",
"sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56",
"sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3",
"sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c",
"sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==5.9.0"
},
"py-mini-racer": {
"hashes": [
"sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57",
"sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2",
"sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab",
"sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11"
],
"version": "==0.6.0"
},
"pyasn1": {
"hashes": [
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
@@ -422,13 +302,6 @@
"index": "pypi",
"version": "==0.19.2"
},
"random-user-agent": {
"hashes": [
"sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac",
"sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012"
],
"version": "==1.0.1"
},
"requests": {
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",

View file

@@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to install
[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
A `.env` file is required for saving content to a DigitalOcean Space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
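# sketch only: the hunk is truncated here; these are the variables the code in
# this commit reads via os.getenv, with placeholder values
DO_BUCKET=your-spaces-bucket-name
DO_SPACES_REGION=your-spaces-region
INTERNET_ARCHIVE_S3_KEY=your-ia-s3-access-key
INTERNET_ARCHIVE_S3_SECRET=your-ia-s3-secret
# optional, only used for Facebook URLs by the youtube-dl archiver
FB_COOKIE=your-facebook-cookie-string
```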

View file

@@ -1,390 +0,0 @@
from dataclasses import dataclass
import youtube_dl
from bs4 import BeautifulSoup
import requests
import tiktok_downloader
from loguru import logger
import os
import datetime
import ffmpeg
from botocore.errorfactory import ClientError
import time
import traceback
# TODO There should be a better way of generating keys, that adds the following info:
# - name of sheet that it is being archived from
# (this means we might archive the same media twice on different sheets, but that's OK I think)
# - name of archiver/platform that the video comes from
# This should make it easier to maintain and clean the archive later
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
# cleaned up? Difficult is we don't know the filename until the archivers start working.
def get_cdn_url(key):
return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
def do_s3_upload(s3_client, f, key):
s3_client.upload_fileobj(f, Bucket=os.getenv(
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
def get_key(filename):
key = filename.split('/')[1]
if 'unknown_video' in key:
key = key.replace('unknown_video', 'jpg')
return key
def get_thumbnails(filename, s3_client, duration=None):
if not os.path.exists(filename.split('.')[0]):
os.mkdir(filename.split('.')[0])
fps = 0.5
if duration is not None:
duration = float(duration)
if duration < 60:
fps = 10.0 / duration
elif duration < 120:
fps = 20.0 / duration
else:
fps = 40.0 / duration
stream = ffmpeg.input(filename)
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
thumbnails = os.listdir(filename.split('.')[0] + '/')
cdn_urls = []
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = filename.split('.')[0] + '/' + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname
cdn_url = get_cdn_url(key)
with open(thumbnail_filename, 'rb') as f:
do_s3_upload(s3_client, f, key)
cdn_urls.append(cdn_url)
os.remove(thumbnail_filename)
if len(cdn_urls) == 0:
return ('None', 'None')
key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
index_page = f'''<html><head><title>{filename}</title></head>
<body>'''
for t in cdn_urls:
index_page += f'<img src="{t}" />'
index_page += f"</body></html>"
index_fname = filename.split('.')[0] + '/index.html'
with open(index_fname, 'w') as f:
f.write(index_page)
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
thumb_index_cdn_url = get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
@dataclass
class ArchiveResult:
status: str
cdn_url: str = None
thumbnail: str = None
thumbnail_index: str = None
duration: float = None
title: str = None
timestamp: datetime.datetime = None
class Archiver:
def __init__(self, s3_client):
self.s3 = s3_client
def download(self, url):
pass
class TelegramArchiver(Archiver):
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url:
return False
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
status = "success"
original_url = url
if url[-8:] != "?embed=1":
url += "?embed=1"
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
video = s.find("video")
if video is None:
return False # could not find video
video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0]
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
v = requests.get(video_url, headers=headers)
with open(filename, 'wb') as f:
f.write(v.content)
if status != 'already archived':
cdn_url = get_cdn_url(key)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
if ':' in duration:
duration = float(duration.split(
':')[0])*60 + float(duration.split(':')[1])
else:
duration = float(duration)
# process thumbnails
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
class YoutubeDLArchiver(Archiver):
def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts)
cdn_url = None
status = 'success'
try:
info = ydl.extract_info(url, download=False)
except youtube_dl.utils.DownloadError:
# no video here
return False
if 'is_live' in info and info['is_live']:
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
filename = ydl.prepare_filename(info['entries'][0])
else:
filename = ydl.prepare_filename(info)
key = get_key(filename)
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
# sometimes this results in a different filename, so do this again
info = ydl.extract_info(url, download=True)
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
else:
info = info['entries'][0]
filename = ydl.prepare_filename(info)
if not os.path.exists(filename):
filename = filename.split('.')[0] + '.mkv'
if status != 'already archived':
key = get_key(filename)
cdn_url = get_cdn_url(key)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
# get duration
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
class WaybackArchiver(Archiver):
def __init__(self, s3_client):
self.s3 = s3_client
self.seen_urls = {}
def download(self, url, check_if_exists=False):
if check_if_exists and url in self.seen_urls:
return self.seen_urls[url]
ia_headers = {
"Accept": "application/json",
"Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
}
r = requests.post(
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
job_id = r.json()['job_id']
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
retries = 0
# wait 90-120 seconds for the archive job to finish
while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
time.sleep(3)
try:
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
except:
time.sleep(1)
retries += 1
if status_r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
try:
r = requests.get(archive_url)
parsed = BeautifulSoup(
r.content, 'html.parser')
title = parsed.find_all('title')[
0].text
except:
title = "Could not get title"
result = ArchiveResult(
status='Internet Archive fallback', cdn_url=archive_url, title=title)
self.seen_urls[url] = result
return result
class TiktokArchiver(Archiver):
def download(self, url, check_if_exists=False):
if 'tiktok.com' not in url:
return False
status = 'success'
try:
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
media[0].download(filename)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
cdn_url = get_cdn_url(key)
else:
status = 'could not download media'
try:
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=info.duration)
except:
key_thumb = ''
thumb_index = 'error creating thumbnails'
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
except tiktok_downloader.Except.InvalidUrl:
status = 'Invalid URL'
return ArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
return ArchiveResult(status=status)

View file

@@ -0,0 +1,6 @@
# we need to explicitly expose the available imports here
from .base_archiver import *
from .telegram_archiver import *
from .tiktok_archiver import *
from .wayback_archiver import *
from .youtubedl_archiver import *

View file

@@ -0,0 +1,115 @@
import os
import ffmpeg
from dataclasses import dataclass
import datetime
from loguru import logger
# TODO There should be a better way of generating keys that adds the following info:
# - name of the sheet it is being archived from
# (this means we might archive the same media twice on different sheets, but that's OK I think)
# - name of the archiver/platform that the video comes from
# This should make it easier to maintain and clean the archive later
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
# cleaned up? The difficulty is that we don't know the filename until the archivers start working.
@dataclass
class ArchiveResult:
status: str
cdn_url: str = None
thumbnail: str = None
thumbnail_index: str = None
duration: float = None
title: str = None
timestamp: datetime.datetime = None
class Archiver:
name = "default"
def __init__(self, s3_client):
self.s3 = s3_client
def __str__(self):
return self.__class__.__name__
def download(self, url, check_if_exists=False):
logger.error("method 'download' not implemented")
def get_cdn_url(self, key):
return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
def do_s3_upload(self, f, key):
self.s3.upload_fileobj(f, Bucket=os.getenv(
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
def get_key(self, filename):
print(f"key base implementation: {self.name}")
# TODO: refactor to be more manageable
key = filename.split('/')[1]
if 'unknown_video' in key:
key = key.replace('unknown_video', 'jpg')
return key
def get_thumbnails(self, filename, duration=None):
if not os.path.exists(filename.split('.')[0]):
os.mkdir(filename.split('.')[0])
fps = 0.5
if duration is not None:
duration = float(duration)
if duration < 60:
fps = 10.0 / duration
elif duration < 120:
fps = 20.0 / duration
else:
fps = 40.0 / duration
stream = ffmpeg.input(filename)
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
thumbnails = os.listdir(filename.split('.')[0] + '/')
cdn_urls = []
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = filename.split('.')[0] + '/' + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname
cdn_url = self.get_cdn_url(key)
with open(thumbnail_filename, 'rb') as f:
self.do_s3_upload(f, key)
cdn_urls.append(cdn_url)
os.remove(thumbnail_filename)
if len(cdn_urls) == 0:
return ('None', 'None')
key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
index_page = f'''<html><head><title>{filename}</title></head>
<body>'''
for t in cdn_urls:
index_page += f'<img src="{t}" />'
index_page += f"</body></html>"
index_fname = filename.split('.')[0] + '/index.html'
with open(index_fname, 'w') as f:
f.write(index_page)
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
thumb_index_cdn_url = self.get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
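The practical upshot of this base class is that supporting a new platform should only need a small subclass. A hypothetical sketch (not part of this commit; `ExampleArchiver`, the URL check, and the key scheme are invented for illustration) of what that could look like:
```
import os
import requests

from .base_archiver import Archiver, ArchiveResult


class ExampleArchiver(Archiver):
    name = "example"  # hypothetical platform

    def download(self, url, check_if_exists=False):
        # decline URLs this archiver cannot handle, like the other archivers do
        if 'example.com' not in url:
            return False

        key = 'example_' + url.split('/')[-1] + '.mp4'
        filename = 'tmp/' + key

        # fetch the media locally (the check_if_exists branch is omitted for brevity)
        with open(filename, 'wb') as f:
            f.write(requests.get(url).content)

        # reuse the helpers inherited from Archiver
        with open(filename, 'rb') as f:
            self.do_s3_upload(f, key)
        cdn_url = self.get_cdn_url(key)
        key_thumb, thumb_index = self.get_thumbnails(filename)

        os.remove(filename)
        return ArchiveResult(status='success', cdn_url=cdn_url,
                             thumbnail=key_thumb, thumbnail_index=thumb_index)
```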

View file

@@ -0,0 +1,76 @@
import os
import requests
from bs4 import BeautifulSoup
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, get_thumbnails, do_s3_upload
class TelegramArchiver(Archiver):
name = "telegram"
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url:
return False
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
status = "success"
original_url = url
# TODO: check if we can make this more resilient to user input
if url[-8:] != "?embed=1":
url += "?embed=1"
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
video = s.find("video")
if video is None:
return False # could not find video
video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0]
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
v = requests.get(video_url, headers=headers)
with open(filename, 'wb') as f:
f.write(v.content)
if status != 'already archived':
cdn_url = self.get_cdn_url(key)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
if ':' in duration:
duration = float(duration.split(
':')[0]) * 60 + float(duration.split(':')[1])
else:
duration = float(duration)
# process thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))

View file

@@ -0,0 +1,68 @@
import os, traceback
from botocore.errorfactory import ClientError
import tiktok_downloader
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, do_s3_upload, get_thumbnails
class TiktokArchiver(Archiver):
name = "tiktok"
def download(self, url, check_if_exists=False):
if 'tiktok.com' not in url:
return False
status = 'success'
try:
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
media[0].download(filename)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
cdn_url = self.get_cdn_url(key)
else:
status = 'could not download media'
try:
key_thumb, thumb_index = self.get_thumbnails(
filename, duration=info.duration)
except:
key_thumb = ''
thumb_index = 'error creating thumbnails'
try: os.remove(filename)
except FileNotFoundError:
logger.info(f'tmp file not found, so not deleted: {filename}')
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
except tiktok_downloader.Except.InvalidUrl:
status = 'Invalid URL'
return ArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
return ArchiveResult(status=status)

View file

@@ -0,0 +1,73 @@
import time, requests, os
from bs4 import BeautifulSoup
from .base_archiver import Archiver, ArchiveResult
class WaybackArchiver(Archiver):
name = "wayback"
def __init__(self, s3_client):
self.s3 = s3_client
self.seen_urls = {}
def download(self, url, check_if_exists=False):
if check_if_exists and url in self.seen_urls:
return self.seen_urls[url]
ia_headers = {
"Accept": "application/json",
"Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
}
r = requests.post(
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
job_id = r.json()['job_id']
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
retries = 0
# wait 90-120 seconds for the archive job to finish
while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
time.sleep(3)
try:
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
except:
time.sleep(1)
retries += 1
if status_r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
try:
r = requests.get(archive_url)
parsed = BeautifulSoup(
r.content, 'html.parser')
title = parsed.find_all('title')[
0].text
except:
title = "Could not get title"
result = ArchiveResult(
status='Internet Archive fallback', cdn_url=archive_url, title=title)
self.seen_urls[url] = result
return result

View file

@@ -0,0 +1,88 @@
import os
import datetime
import youtube_dl
from loguru import logger
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
name = "yotube_dl"
def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts)
cdn_url = None
status = 'success'
try:
info = ydl.extract_info(url, download=False)
except youtube_dl.utils.DownloadError:
# no video here
return False
if 'is_live' in info and info['is_live']:
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
filename = ydl.prepare_filename(info['entries'][0])
else:
filename = ydl.prepare_filename(info)
key = self.get_key(filename)
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
# sometimes this results in a different filename, so do this again
info = ydl.extract_info(url, download=True)
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
else:
info = info['entries'][0]
filename = ydl.prepare_filename(info)
if not os.path.exists(filename):
filename = filename.split('.')[0] + '.mkv'
if status != 'already archived':
key = self.get_key(filename)
cdn_url = self.get_cdn_url(key)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
# get duration
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)

View file

@@ -1,14 +1,12 @@
from dataclasses import dataclass
import gspread
from pathlib import Path
import datetime
import boto3
import os
from dotenv import load_dotenv
import datetime
import argparse
import math
import threading
import gspread
import boto3
from loguru import logger
from dotenv import load_dotenv
import archivers
load_dotenv()
@@ -156,6 +154,7 @@ def process_sheet(sheet):
'duration')) if 'duration' in headers else None
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelegramArchiver(s3_client),
archivers.TiktokArchiver(s3_client),
@@ -198,7 +197,7 @@ def process_sheet(sheet):
def main():
parser = argparse.ArgumentParser(
description="Automatically use youtube-dl to download media from a Google Sheet")
description="Automatically archive social media videos from a Google Sheet")
parser.add_argument("--sheet", action="store", dest="sheet")
args = parser.parse_args()
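
The hunks above truncate both the boto3 client setup and the loop that consumes `active_archivers`. A rough, hypothetical sketch of how the refactored pieces fit together (the full archiver list, the loop, and the example URL are assumptions based on the "order matters, first to succeed excludes remaining" comment):
```
# hypothetical usage sketch, not part of this commit: instantiate the archivers
# from the new package and try them in order until one returns a result
import boto3

import archivers

s3_client = boto3.client('s3')  # the real script configures this for a DigitalOcean Space

active_archivers = [
    archivers.TelegramArchiver(s3_client),
    archivers.TiktokArchiver(s3_client),
    archivers.YoutubeDLArchiver(s3_client),  # order beyond the two entries shown above is assumed
    archivers.WaybackArchiver(s3_client),
]

url = "https://t.me/example/123"  # placeholder; real URLs come from the Google Sheet rows

for archiver in active_archivers:
    result = archiver.download(url, check_if_exists=True)
    if result:
        break  # the first archiver to succeed excludes the remaining ones
```
Per the argparse setup above, the script is then invoked with a `--sheet` argument naming the Google Sheet to process.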