split into multiple files MVP

msramalho 2022-02-21 14:19:09 +01:00
rodzic 009c0dd8ca
commit f3ce226665
12 zmienionych plików z 446 dodań i 536 usunięć

.gitignore vendored
Wyświetl plik

@ -1,7 +1,8 @@

Wyświetl plik

@ -10,7 +10,6 @@ python-dotenv = "*"
youtube_dl = "*"
argparse = "*"
beautifulsoup4 = "*"
nordvpn-switcher = "*"
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
bs4 = "*"
loguru = "*"

Pipfile.lock wygenerowano
Wyświetl plik

@ -1,7 +1,7 @@
"_meta": {
"hash": {
"sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060"
"sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
"pipfile-spec": 6,
"requires": {
@ -93,6 +93,14 @@
"version": "==1.2.58"
"faker": {
"hashes": [
"markers": "python_version >= '3.6'",
"version": "==13.0.0"
"ffmpeg-python": {
"hashes": [
@ -180,73 +188,6 @@
"index": "pypi",
"version": "==0.6.0"
"lxml": {
"hashes": [
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.8.0"
"markupsafe": {
"hashes": [
@ -293,14 +234,6 @@
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
"nordvpn-switcher": {
"hashes": [
"index": "pypi",
"version": "==0.2.9"
"oauthlib": {
"hashes": [
@ -309,59 +242,6 @@
"markers": "python_version >= '3.6'",
"version": "==3.2.0"
"pathlib": {
"hashes": [
"version": "==1.0.1"
"psutil": {
"hashes": [
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==5.9.0"
"py-mini-racer": {
"hashes": [
"version": "==0.6.0"
"pyasn1": {
"hashes": [
@ -422,13 +302,6 @@
"index": "pypi",
"version": "==0.19.2"
"random-user-agent": {
"hashes": [
"version": "==1.0.1"
"requests": {
"hashes": [

Wyświetl plik

@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:

Wyświetl plik

@ -1,390 +0,0 @@
from dataclasses import dataclass
import youtube_dl
from bs4 import BeautifulSoup
import requests
import tiktok_downloader
from loguru import logger
import os
import datetime
import ffmpeg
from botocore.errorfactory import ClientError
import time
import traceback
# TODO There should be a better way of generating keys, that adds the following info:
# - name of sheet that it is being archived from
# (this means we might archive the same media twice on different sheets, but that's OK I think)
# - name of archiver/platform that the video comes from
# This should make it easier to maintain and clean the archive later
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
# cleaned up? The difficulty is that we don't know the filename until the archivers start working.
def get_cdn_url(key):
    """Build the DigitalOcean Spaces CDN URL for the object stored under ``key``.

    The bucket name and region are read from the ``DO_BUCKET`` and
    ``DO_SPACES_REGION`` environment variables each time the function is called.
    """
    bucket = os.getenv('DO_BUCKET')
    region = os.getenv('DO_SPACES_REGION')
    return f'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}'
def do_s3_upload(s3_client, f, key):
    """Upload the open file object ``f`` to the ``DO_BUCKET`` bucket under ``key``.

    The object is uploaded with a ``public-read`` ACL.
    """
    bucket_name = os.getenv('DO_BUCKET')
    upload_options = {'ACL': 'public-read'}
    s3_client.upload_fileobj(f, Bucket=bucket_name, Key=key, ExtraArgs=upload_options)
def get_key(filename):
    """Derive the bucket key for a downloaded file.

    The key is the second ``/``-separated component of ``filename`` (downloads
    are written as ``tmp/<name>``). A filename without any ``/`` is used as-is
    instead of raising ``IndexError``. An ``unknown_video`` marker in the key is
    replaced with ``jpg``.
    """
    parts = filename.split('/')
    # Fall back to the bare name when there is no directory component.
    key = parts[1] if len(parts) > 1 else parts[0]
    if 'unknown_video' in key:
        key = key.replace('unknown_video', 'jpg')
    return key
def get_thumbnails(filename, s3_client, duration=None):
if not os.path.exists(filename.split('.')[0]):
fps = 0.5
if duration is not None:
duration = float(duration)
if duration < 60:
fps = 10.0 / duration
elif duration < 120:
fps = 20.0 / duration
fps = 40.0 / duration
stream = ffmpeg.input(filename)
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
thumbnails = os.listdir(filename.split('.')[0] + '/')
cdn_urls = []
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = filename.split('.')[0] + '/' + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname
cdn_url = get_cdn_url(key)
with open(thumbnail_filename, 'rb') as f:
do_s3_upload(s3_client, f, key)
if len(cdn_urls) == 0:
return ('None', 'None')
key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
index_page = f'''<html><head><title>{filename}</title></head>
for t in cdn_urls:
index_page += f'<img src="{t}" />'
index_page += f"</body></html>"
index_fname = filename.split('.')[0] + '/index.html'
with open(index_fname, 'w') as f:
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
thumb_index_cdn_url = get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
@dataclass
class ArchiveResult:
    """Outcome of one archiving attempt; only ``status`` is required.

    The archivers build this with keyword arguments, which requires the
    ``@dataclass``-generated ``__init__``.
    """
    # Free-text outcome, e.g. 'success', 'already archived', or an error message.
    status: str
    # URL of the archived media.
    cdn_url: str = None
    # URL of the thumbnail chosen to represent the media.
    thumbnail: str = None
    # URL of the HTML page listing all thumbnails.
    thumbnail_index: str = None
    # Media duration; the Telegram archiver computes it as minutes * 60 + seconds.
    duration: float = None
    title: str = None
    # NOTE(review): annotated as datetime, but the archivers in this file pass
    # ISO strings, epoch floats or HTML attribute values - confirm intended type.
    timestamp: datetime.datetime = None
class Archiver:
    """Base class for archivers; holds the S3 client that subclasses upload with."""

    def __init__(self, s3_client):
        self.s3 = s3_client

    def download(self, url, check_if_exists=False):
        """Archive ``url``; the base implementation does nothing and returns None.

        ``check_if_exists`` is accepted (and ignored here) so the signature
        matches the subclasses, which all take it as an optional flag.
        """
        return None
class TelegramArchiver(Archiver):
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url:
return False
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
status = "success"
original_url = url
if url[-8:] != "?embed=1":
url += "?embed=1"
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
video = s.find("video")
if video is None:
return False # could not find video
video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0]
filename = 'tmp/' + key
if check_if_exists:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
v = requests.get(video_url, headers=headers)
with open(filename, 'wb') as f:
if status != 'already archived':
cdn_url = get_cdn_url(key)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
if ':' in duration:
duration = float(duration.split(
':')[0])*60 + float(duration.split(':')[1])
duration = float(duration)
# process thumbnails
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
class YoutubeDLArchiver(Archiver):
def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts)
cdn_url = None
status = 'success'
info = ydl.extract_info(url, download=False)
except youtube_dl.utils.DownloadError:
# no video here
return False
if 'is_live' in info and info['is_live']:
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
filename = ydl.prepare_filename(info['entries'][0])
filename = ydl.prepare_filename(info)
key = get_key(filename)
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
# sometimes this results in a different filename, so do this again
info = ydl.extract_info(url, download=True)
if 'entries' in info:
if len(info['entries']) > 1:
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
info = info['entries'][0]
filename = ydl.prepare_filename(info)
if not os.path.exists(filename):
filename = filename.split('.')[0] + '.mkv'
if status != 'already archived':
key = get_key(filename)
cdn_url = get_cdn_url(key)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
# get duration
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
class WaybackArchiver(Archiver):
def __init__(self, s3_client):
self.s3 = s3_client
self.seen_urls = {}
def download(self, url, check_if_exists=False):
if check_if_exists and url in self.seen_urls:
return self.seen_urls[url]
ia_headers = {
"Accept": "application/json",
"Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
r = requests.post(
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
job_id = r.json()['job_id']
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
retries = 0
# wait 90-120 seconds for the archive job to finish
while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
retries += 1
if status_r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
r = requests.get(archive_url)
parsed = BeautifulSoup(
r.content, 'html.parser')
title = parsed.find_all('title')[
title = "Could not get title"
result = ArchiveResult(
status='Internet Archive fallback', cdn_url=archive_url, title=title)
self.seen_urls[url] = result
return result
class TiktokArchiver(Archiver):
def download(self, url, check_if_exists=False):
if 'tiktok.com' not in url:
return False
status = 'success'
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
filename = 'tmp/' + key
if check_if_exists:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
cdn_url = get_cdn_url(key)
status = 'could not download media'
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=info.duration)
key_thumb = ''
thumb_index = 'error creating thumbnails'
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
except tiktok_downloader.Except.InvalidUrl:
status = 'Invalid URL'
return ArchiveResult(status=status)
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
return ArchiveResult(status=status)

Wyświetl plik

@ -0,0 +1,6 @@
# we need to explicitly expose the available imports here
from .base_archiver import *
from .telegram_archiver import *
from .tiktok_archiver import *
from .wayback_archiver import *
from .youtubedl_archiver import *

Wyświetl plik

@ -0,0 +1,115 @@
import os
import ffmpeg
from dataclasses import dataclass
import datetime
from loguru import logger
# TODO There should be a better way of generating keys, that adds the following info:
# - name of sheet that it is being archived from
# (this means we might archive the same media twice on different sheets, but that's OK I think)
# - name of archiver/platform that the video comes from
# This should make it easier to maintain and clean the archive later
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
# cleaned up? The difficulty is that we don't know the filename until the archivers start working.
@dataclass
class ArchiveResult:
    """Outcome of one archiving attempt; only ``status`` is required.

    The archivers build this with keyword arguments, which requires the
    ``@dataclass``-generated ``__init__``.
    """
    # Free-text outcome, e.g. 'success', 'already archived', or an error message.
    status: str
    # URL of the archived media.
    cdn_url: str = None
    # URL of the thumbnail chosen to represent the media.
    thumbnail: str = None
    # URL of the HTML page listing all thumbnails.
    thumbnail_index: str = None
    # Media duration; the Telegram archiver computes it as minutes * 60 + seconds.
    duration: float = None
    title: str = None
    # NOTE(review): annotated as datetime, but archivers pass ISO strings,
    # epoch floats or HTML attribute values - confirm intended type.
    timestamp: datetime.datetime = None
class Archiver:
    """Base class for archivers: shared upload, key and thumbnail helpers.

    Subclasses set ``name`` and override ``download``.
    """
    name = "default"

    def __init__(self, s3_client):
        self.s3 = s3_client

    def __str__(self):
        return self.__class__.__name__

    def download(self, url, check_if_exists=False):
        """Archive ``url``; the base implementation only logs an error."""
        logger.error("method 'download' not implemented")

    def get_cdn_url(self, key):
        """Build the DigitalOcean Spaces CDN URL for ``key`` from the environment."""
        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)

    def do_s3_upload(self, f, key):
        """Upload open file object ``f`` to ``DO_BUCKET`` under ``key`` (public-read)."""
        self.s3.upload_fileobj(f, Bucket=os.getenv(
            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})

    def get_key(self, filename):
        """Derive the bucket key from a ``tmp/<name>`` style filename."""
        # Diagnostic output goes through the logger rather than stdout.
        logger.debug(f"key base implementation: {self.name}")
        # TODO: refactor to be more manageable
        key = filename.split('/')[1]
        if 'unknown_video' in key:
            key = key.replace('unknown_video', 'jpg')
        return key

    def get_thumbnails(self, filename, duration=None):
        """Extract thumbnails from ``filename`` with ffmpeg and upload them.

        Frames are written to a directory named after ``filename`` without its
        extension, uploaded one by one, and listed in an ``index.html`` page
        that is uploaded as well.

        Returns a ``(key_thumbnail_url, index_page_url)`` tuple, or
        ``('None', 'None')`` when ffmpeg produced no ``.jpg`` files.
        """
        out_dir = filename.split('.')[0]
        key_prefix = filename.split('/')[1].split('.')[0]

        # ffmpeg cannot write frames into a directory that does not exist.
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        # Aim for roughly 10/20/40 frames depending on length; keep the default
        # rate when the duration is unknown or not positive (avoids dividing by 0).
        fps = 0.5
        if duration is not None:
            duration = float(duration)
            if duration > 0:
                if duration < 60:
                    fps = 10.0 / duration
                elif duration < 120:
                    fps = 20.0 / duration
                else:
                    fps = 40.0 / duration

        stream = ffmpeg.input(filename)
        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
        stream.output(out_dir + '/out%d.jpg').run()

        cdn_urls = []
        for fname in os.listdir(out_dir + '/'):
            if fname[-3:] == 'jpg':
                thumbnail_filename = out_dir + '/' + fname
                key = key_prefix + '/' + fname
                with open(thumbnail_filename, 'rb') as f:
                    self.do_s3_upload(f, key)
                cdn_urls.append(self.get_cdn_url(key))

        if len(cdn_urls) == 0:
            return ('None', 'None')

        # Pick a frame about 10% of the way through the list as the key thumbnail.
        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

        images = ''.join(f'<img src="{t}" />' for t in cdn_urls)
        index_page = f'<html><head><title>{filename}</title></head><body>{images}</body></html>'

        index_fname = out_dir + '/index.html'
        with open(index_fname, 'w') as f:
            f.write(index_page)

        thumb_index = key_prefix + '/index.html'
        # Use a context manager so the index file handle is closed after upload.
        with open(index_fname, 'rb') as f:
            self.s3.upload_fileobj(f, Bucket=os.getenv(
                'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})

        thumb_index_cdn_url = self.get_cdn_url(thumb_index)
        return (key_thumb, thumb_index_cdn_url)

Wyświetl plik

@ -0,0 +1,76 @@
import os
import requests
from bs4 import BeautifulSoup
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, get_thumbnails, do_s3_upload
class TelegramArchiver(Archiver):
    """Archive the video embedded in a public ``t.me`` post."""
    name = "telegram"

    def download(self, url, check_if_exists=False):
        """Download the post's video, upload it and return an ``ArchiveResult``.

        Returns ``False`` when ``url`` is not a ``t.me`` link or the embed page
        contains no ``<video>`` element. With ``check_if_exists`` set, a key
        already present in the bucket is reported as 'already archived' and is
        not uploaded again.
        """
        # detect URLs that we definitely cannot handle
        if 'http://t.me/' not in url and 'https://t.me/' not in url:
            return False

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        status = "success"
        original_url = url

        # TODO: check if we can do this more resilient to user-input
        if not url.endswith("?embed=1"):
            url += "?embed=1"

        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')

        video = soup.find("video")
        if video is None:
            return False  # could not find video

        video_url = video.get('src')
        key = video_url.split('/')[-1].split('?')[0]
        filename = 'tmp/' + key

        if check_if_exists:
            try:
                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
                # file exists
                cdn_url = self.get_cdn_url(key)
                status = 'already archived'
            except ClientError:
                # not in the bucket yet; it is uploaded below
                pass

        # The local copy is needed for thumbnail extraction even when the
        # video itself is already in the bucket.
        media = requests.get(video_url, headers=headers)
        with open(filename, 'wb') as f:
            f.write(media.content)

        if status != 'already archived':
            cdn_url = self.get_cdn_url(key)
            with open(filename, 'rb') as f:
                self.do_s3_upload(f, key)

        # extract duration from HTML; fold every ':'-separated field so that
        # "ss", "mm:ss" and "h:mm:ss" all convert correctly
        time_tags = soup.find_all('time')
        duration = 0.0
        for field in time_tags[0].contents[0].split(':'):
            duration = duration * 60 + float(field)

        # process thumbnails
        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=original_url, timestamp=time_tags[1].get('datetime'))

Wyświetl plik

@ -0,0 +1,68 @@
import os, traceback
from botocore.errorfactory import ClientError
import tiktok_downloader
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, do_s3_upload, get_thumbnails
class TiktokArchiver(Archiver):
    """Archive a TikTok post using ``tiktok_downloader``."""
    name = "tiktok"

    def download(self, url, check_if_exists=False):
        """Download the post's video, upload it and return an ``ArchiveResult``.

        Returns ``False`` when ``url`` does not contain 'tiktok.com'. Any
        failure is reported through the result's ``status`` rather than raised.
        """
        if 'tiktok.com' not in url:
            return False

        status = 'success'
        # Stays None when no media could be downloaded, so the final result
        # can still be built instead of failing on an unbound name.
        cdn_url = None

        try:
            info = tiktok_downloader.info_post(url)
            key = 'tiktok_' + str(info.id) + '.mp4'
            filename = 'tmp/' + key

            if check_if_exists:
                try:
                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
                    # file exists
                    cdn_url = self.get_cdn_url(key)
                    status = 'already archived'
                except ClientError:
                    # not in the bucket yet; it is downloaded and uploaded below
                    pass

            if status != 'already archived':
                media = tiktok_downloader.snaptik(url).get_media()
                if len(media) > 0:
                    media[0].download(filename)
                    with open(filename, 'rb') as f:
                        self.do_s3_upload(f, key)
                    cdn_url = self.get_cdn_url(key)
                else:
                    status = 'could not download media'

            try:
                key_thumb, thumb_index = self.get_thumbnails(
                    filename, duration=info.duration)
            except Exception:
                # Thumbnails are best-effort; keep the archive result usable.
                key_thumb = ''
                thumb_index = 'error creating thumbnails'

            try:
                os.remove(filename)
            except FileNotFoundError:
                logger.info(f'tmp file not found thus not deleted {filename}')

            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())

        except tiktok_downloader.Except.InvalidUrl:
            status = 'Invalid URL'
            return ArchiveResult(status=status)

        except Exception:
            # Report anything else in the status, but let KeyboardInterrupt
            # and SystemExit propagate.
            error = traceback.format_exc()
            status = 'Other Tiktok error: ' + str(error)
            return ArchiveResult(status=status)

Wyświetl plik

@ -0,0 +1,73 @@
import time, requests, os
from bs4 import BeautifulSoup
from .base_archiver import Archiver, ArchiveResult
class WaybackArchiver(Archiver):
    """Fallback archiver that saves the page to the Internet Archive."""
    name = "wayback"

    def __init__(self, s3_client):
        super().__init__(s3_client)
        # Results of earlier submissions, keyed by the submitted URL.
        self.seen_urls = {}

    def download(self, url, check_if_exists=False):
        """Submit ``url`` to web.archive.org and return an ``ArchiveResult``.

        With ``check_if_exists`` set, a URL already submitted through this
        instance returns its cached result. Credentials are read from
        ``INTERNET_ARCHIVE_S3_KEY`` and ``INTERNET_ARCHIVE_S3_SECRET``.
        """
        if check_if_exists and url in self.seen_urls:
            return self.seen_urls[url]

        ia_headers = {
            "Accept": "application/json",
            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
        }

        r = requests.post(
            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
        if r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")

        job_id = r.json()['job_id']
        status_url = 'https://web.archive.org/save/status/' + job_id
        status_r = requests.get(status_url, headers=ia_headers)

        retries = 0
        # wait 90-120 seconds for the archive job to finish
        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
            time.sleep(3)
            status_r = requests.get(status_url, headers=ia_headers)
            retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")

        status_json = status_r.json()
        if status_json['status'] != 'success':
            # A job that is still pending may carry no 'message'; fall back to
            # the status value instead of raising KeyError.
            return ArchiveResult(
                status='Internet Archive failed: ' + str(status_json.get('message', status_json['status'])))

        archive_url = 'https://web.archive.org/web/' + \
            status_json['timestamp'] + '/' + status_json['original_url']

        try:
            r = requests.get(archive_url)
            parsed = BeautifulSoup(r.content, 'html.parser')
            title = parsed.find_all('title')[0].text
        except Exception:
            # The title is optional; the archive URL is still valid without it.
            title = "Could not get title"

        result = ArchiveResult(
            status='Internet Archive fallback', cdn_url=archive_url, title=title)
        self.seen_urls[url] = result
        return result

Wyświetl plik

@ -0,0 +1,88 @@
import os
import datetime
import youtube_dl
from loguru import logger
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
    """Archive any single video that youtube_dl can extract."""
    # NOTE(review): "yotube_dl" looks like a typo of "youtube_dl"; left as-is
    # because code outside this view may depend on the identifier.
    name = "yotube_dl"

    def download(self, url, check_if_exists=False):
        """Download ``url`` with youtube_dl, upload it and return an ``ArchiveResult``.

        Returns ``False`` when youtube_dl finds no video or the URL resolves to
        more than one entry, and a 'Streaming media' result for live streams.
        With ``check_if_exists`` set, a key already in the bucket is reported
        as 'already archived' and not uploaded again.
        """
        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}

        # startswith() with a tuple replaces fixed-width slices; the old slice
        # compared 25 characters against a 26-character 'wwww.' literal, so the
        # www.facebook.com branch could never match.
        fb_cookie = os.getenv('FB_COOKIE')
        if url.startswith(('https://facebook.com/', 'https://www.facebook.com/')) and fb_cookie:
            logger.info('Using Facebook cookie')
            youtube_dl.utils.std_headers['cookie'] = fb_cookie

        ydl = youtube_dl.YoutubeDL(ydl_opts)
        cdn_url = None
        status = 'success'

        try:
            info = ydl.extract_info(url, download=False)
        except youtube_dl.utils.DownloadError:
            # no video here
            return False

        if info.get('is_live'):
            logger.warning("Live streaming media, not archiving now")
            return ArchiveResult(status="Streaming media")

        if check_if_exists:
            if 'entries' in info:
                if len(info['entries']) > 1:
                    logger.warning(
                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
                    return False
                filename = ydl.prepare_filename(info['entries'][0])
            else:
                filename = ydl.prepare_filename(info)

            key = self.get_key(filename)
            try:
                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
                # file exists
                cdn_url = self.get_cdn_url(key)
                status = 'already archived'
            except ClientError:
                # not in the bucket yet; it is uploaded below
                pass

        # sometimes this results in a different filename, so do this again
        info = ydl.extract_info(url, download=True)
        if 'entries' in info:
            if len(info['entries']) > 1:
                logger.warning(
                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
                return False
            info = info['entries'][0]

        filename = ydl.prepare_filename(info)
        if not os.path.exists(filename):
            # fall back to the .mkv name when the prepared filename is not on disk
            filename = filename.split('.')[0] + '.mkv'

        if status != 'already archived':
            key = self.get_key(filename)
            cdn_url = self.get_cdn_url(key)
            with open(filename, 'rb') as f:
                self.do_s3_upload(f, key)

        # get duration
        duration = info.get('duration')

        # get thumbnails
        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)

        # prefer the explicit timestamp, then derive one from the upload date
        if 'timestamp' in info:
            timestamp = info['timestamp']
        elif 'upload_date' in info:
            timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
        else:
            timestamp = None

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                             title=info.get('title'),
                             timestamp=timestamp)

Wyświetl plik

@ -1,14 +1,12 @@
from dataclasses import dataclass
import gspread
from pathlib import Path
import datetime
import boto3
import os
from dotenv import load_dotenv
import datetime
import argparse
import math
import threading
import gspread
import boto3
from loguru import logger
from dotenv import load_dotenv
import archivers
@ -156,6 +154,7 @@ def process_sheet(sheet):
'duration')) if 'duration' in headers else None
# order matters, first to succeed excludes remaining
active_archivers = [
@ -198,7 +197,7 @@ def process_sheet(sheet):
def main():
parser = argparse.ArgumentParser(
description="Automatically use youtube-dl to download media from a Google Sheet")
description="Automatically archive social media videos from a Google Sheet")
parser.add_argument("--sheet", action="store", dest="sheet")
args = parser.parse_args()