split into multiple files MVP

pull/13/head
msramalho 2022-02-21 14:19:09 +01:00
parent 009c0dd8ca
commit f3ce226665
12 changed files with 446 additions and 536 deletions

.gitignore vendored
View file

@@ -1,7 +1,8 @@
tmp/
.env
.env*
.DS_Store
expmt/
service_account.json
__pycache__/
._*
anu.html

View file

@@ -10,7 +10,6 @@ python-dotenv = "*"
youtube_dl = "*"
argparse = "*"
beautifulsoup4 = "*"
nordvpn-switcher = "*"
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
bs4 = "*"
loguru = "*"

Pipfile.lock generated
View file

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060"
"sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
},
"pipfile-spec": 6,
"requires": {
@@ -93,6 +93,14 @@
],
"version": "==1.2.58"
},
"faker": {
"hashes": [
"sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b",
"sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe"
],
"markers": "python_version >= '3.6'",
"version": "==13.0.0"
},
"ffmpeg-python": {
"hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@@ -180,73 +188,6 @@
"index": "pypi",
"version": "==0.6.0"
},
"lxml": {
"hashes": [
"sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
"sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
"sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
"sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
"sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
"sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
"sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
"sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
"sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
"sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
"sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
"sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
"sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
"sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
"sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
"sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
"sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
"sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
"sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
"sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
"sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
"sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
"sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
"sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
"sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
"sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
"sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
"sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
"sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
"sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
"sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
"sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
"sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
"sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
"sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
"sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
"sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
"sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
"sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
"sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
"sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
"sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
"sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
"sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
"sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
"sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
"sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
"sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
"sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
"sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
"sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
"sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
"sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
"sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
"sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
"sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
"sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
"sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
"sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
"sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
"sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.8.0"
},
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
@@ -293,14 +234,6 @@
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"nordvpn-switcher": {
"hashes": [
"sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de",
"sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b"
],
"index": "pypi",
"version": "==0.2.9"
},
"oauthlib": {
"hashes": [
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
@@ -309,59 +242,6 @@
"markers": "python_version >= '3.6'",
"version": "==3.2.0"
},
"pathlib": {
"hashes": [
"sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f"
],
"version": "==1.0.1"
},
"psutil": {
"hashes": [
"sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5",
"sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a",
"sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4",
"sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841",
"sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d",
"sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d",
"sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0",
"sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845",
"sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf",
"sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b",
"sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07",
"sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618",
"sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2",
"sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd",
"sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666",
"sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce",
"sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3",
"sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d",
"sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25",
"sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492",
"sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b",
"sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d",
"sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2",
"sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203",
"sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2",
"sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94",
"sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9",
"sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64",
"sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56",
"sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3",
"sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c",
"sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==5.9.0"
},
"py-mini-racer": {
"hashes": [
"sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57",
"sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2",
"sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab",
"sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11"
],
"version": "==0.6.0"
},
"pyasn1": {
"hashes": [
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
@@ -422,13 +302,6 @@
"index": "pypi",
"version": "==0.19.2"
},
"random-user-agent": {
"hashes": [
"sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac",
"sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012"
],
"version": "==1.0.1"
},
"requests": {
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",

View file

@@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to install
[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
A `.env` file is required for saving content to a DigitalOcean Space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
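# sketch only: the hunk is truncated here; these are the variables the code in
# this commit reads via os.getenv, with placeholder values
DO_BUCKET=your-spaces-bucket-name
DO_SPACES_REGION=your-spaces-region
INTERNET_ARCHIVE_S3_KEY=your-ia-s3-access-key
INTERNET_ARCHIVE_S3_SECRET=your-ia-s3-secret
# optional, only used for Facebook URLs by the youtube-dl archiver
FB_COOKIE=your-facebook-cookie-string
```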

View file

@@ -1,390 +0,0 @@
from dataclasses import dataclass
import youtube_dl
from bs4 import BeautifulSoup
import requests
import tiktok_downloader
from loguru import logger
import os
import datetime
import ffmpeg
from botocore.errorfactory import ClientError
import time
import traceback
# TODO There should be a better way of generating keys, that adds the following info:
# - name of sheet that it is being archived from
# (this means we might archive the same media twice on different sheets, but that's OK I think)
# - name of archiver/platform that the video comes from
# This should make it easier to maintain and clean the archive later
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
# cleaned up? Difficult is we don't know the filename until the archivers start working.
def get_cdn_url(key):
return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
def do_s3_upload(s3_client, f, key):
s3_client.upload_fileobj(f, Bucket=os.getenv(
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
def get_key(filename):
key = filename.split('/')[1]
if 'unknown_video' in key:
key = key.replace('unknown_video', 'jpg')
return key
def get_thumbnails(filename, s3_client, duration=None):
if not os.path.exists(filename.split('.')[0]):
os.mkdir(filename.split('.')[0])
fps = 0.5
if duration is not None:
duration = float(duration)
if duration < 60:
fps = 10.0 / duration
elif duration < 120:
fps = 20.0 / duration
else:
fps = 40.0 / duration
stream = ffmpeg.input(filename)
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
thumbnails = os.listdir(filename.split('.')[0] + '/')
cdn_urls = []
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = filename.split('.')[0] + '/' + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname
cdn_url = get_cdn_url(key)
with open(thumbnail_filename, 'rb') as f:
do_s3_upload(s3_client, f, key)
cdn_urls.append(cdn_url)
os.remove(thumbnail_filename)
if len(cdn_urls) == 0:
return ('None', 'None')
key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
index_page = f'''<html><head><title>{filename}</title></head>
<body>'''
for t in cdn_urls:
index_page += f'<img src="{t}" />'
index_page += f"</body></html>"
index_fname = filename.split('.')[0] + '/index.html'
with open(index_fname, 'w') as f:
f.write(index_page)
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
thumb_index_cdn_url = get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
@dataclass
class ArchiveResult:
status: str
cdn_url: str = None
thumbnail: str = None
thumbnail_index: str = None
duration: float = None
title: str = None
timestamp: datetime.datetime = None
class Archiver:
def __init__(self, s3_client):
self.s3 = s3_client
def download(self, url):
pass
class TelegramArchiver(Archiver):
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url:
return False
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
status = "success"
original_url = url
if url[-8:] != "?embed=1":
url += "?embed=1"
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
video = s.find("video")
if video is None:
return False # could not find video
video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0]
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
v = requests.get(video_url, headers=headers)
with open(filename, 'wb') as f:
f.write(v.content)
if status != 'already archived':
cdn_url = get_cdn_url(key)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
if ':' in duration:
duration = float(duration.split(
':')[0])*60 + float(duration.split(':')[1])
else:
duration = float(duration)
# process thumbnails
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
class YoutubeDLArchiver(Archiver):
def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts)
cdn_url = None
status = 'success'
try:
info = ydl.extract_info(url, download=False)
except youtube_dl.utils.DownloadError:
# no video here
return False
if 'is_live' in info and info['is_live']:
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
filename = ydl.prepare_filename(info['entries'][0])
else:
filename = ydl.prepare_filename(info)
key = get_key(filename)
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
# sometimes this results in a different filename, so do this again
info = ydl.extract_info(url, download=True)
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
else:
info = info['entries'][0]
filename = ydl.prepare_filename(info)
if not os.path.exists(filename):
filename = filename.split('.')[0] + '.mkv'
if status != 'already archived':
key = get_key(filename)
cdn_url = get_cdn_url(key)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
# get duration
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
class WaybackArchiver(Archiver):
def __init__(self, s3_client):
self.s3 = s3_client
self.seen_urls = {}
def download(self, url, check_if_exists=False):
if check_if_exists and url in self.seen_urls:
return self.seen_urls[url]
ia_headers = {
"Accept": "application/json",
"Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
}
r = requests.post(
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
job_id = r.json()['job_id']
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
retries = 0
# wait 90-120 seconds for the archive job to finish
while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
time.sleep(3)
try:
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
except:
time.sleep(1)
retries += 1
if status_r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
try:
r = requests.get(archive_url)
parsed = BeautifulSoup(
r.content, 'html.parser')
title = parsed.find_all('title')[
0].text
except:
title = "Could not get title"
result = ArchiveResult(
status='Internet Archive fallback', cdn_url=archive_url, title=title)
self.seen_urls[url] = result
return result
class TiktokArchiver(Archiver):
def download(self, url, check_if_exists=False):
if 'tiktok.com' not in url:
return False
status = 'success'
try:
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
media[0].download(filename)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
cdn_url = get_cdn_url(key)
else:
status = 'could not download media'
try:
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=info.duration)
except:
key_thumb = ''
thumb_index = 'error creating thumbnails'
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
except tiktok_downloader.Except.InvalidUrl:
status = 'Invalid URL'
return ArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
return ArchiveResult(status=status)

View file

@@ -0,0 +1,6 @@
# we need to explicitly expose the available imports here
from .base_archiver import *
from .telegram_archiver import *
from .tiktok_archiver import *
from .wayback_archiver import *
from .youtubedl_archiver import *

View file

@@ -0,0 +1,115 @@
import os
import ffmpeg
from dataclasses import dataclass
import datetime
from loguru import logger
# TODO There should be a better way of generating keys that adds the following info:
# - name of the sheet it is being archived from
# (this means we might archive the same media twice on different sheets, but that's OK I think)
# - name of the archiver/platform that the video comes from
# This should make it easier to maintain and clean the archive later
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
# cleaned up? The difficulty is that we don't know the filename until the archivers start working.
@dataclass
class ArchiveResult:
status: str
cdn_url: str = None
thumbnail: str = None
thumbnail_index: str = None
duration: float = None
title: str = None
timestamp: datetime.datetime = None
class Archiver:
name = "default"
def __init__(self, s3_client):
self.s3 = s3_client
def __str__(self):
return self.__class__.__name__
def download(self, url, check_if_exists=False):
logger.error("method 'download' not implemented")
def get_cdn_url(self, key):
return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
def do_s3_upload(self, f, key):
self.s3.upload_fileobj(f, Bucket=os.getenv(
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
def get_key(self, filename):
print(f"key base implementation: {self.name}")
# TODO: refactor to be more manageable
key = filename.split('/')[1]
if 'unknown_video' in key:
key = key.replace('unknown_video', 'jpg')
return key
def get_thumbnails(self, filename, duration=None):
if not os.path.exists(filename.split('.')[0]):
os.mkdir(filename.split('.')[0])
fps = 0.5
if duration is not None:
duration = float(duration)
if duration < 60:
fps = 10.0 / duration
elif duration < 120:
fps = 20.0 / duration
else:
fps = 40.0 / duration
stream = ffmpeg.input(filename)
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
thumbnails = os.listdir(filename.split('.')[0] + '/')
cdn_urls = []
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = filename.split('.')[0] + '/' + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname
cdn_url = self.get_cdn_url(key)
with open(thumbnail_filename, 'rb') as f:
self.do_s3_upload(f, key)
cdn_urls.append(cdn_url)
os.remove(thumbnail_filename)
if len(cdn_urls) == 0:
return ('None', 'None')
key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
index_page = f'''<html><head><title>{filename}</title></head>
<body>'''
for t in cdn_urls:
index_page += f'<img src="{t}" />'
index_page += f"</body></html>"
index_fname = filename.split('.')[0] + '/index.html'
with open(index_fname, 'w') as f:
f.write(index_page)
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
thumb_index_cdn_url = self.get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
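The practical upshot of this base class is that supporting a new platform should only need a small subclass. A hypothetical sketch (not part of this commit; `ExampleArchiver`, the URL check, and the key scheme are invented for illustration) of what that could look like:
```
import os
import requests

from .base_archiver import Archiver, ArchiveResult


class ExampleArchiver(Archiver):
    name = "example"  # hypothetical platform

    def download(self, url, check_if_exists=False):
        # decline URLs this archiver cannot handle, like the other archivers do
        if 'example.com' not in url:
            return False

        key = 'example_' + url.split('/')[-1] + '.mp4'
        filename = 'tmp/' + key

        # fetch the media locally (the check_if_exists branch is omitted for brevity)
        with open(filename, 'wb') as f:
            f.write(requests.get(url).content)

        # reuse the helpers inherited from Archiver
        with open(filename, 'rb') as f:
            self.do_s3_upload(f, key)
        cdn_url = self.get_cdn_url(key)
        key_thumb, thumb_index = self.get_thumbnails(filename)

        os.remove(filename)
        return ArchiveResult(status='success', cdn_url=cdn_url,
                             thumbnail=key_thumb, thumbnail_index=thumb_index)
```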

View file

@@ -0,0 +1,76 @@
import os
import requests
from bs4 import BeautifulSoup
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, get_thumbnails, do_s3_upload
class TelegramArchiver(Archiver):
name = "telegram"
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url:
return False
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
status = "success"
original_url = url
# TODO: check if we can make this more resilient to user input
if url[-8:] != "?embed=1":
url += "?embed=1"
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
video = s.find("video")
if video is None:
return False # could not find video
video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0]
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
v = requests.get(video_url, headers=headers)
with open(filename, 'wb') as f:
f.write(v.content)
if status != 'already archived':
cdn_url = self.get_cdn_url(key)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
if ':' in duration:
duration = float(duration.split(
':')[0]) * 60 + float(duration.split(':')[1])
else:
duration = float(duration)
# process thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))

View file

@@ -0,0 +1,68 @@
import os, traceback
from botocore.errorfactory import ClientError
import tiktok_downloader
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, do_s3_upload, get_thumbnails
class TiktokArchiver(Archiver):
name = "tiktok"
def download(self, url, check_if_exists=False):
if 'tiktok.com' not in url:
return False
status = 'success'
try:
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
media[0].download(filename)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
cdn_url = self.get_cdn_url(key)
else:
status = 'could not download media'
try:
key_thumb, thumb_index = self.get_thumbnails(
filename, duration=info.duration)
except:
key_thumb = ''
thumb_index = 'error creating thumbnails'
try: os.remove(filename)
except FileNotFoundError:
logger.info(f'tmp file not found, so not deleted: {filename}')
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
except tiktok_downloader.Except.InvalidUrl:
status = 'Invalid URL'
return ArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
return ArchiveResult(status=status)

View file

@@ -0,0 +1,73 @@
import time, requests, os
from bs4 import BeautifulSoup
from .base_archiver import Archiver, ArchiveResult
class WaybackArchiver(Archiver):
name = "wayback"
def __init__(self, s3_client):
self.s3 = s3_client
self.seen_urls = {}
def download(self, url, check_if_exists=False):
if check_if_exists and url in self.seen_urls:
return self.seen_urls[url]
ia_headers = {
"Accept": "application/json",
"Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
}
r = requests.post(
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
job_id = r.json()['job_id']
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
retries = 0
# wait 90-120 seconds for the archive job to finish
while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
time.sleep(3)
try:
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
except:
time.sleep(1)
retries += 1
if status_r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
try:
r = requests.get(archive_url)
parsed = BeautifulSoup(
r.content, 'html.parser')
title = parsed.find_all('title')[
0].text
except:
title = "Could not get title"
result = ArchiveResult(
status='Internet Archive fallback', cdn_url=archive_url, title=title)
self.seen_urls[url] = result
return result

View file

@@ -0,0 +1,88 @@
import os
import datetime
import youtube_dl
from loguru import logger
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
name = "yotube_dl"
def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts)
cdn_url = None
status = 'success'
try:
info = ydl.extract_info(url, download=False)
except youtube_dl.utils.DownloadError:
# no video here
return False
if 'is_live' in info and info['is_live']:
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
filename = ydl.prepare_filename(info['entries'][0])
else:
filename = ydl.prepare_filename(info)
key = self.get_key(filename)
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
# sometimes this results in a different filename, so do this again
info = ydl.extract_info(url, download=True)
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
return False
else:
info = info['entries'][0]
filename = ydl.prepare_filename(info)
if not os.path.exists(filename):
filename = filename.split('.')[0] + '.mkv'
if status != 'already archived':
key = self.get_key(filename)
cdn_url = self.get_cdn_url(key)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
# get duration
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)

View file

@@ -1,14 +1,12 @@
from dataclasses import dataclass
import gspread
from pathlib import Path
import datetime
import boto3
import os
from dotenv import load_dotenv
import datetime
import argparse
import math
import threading
import gspread
import boto3
from loguru import logger
from dotenv import load_dotenv
import archivers
load_dotenv()
@@ -156,6 +154,7 @@ def process_sheet(sheet):
'duration')) if 'duration' in headers else None
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelegramArchiver(s3_client),
archivers.TiktokArchiver(s3_client),
@@ -198,7 +197,7 @@ def process_sheet(sheet):
def main():
parser = argparse.ArgumentParser(
description="Automatically use youtube-dl to download media from a Google Sheet")
description="Automatically archive social media videos from a Google Sheet")
parser.add_argument("--sheet", action="store", dest="sheet")
args = parser.parse_args()
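
The hunks above truncate both the boto3 client setup and the loop that consumes `active_archivers`. A rough, hypothetical sketch of how the refactored pieces fit together (the full archiver list, the loop, and the example URL are assumptions based on the "order matters, first to succeed excludes remaining" comment):
```
# hypothetical usage sketch, not part of this commit: instantiate the archivers
# from the new package and try them in order until one returns a result
import boto3

import archivers

s3_client = boto3.client('s3')  # the real script configures this for a DigitalOcean Space

active_archivers = [
    archivers.TelegramArchiver(s3_client),
    archivers.TiktokArchiver(s3_client),
    archivers.YoutubeDLArchiver(s3_client),  # order beyond the two entries shown above is assumed
    archivers.WaybackArchiver(s3_client),
]

url = "https://t.me/example/123"  # placeholder; real URLs come from the Google Sheet rows

for archiver in active_archivers:
    result = archiver.download(url, check_if_exists=True)
    if result:
        break  # the first archiver to succeed excludes the remaining ones
```
Per the argparse setup above, the script is then invoked with a `--sheet` argument naming the Google Sheet to process.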