pull/33/head
msramalho 2022-06-14 21:18:18 +02:00
rodzic 64c083b37b
commit dc60bb1558
7 zmienionych plików z 188 dodań i 127 usunięć

2
.gitignore vendored
Wyświetl plik

@ -12,5 +12,7 @@ anu.html
anon*
config.json
config-*.json
config.yaml
config-*.yaml
logs/*
local_archive/

Wyświetl plik

@ -21,6 +21,7 @@ google-auth-httplib2 = "*"
google-auth-oauthlib = "*"
oauth2client = "*"
python-slugify = "*"
pyyaml = "*"
[requires]
python_version = "3.9"

127
Pipfile.lock wygenerowano
Wyświetl plik

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e13fa011edc8726b15cc2a3ef30cd73a71ff33830ca853f6a5e7641f0a9a6f91"
"sha256": "602a05a8fa475181c24714ab57188a417fdfddf373a7dab4fa0ba0fcb7ce8d0a"
},
"pipfile-spec": 6,
"requires": {
@ -50,19 +50,19 @@
},
"boto3": {
"hashes": [
"sha256:1bc562393d7985263e62828173eea6c7d61562031c646dc857a4f0fad1dfddbe",
"sha256:7625c5ed92bb7a953e03d2541bcbfcb66c3495f8d7b9421e47b4e2c280dc9162"
"sha256:28ab0947c49a6fb2409004d4a10b2828aec231cb95ca1d800cb1411e191cc201",
"sha256:833e67edfb73f2cc22ff27a1c33728686dc90a9e81ba2551f9462ea2d1b04f41"
],
"index": "pypi",
"version": "==1.24.3"
"version": "==1.24.8"
},
"botocore": {
"hashes": [
"sha256:2d48f4ed77220d4cb6f1b1abbb1b782d1b12260645f6ba3f3cd9ae5c98546297",
"sha256:7be5962b956b5770799ba87b0bd2173230068d269982bdf8d16fabaa79483912"
"sha256:ad92702930d6cb7b587fc2f619672feb74d5218f8de387a28c2905820db79027",
"sha256:db6667b8dfd175d16187653942cd91dd1f0cf36adc0ea9d7a0805ba4d2a3321f"
],
"markers": "python_full_version >= '3.7.0'",
"version": "==1.27.3"
"markers": "python_version >= '3.7'",
"version": "==1.27.8"
},
"brotli": {
"hashes": [
@ -215,7 +215,7 @@
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
"markers": "python_version >= '3.5'",
"version": "==2.0.12"
},
"click": {
@ -223,7 +223,7 @@
"sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e",
"sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==8.1.3"
},
"cloudscraper": {
@ -280,7 +280,7 @@
"sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404",
"sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==3.7.1"
},
"flask": {
@ -288,7 +288,7 @@
"sha256:315ded2ddf8a6281567edb27393010fe3406188bafbfe65a3339d5787d89e477",
"sha256:fad5b446feb0d6db6aec0c3184d16a8c1f6c3e464b511649c8918a9be100b4fe"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==2.1.2"
},
"future": {
@ -308,19 +308,19 @@
},
"google-api-python-client": {
"hashes": [
"sha256:159aa2d5f67998f39b06f28f38d6621389dda099c56f0fde46e9070dabdd5b40",
"sha256:a45fd3f318f79b3498d31de7e7db16d70b01672a755c88f56841183db908c576"
"sha256:a573373041b3f6ccbd04877b70e7425c52daec5b4fe5f440e8f5895c87d1a69c",
"sha256:b444f839bed289ecfe30950ea1cd15b7e7976d8cf9f0a3c778037ae3fb030df3"
],
"index": "pypi",
"version": "==2.50.0"
"version": "==2.51.0"
},
"google-auth": {
"hashes": [
"sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312",
"sha256:349ac49b18b01019453cc99c11c92ed772739778c92f184002b7ab3a5b7ac77d"
"sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1",
"sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.6.6"
"version": "==2.7.0"
},
"google-auth-httplib2": {
"hashes": [
@ -332,11 +332,11 @@
},
"google-auth-oauthlib": {
"hashes": [
"sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0",
"sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8"
"sha256:6d6161d0ec0a62e2abf2207c6071c117ec5897b300823c4bb2d963ee86e20e4f",
"sha256:d5e98a71203330699f92a26bc08847a92e8c3b1b8d82a021f1af34164db143ae"
],
"index": "pypi",
"version": "==0.5.1"
"version": "==0.5.2"
},
"googleapis-common-protos": {
"hashes": [
@ -375,7 +375,7 @@
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"markers": "python_version >= '3.5'",
"version": "==3.3"
},
"importlib-metadata": {
@ -391,7 +391,7 @@
"sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44",
"sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==2.1.2"
},
"jinja2": {
@ -399,7 +399,7 @@
"sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
"sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==3.1.2"
},
"jmespath": {
@ -407,7 +407,7 @@
"sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e",
"sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==1.0.0"
},
"loguru": {
@ -530,7 +530,7 @@
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==2.1.1"
},
"mutagen": {
@ -559,11 +559,11 @@
},
"outcome": {
"hashes": [
"sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958",
"sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967"
"sha256:6f82bd3de45da303cf1f771ecafa1633750a358436a8bb60e06a1ceb745d2672",
"sha256:c4ab89a56575d6d38a05aa16daeaa333109c1f96167aba8901ab18b6b5e0f7f5"
],
"markers": "python_version >= '3.6'",
"version": "==1.1.0"
"markers": "python_version >= '3.7'",
"version": "==1.2.0"
},
"protobuf": {
"hashes": [
@ -592,7 +592,7 @@
"sha256:e250a42f15bf9d5b09fe1b293bdba2801cd520a9f5ea2d7fb7536d4441811d20",
"sha256:ff8d8fa42675249bb456f5db06c00de6c2f4c27a065955917b28c4f15978b9c3"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==3.20.1"
},
"pyaes": {
@ -724,13 +724,52 @@
"index": "pypi",
"version": "==6.1.2"
},
"pyyaml": {
"hashes": [
"sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293",
"sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b",
"sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57",
"sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b",
"sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4",
"sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07",
"sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba",
"sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9",
"sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287",
"sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513",
"sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0",
"sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0",
"sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92",
"sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f",
"sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2",
"sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc",
"sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c",
"sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86",
"sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4",
"sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c",
"sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34",
"sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b",
"sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c",
"sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb",
"sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737",
"sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3",
"sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d",
"sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53",
"sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78",
"sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803",
"sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a",
"sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174",
"sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"
],
"index": "pypi",
"version": "==6.0"
},
"requests": {
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.27.1"
"markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.0"
},
"requests-oauthlib": {
"hashes": [
@ -768,7 +807,7 @@
"sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
"sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==0.6.0"
},
"selenium": {
@ -849,11 +888,11 @@
},
"trio": {
"hashes": [
"sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070",
"sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a"
"sha256:4dc0bf9d5cc78767fc4516325b6d80cc0968705a31d0eec2ecd7cdda466265b0",
"sha256:523f39b7b69eef73501cebfe1aafd400a9aad5b03543a0eded52952488ff1c13"
],
"markers": "python_full_version >= '3.7.0'",
"version": "==0.20.0"
"markers": "python_version >= '3.7'",
"version": "==0.21.0"
},
"trio-websocket": {
"hashes": [
@ -934,7 +973,7 @@
"sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916",
"sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==10.3"
},
"werkzeug": {
@ -942,7 +981,7 @@
"sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6",
"sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==2.1.2"
},
"wsproto": {
@ -950,7 +989,7 @@
"sha256:2218cb57952d90b9fca325c0dcfb08c3bda93e8fd8070b0a17f048e2e47a521b",
"sha256:a2e56bfd5c7cd83c1369d83b5feccd6d37798b74872866e62616e0ecf111bda8"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==1.1.0"
},
"yt-dlp": {
@ -966,7 +1005,7 @@
"sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad",
"sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"
],
"markers": "python_full_version >= '3.7.0'",
"markers": "python_version >= '3.7'",
"version": "==3.8.0"
}
},

Wyświetl plik

@ -13,7 +13,7 @@ You also need:
5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
### Configuration file
Configuration is done via a config.json file (see [example.config.json](example.config.json)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
<details><summary><code>python auto_archive.py --help</code></summary>
@ -56,36 +56,36 @@ optional arguments:
</details><br/>
#### Example invocations
All the configurations can be specified in the JSON config file, but sometimes it is useful to override only some of those like the sheet that we are running the archival on, here are some examples (possibly prepended by `pipenv run`):
All the configurations can be specified in the YAML config file, but sometimes it is useful to override only some of those like the sheet that we are running the archival on, here are some examples (possibly prepended by `pipenv run`):
```bash
# all the configurations come from config.json
# all the configurations come from config.yaml
python auto_archive.py
# all the configurations come from my_config.json
python auto_archive.py --config my_config.json
# all the configurations come from my_config.yaml
python auto_archive.py --config my_config.yaml
# reads the configurations but saves archived content to google drive instead
python auto_archive.py --config my_config.json --storage gd
python auto_archive.py --config my_config.yaml --storage gd
# uses the configurations but for another google docs sheet
# with a header on row 2 and with some different column names
python auto_archive.py --config my_config.json --sheet="use it on another sheets doc" --header=2 --col-link="put urls here"
python auto_archive.py --config my_config.yaml --sheet="use it on another sheets doc" --header=2 --col-link="put urls here"
# all the configurations come from config.json and specifies that s3 files should be private
# all the configurations come from config.yaml and specifies that s3 files should be private
python auto_archive.py --s3-private
```
### Extra notes on configuration
#### Google Drive
To use Google Drive storage you need the id of the shared folder in the `config.json` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
#### Telethon (Telegrams API Library)
The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` in the root.
## Running
The `--sheet name` property (or `execution.sheet` in the JSON file) is the name of the Google Sheet to check for URLs.
The `--sheet name` property (or `execution.sheet` in the YAML file) is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` row (see `COLUMN_NAMES` in [gworksheet.py](utils/gworksheet.py)), only the `link` and `status` columns are mandatory:
* `Link` (required): the location of the media to be archived. This is the only column that should be supplied with data initially

Wyświetl plik

@ -1,5 +1,5 @@
import argparse, json
import argparse, yaml, json
import gspread
from loguru import logger
from selenium import webdriver
@ -26,7 +26,7 @@ class Config:
c.parse() # parses the values and initializes the Services and API clients
# you can then access the Services and APIs like 'c.s3_config'
All the configurations available as cmd line options, when included, will
override the configurations in the config.json file.
override the configurations in the config.yaml file.
Configurations are split between:
1. "secrets" containing API keys for generating services - not kept in memory
2. "execution" containing specific execution configurations
@ -41,12 +41,12 @@ class Config:
self.args = self.parser.parse_args()
logger.success(f'Command line arguments parsed successfully')
self.config_file = self.args.config
self.read_config_json()
self.read_config_yaml()
logger.info(f'APIs and Services initialized:\n{self}')
def read_config_json(self):
def read_config_yaml(self):
with open(self.config_file, "r", encoding="utf-8") as inf:
self.config = json.load(inf)
self.config = yaml.safe_load(inf)
# ----------------------EXECUTION - execution configurations
execution = self.config.get("execution", {})
@ -150,13 +150,13 @@ class Config:
"""
Creates the CMD line arguments. 'python auto_archive.py --help'
"""
parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')
parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')
parser.add_argument('--config', action='store', dest='config', help='the filename of the JSON configuration file (defaults to \'config.json\')', default='config.json')
parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.json]', choices=Config.AVAILABLE_STORAGES)
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.json]')
parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.json]')
parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]')
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]')
parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]')
for k, v in GWorksheet.COLUMN_NAMES.items():
help = f"the name of the column to FILL WITH {k} (default='{v}')"

Wyświetl plik

@ -1,62 +0,0 @@
{
"secrets": {
"s3": {// for storage=s3
"region": "s3 region like fra1",
"bucket": "s3 bucket name like my-bucket",
"key": "s3 API key",
"secret": "s3 API secret",
"endpoint_url": "use region format like such: https://{region}.digitaloceanspaces.com",
"cdn_url": "use bucket, region, and key (key is the archived file path generated when executing) format like such: https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}",
"private": false, // if true S3 urls will not be readable online
"key_path": "random" // you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files
},
"wayback": {
"key": "your API key, visit https://archive.org/account/s3.php",
"secret": "your API secret"
},
"telegram": {
"api_id": "your API key, see https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27",
"api_hash": "your API hash",
"bot_token": "optional, but allows access to more content such as large videos, talk to @botfather"
},
"google_sheets": {
"service_account": "local filename: normally service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account"
},
"google_drive": { // for storage=gd
"service_account": "local filename: can be the same or different file from google_sheets.service_account defaults to service_account.json",
"root_folder_id": "copy XXXX from https://drive.google.com/drive/folders/XXXX"
},
"local":{ // for storage=local
"save_to": "local path to save files in ./local_archive"
},
"facebook": {
"cookie": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"
}
},
"execution": {
"sheet": "your-sheet-name", // can be overwritten with CMD --sheet=
"header": 1, //which row of your tabs contains the header, can be overwritten with CMD --header=
"storage": "s3", // which storage to use, can be overwritten with CMD --storage=
"selenium": { // optional configurations for the selenium browser that takes screenshots, these are the defaults
"timeout_seconds": 120, // values under 10s might mean screenshots fail to grab
"window_width": 1400,
"window_height": 2000
},
"tmp_folder": "tmp/", // local tmp folder to save files before uploading to storage
"save_logs": true, // puts execution logs into /logs folder, defaults to false
"column_names": { // custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
"url": "link",
"archive": "archive location",
"folder": "folder",
"date": "archive date",
"status": "archive status",
"thumbnail": "thumbnail",
"thumbnail_index": "thumbnail index",
"timestamp": "upload timestamp",
"title": "upload title",
"duration": "duration",
"screenshot": "screenshot",
"hash": "hash"
}
}
}

Wyświetl plik

@ -0,0 +1,81 @@
---
# Example configuration for auto_archive.py.
# Top-level sections:
#   secrets   - credentials and settings for the storages and external services
#   execution - options for a run; several can be overridden from the command line
secrets:
  # needed if you use storage=s3
  s3:
    # contains S3 info on region, bucket, key and secret
    region: reg1
    bucket: my-bucket
    key: "s3 API key"
    secret: "s3 API secret"
    # use region format like such
    endpoint_url: 'https://{region}.digitaloceanspaces.com'
    # use bucket, region, and key (key is the archived file path generated when executing) in a format such as:
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private:true S3 urls will not be readable online
    private: false
    # with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files; the alternative is 'default' or omitting it from the config
    key_path: random

  # needed if you use storage=gd
  google_drive:
    # local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json
    service_account: "service_account.json"
    root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX

  # needed if you use storage=local
  local:
    # local path to save files in
    save_to: "./local_archive"

  wayback:
    # to get credentials visit https://archive.org/account/s3.php
    key: your API key
    secret: your API secret

  telegram:
    # to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
    api_id: your API key
    api_hash: your API hash
    # optional, but allows access to more content such as large videos, talk to @botfather
    bot_token: your bot-token

  google_sheets:
    # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
    service_account: "service_account.json"

  facebook:
    # optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'
    cookie: ""

execution:
  # can be overwritten with CMD --sheet=
  sheet: your-sheet-name
  # which row of your tabs contains the header, can be overwritten with CMD --header=
  header: 1
  # which storage to use, can be overwritten with CMD --storage=
  storage: s3
  # optional configurations for the selenium browser that takes screenshots, these are the defaults
  selenium:
    # values under 10s might mean screenshots fail to grab
    timeout_seconds: 120
    window_width: 1400
    window_height: 2000
  # local tmp folder to save files before uploading to storage
  tmp_folder: tmp/
  # puts execution logs into /logs folder, defaults to false
  save_logs: true
  # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
  # url and status are the only columns required to be present in the google sheet
  column_names:
    url: link
    status: archive status
    archive: archive location
    # use this column to override default location data
    folder: folder
    date: archive date
    thumbnail: thumbnail
    thumbnail_index: thumbnail index
    timestamp: upload timestamp
    title: upload title
    duration: duration
    screenshot: screenshot
    hash: hash