Resilience fixes

main
Simon Aubury 2023-02-06 06:51:22 +11:00
rodzic d466189a8e
commit 1ca22e72eb
8 zmienionych plików z 61 dodań i 11 usunięć

1
.gitignore vendored
Wyświetl plik

@ -3,3 +3,4 @@ env/*
__pycache__
BAK
data/*
config/mastodon-sink-s3-aws.json

Wyświetl plik

@ -27,6 +27,8 @@ confluent-hub install confluentinc/kafka-connect-s3:10.3.0
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3/config -d '@./config/mastodon-sink-s3.json'
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3-aws/config -d '@./config/mastodon-sink-s3-aws.json'
# DuckDB
@ -34,6 +36,8 @@ duckdb --init duckdb/init.sql
select * FROM read_parquet('s3://mastodon/topics/mastodon-topic*');
select 'epoch'::TIMESTAMP + INTERVAL 1675325510 seconds;
# OLD Notes

Wyświetl plik

@ -14,6 +14,11 @@
"logicalType": "date",
"default" : "null"
},
{
"name": "created_at_str",
"type": "string",
"default": "unknown"
},
{
"name": "app",
"type": "string",

Wyświetl plik

@ -3,7 +3,7 @@
"connector.class": "io.confluent.connect.s3.S3SinkConnector",
"topics": "mastodon-topic",
"format.class": "io.confluent.connect.s3.format.parquet.ParquetFormat",
"flush.size": "1",
"flush.size": "1000",
"s3.bucket.name": "mastodon",
"aws.access.key.id": "minio",
"aws.secret.access.key": "minio123",

33
duckdb/go.sql 100644
Wyświetl plik

@ -0,0 +1,33 @@
-- .read duckdb/go.sql
--
-- Ad-hoc DuckDB analysis of the Mastodon statuses written to S3 as parquet.
--
-- Step 1 (disabled): rebuild staging table xx from the parquet files.
-- Remove the leading "-- " from the statements below to run them.
-- They are disabled with line comments rather than a /* ... */ block because
-- the glob path ends in '/*'; a lexer that nests block comments treats that as
-- a second comment opener and the outer comment is then never closed.
-- The scheme pattern is 'https?' so both http:// and https:// are stripped
-- ('http[s]' required the literal "s" and left http:// URLs untouched).
--
-- drop table if exists xx;
-- create table xx as
-- select m_id
-- , created_at_str
-- , created_at, ('EPOCH'::TIMESTAMP + INTERVAL (created_at::INT) seconds)::TIMESTAMPTZ as created_tz
-- , app
-- , url
-- , regexp_replace(regexp_replace(url, '^https?://', ''), '/.*$', '') as new_url
-- , base_url
-- , language
-- , favourites
-- , username
-- , bot
-- , tags
-- , characters
-- , mastodon_text
-- from read_parquet('s3://mastodon/topics/mastodon-topic/partition=0/*');

-- Step 2: number of rows in xx per day and hour of created_tz.
-- NOTE(review): date_part('day', ...) is the day of the month, so rows from
-- different months land in the same bucket if xx spans more than one month.
select date_part('day', created_tz) as created_day
, date_part('hour', created_tz) as created_hour
, count(*) as status_count
from xx
group by created_day, created_hour
order by created_day, created_hour
;
-- select username, bot, count(*) from xx group by 1,2 order by 3 desc;

Wyświetl plik

@ -13,11 +13,7 @@ base_url = ''
enable_kafka = False
quiet = False
watchdog = False
if enable_kafka:
topic_name, producer = kafka_producer()
else:
topic_name, producer = '' , ''
topic_name, producer = '' , ''
# Listener for Mastodon events
@ -41,10 +37,13 @@ class Listener(mastodon.StreamListener):
# attribute only available on local
if hasattr(status, 'application'):
app = status.application.get('name')
now_dt=datetime.datetime.now()
value_dict = {
'm_id': status.id,
'created_at': int(datetime.datetime.now().strftime('%s')),
'created_at': int(now_dt.strftime('%s')),
'created_at_str': now_dt.strftime('%Y %m %d %H:%M:%S'),
'app': app,
'url': status.url,
'base_url': base_url,
@ -98,6 +97,7 @@ def main():
global enable_kafka
global quiet
global watchdog
global topic_name, producer
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@ -137,8 +137,15 @@ def main():
args = parser.parse_args()
base_url=args.baseURL
enable_kafka=args.enableKafka
if enable_kafka:
topic_name, producer = kafka_producer()
mastodon = Mastodon(api_base_url = base_url)
if args.watchdog:

Wyświetl plik

@ -8,8 +8,8 @@ cd ${BASE}
# while true; do echo Start; ${PY} mastodonlisten.py --enableKafka --public; sleep 30; done &
while true; do echo Start; ${PY} mastodonlisten.py --baseURL https://mastodon.social --enableKafka --public; sleep 30; done
while true; do echo Start; ${PY} mastodonlisten.py --baseURL https://hachyderm.io --enableKafka ; sleep 30; done
while true; do echo Start; ${PY} mastodonlisten.py --baseURL https://mastodon.au/ --enableKafka ; sleep 30; done
while true; do echo Start; ${PY} mastodonlisten.py --baseURL https://data-folks.masto.host --enableKafka ; sleep 30; done
while true; do echo Start; ${PY} mastodonlisten.py --baseURL https://mastodon.social --enableKafka --watchdog 30 --public; sleep 30; done
while true; do echo Start; ${PY} mastodonlisten.py --baseURL https://hachyderm.io --enableKafka --watchdog 30 ; sleep 30; done
while true; do echo Start; ${PY} mastodonlisten.py --baseURL https://mastodon.au/ --enableKafka --watchdog 30 ; sleep 30; done
while true; do echo Start; ${PY} mastodonlisten.py --baseURL https://data-folks.masto.host --enableKafka --watchdog 30 ; sleep 30; done

BIN
xx.parquet 100644

Plik binarny nie jest wyświetlany.