Incremental dev

main
Simon Aubury 2023-02-14 16:02:54 +11:00
rodzic 688528a53f
commit 0cc14823c3
9 zmienionych plików z 1344 dodań i 688 usunięć

3
.env 100644
Wyświetl plik

@ -0,0 +1,3 @@
CONF_VER=7.1.0
ELST_VER=7.11.0
TZ_SET=Australia/Sydney

1
.gitignore vendored
Wyświetl plik

@ -7,3 +7,4 @@ config/mastodon-sink-s3-aws.json
notebooks/demo.ipynb
duckdb/init-s3.sql
data_tmp/*
docker-compose-orig.yml

Wyświetl plik

@ -25,6 +25,8 @@ python mastodonlisten.py --baseURL https://data-folks.masto.host/ --enableKafka
confluent-hub install confluentinc/kafka-connect-s3:10.3.0
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3/config -d '@./config/mastodon-sink-s3-minio.json'
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3/config -d '@./config/mastodon-sink-s3.json'
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3-aws/config -d '@./config/mastodon-sink-s3-aws.json'

Wyświetl plik

@ -12,7 +12,7 @@
"name": "created_at",
"type": ["null","int"],
"logicalType": "date",
"default" : "null"
"default" : null
},
{
"name": "created_at_str",

Wyświetl plik

@ -0,0 +1,16 @@
{
"name": "mastodon-sink-s3",
"connector.class": "io.confluent.connect.s3.S3SinkConnector",
"topics": "mastodon-topic",
"format.class": "io.confluent.connect.s3.format.parquet.ParquetFormat",
"flush.size": "10",
"s3.bucket.name": "mastodon",
"aws.access.key.id": "minio",
"aws.secret.access.key": "minio123",
"storage.class": "io.confluent.connect.s3.storage.S3Storage",
"store.url": "http://minio:9000"
}

Plik binarny nie jest wyświetlany.

Przed

Szerokość:  |  Wysokość:  |  Rozmiar: 21 KiB

Po

Szerokość:  |  Wysokość:  |  Rozmiar: 21 KiB

Wyświetl plik

@ -31,3 +31,92 @@ order by 1,2
-- select username, bot, count(*) from xx group by 1,2 order by 3 desc;
as select *
-- old backup
-- Create toots from the listed columns of the parquet file one directory up.
create table toots as
select
    m_id,
    created_at,
    created_at_str,
    app,
    url,
    base_url,
    language,
    favourites,
    username,
    bot,
    tags,
    characters,
    mastodon_text
from read_parquet('../xx.parquet');
-- Append the rows of every parquet file matching the glob '*.parquet' to toots.
-- The target columns are named explicitly so the load maps by column name and
-- does not silently depend on the physical column order of toots.
insert into toots (
    m_id
    , created_at
    , created_at_str
    , app
    , url
    , base_url
    , language
    , favourites
    , username
    , bot
    , tags
    , characters
    , mastodon_text
)
select
    m_id
    , created_at
    , created_at_str
    , app
    , url
    , base_url
    , language
    , favourites
    , username
    , bot
    , tags
    , characters
    , mastodon_text
from read_parquet('*.parquet');
-- Append the rows of the parquet files under
-- 20230213/mastodon-topic/partition=0/ to toots.
-- The target columns are named explicitly so the load maps by column name and
-- does not silently depend on the physical column order of toots.
insert into toots (
    m_id
    , created_at
    , created_at_str
    , app
    , url
    , base_url
    , language
    , favourites
    , username
    , bot
    , tags
    , characters
    , mastodon_text
)
select
    m_id
    , created_at
    , created_at_str
    , app
    , url
    , base_url
    , language
    , favourites
    , username
    , bot
    , tags
    , characters
    , mastodon_text
from read_parquet('20230213/mastodon-topic/partition=0/*.parquet');
-- Build all_toots as the distinct rows of toots over the columns below
-- (created_at_str is not carried over), collapsing rows that are identical
-- in every selected column.
create table all_toots as
select distinct
    m_id,
    created_at,
    app,
    url,
    base_url,
    language,
    favourites,
    username,
    bot,
    tags,
    characters,
    mastodon_text
from toots;

-- Write the de-duplicated table out as a single parquet file.
copy all_toots to 'all_toots.parquet' (format parquet);

Wyświetl plik

@ -0,0 +1,6 @@
# Image based on Confluent's cp-server-connect with the S3 sink connector added.
# Alternative base image, currently disabled:
# FROM confluentinc/cp-server-connect-base:7.3.1
FROM confluentinc/cp-server-connect:7.1.0
# Install version 10.3.0 of the kafka-connect-s3 connector from Confluent Hub.
# NOTE(review): --no-prompt presumably keeps the install non-interactive during the build — confirm.
RUN confluent-hub install --no-prompt confluentinc/kafka-connect-s3:10.3.0
# Disabled override, presumably a debugging aid: would keep the container idle
# on `tail -f /dev/null` instead of the base image's own entrypoint.
# ENTRYPOINT ["tail", "-f", "/dev/null"]

File diff suppressed because one or more lines are too long