Simon Aubury 2023-02-21 16:50:57 +11:00
rodzic 2e8667b9c3
commit d7bb25fa00
5 zmienionych plików z 153 dodań i 34 usunięć

Wyświetl plik

@ -36,6 +36,8 @@ kafka-avro-console-consumer --bootstrap-server localhost:9092 --topic mastodon-t
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3/config -d '@./config/mastodon-sink-s3-minio.json'
```
# Open the S3 (MinIO) browser
http://localhost:9001/
# Kafka Connect OLD

Wyświetl plik

@ -178,23 +178,23 @@ services:
exit 0;
"
jupyter:
image: jupyter/scipy-notebook
ports:
- "8888:8888"
healthcheck:
test: nc -z localhost 8888 || exit -1
start_period: 15s
interval: 5s
timeout: 10s
retries: 10
volumes:
- ./notebooks:/home/jovyan/
user: root
environment:
JUPYTER_ENABLE_LAB: "yes"
JUPYTER_RUNTIME_DIR: "/tmp"
NB_USER: simonaubury
CHOWN_HOME: 'yes'
CHOWN_HOME_OPTS: '-R'
command: "start-notebook.sh --allow-root --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''"
# jupyter:
# image: jupyter/scipy-notebook
# ports:
# - "8888:8888"
# healthcheck:
# test: nc -z localhost 8888 || exit -1
# start_period: 15s
# interval: 5s
# timeout: 10s
# retries: 10
# volumes:
# - ./notebooks:/home/jovyan/
# user: root
# environment:
# JUPYTER_ENABLE_LAB: "yes"
# JUPYTER_RUNTIME_DIR: "/tmp"
# NB_USER: simonaubury
# CHOWN_HOME: 'yes'
# CHOWN_HOME_OPTS: '-R'
# command: "start-notebook.sh --allow-root --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''"

Wyświetl plik

@ -49,6 +49,14 @@
"LOAD httpfs;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Establish S3 endpoint\n",
"Set the S3 endpoint settings. Here we're using a local [MinIO](https://min.io/) instance, an open-source, Amazon S3-compatible server."
]
},
{
"cell_type": "code",
"execution_count": null,
@ -64,6 +72,13 @@
"set s3_url_style='path';"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can now query the Parquet files directly from S3."
]
},
{
"cell_type": "code",
"execution_count": null,
@ -118,11 +133,12 @@
", tags\n",
", characters\n",
", mastodon_text\n",
"FROM read_parquet('../data_tmp/all_toots.parquet');\n",
"from read_parquet('s3://mastodon/topics/mastodon-topic/partition=0/*');\n",
"\n",
"create table mastodon_toot as\n",
"select mr.*, ln.language_name\n",
"from mastodon_toot_raw mr left outer join language ln on (mr.language = ln.lang_iso);"
"from mastodon_toot_raw mr \n",
"left outer join language ln on (mr.language = ln.lang_iso);"
]
},
{
@ -152,7 +168,6 @@
", mode(case when bot='True' then username end) as \"Most freq bot\"\n",
", mode(base_url) as \"Most freq host\"\n",
"from mastodon_toot\n",
"where created_tz between TIMESTAMP '2023-02-07 13:00:00' and TIMESTAMP '2023-02-18 12:59:59' \n",
"group by 1\n",
"order by 1\n",
";"
@ -235,9 +250,7 @@
"metadata": {},
"outputs": [],
"source": [
"# sns.lineplot(data=mastodon_usage_df, x=\"created_hour\", y=\"num\")\n",
"df = mastodon_usage_df[mastodon_usage_df['created_day'].isin(['2023/02/04 Sat', '2023/02/05 Sun', '2023/02/11 Sat', '2023/02/12 Sun', '2023/02/15 Wed', '2023/02/16 Thu', '2023/02/17 Fri'] ) ]\n",
"sns.lineplot(data=df , x=\"created_hour\", y=\"num\", hue=\"created_day\").set_xticks(range(24))"
"sns.lineplot(data=mastodon_usage_df, x=\"created_hour\", y=\"num\", hue=\"created_day\").set_xticks(range(24))"
]
},
{
@ -277,10 +290,10 @@
"source": [
"%%sql\n",
"mastodon_lang_df << \n",
"select *\n",
"from mastodon_toot\n",
"where characters < 200\n",
"and language not in ('unknown');"
" select *\n",
" from mastodon_toot\n",
" where characters < 200\n",
" and language not in ('unknown');"
]
},
{

Wyświetl plik

@ -1,4 +1,108 @@
Mastodon.py
BeautifulSoup4
confluent_kafka
avro
anyio==3.6.2
appnope==0.1.3
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
attrs==22.2.0
avro==1.11.1
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==6.0.0
blurhash==1.1.4
certifi==2022.12.7
cffi==1.15.1
chardet==5.1.0
charset-normalizer==3.0.1
comm==0.1.2
confluent-kafka==2.0.2
contourpy==1.0.7
cycler==0.11.0
debugpy==1.6.6
decorator==5.1.1
defusedxml==0.7.1
duckdb==0.6.1
duckdb-engine==0.6.8
executing==1.2.0
fastavro==1.7.1
fastjsonschema==2.16.2
fonttools==4.38.0
fqdn==1.5.1
idna==3.4
importlib-metadata==6.0.0
ipykernel==6.21.1
ipython==8.9.0
ipython-genutils==0.2.0
ipython-sql==0.4.1
isoduration==20.11.0
jedi==0.18.2
Jinja2==3.1.2
jsonpointer==2.3
jsonschema==4.17.3
jupyter-events==0.6.3
jupyter_client==8.0.2
jupyter_core==5.2.0
jupyter_server==2.2.1
jupyter_server_terminals==0.4.4
jupyterlab-pygments==0.2.2
kiwisolver==1.4.4
MarkupSafe==2.1.2
Mastodon.py==1.8.0
matplotlib==3.6.3
matplotlib-inline==0.1.6
mistune==2.0.4
nbclassic==0.5.1
nbclient==0.7.2
nbconvert==7.2.9
nbformat==5.7.3
nest-asyncio==1.5.6
notebook==6.5.2
notebook_shim==0.2.2
numpy==1.24.2
packaging==23.0
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.4.0
platformdirs==2.6.2
prettytable==0.7.2
prometheus-client==0.16.0
prompt-toolkit==3.0.36
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.14.0
pyparsing==3.0.9
pyrsistent==0.19.3
python-dateutil==2.8.2
python-json-logger==2.0.4
python-magic==0.4.27
pytz==2022.7.1
PyYAML==6.0
pyzmq==25.0.0
requests==2.28.2
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
seaborn==0.12.2
Send2Trash==1.8.0
six==1.16.0
sniffio==1.3.0
soupsieve==2.3.2.post1
SQLAlchemy==1.4.46
sqlparse==0.4.3
stack-data==0.6.2
terminado==0.17.1
timer==0.2.2
tinycss2==1.2.1
tornado==6.2
traitlets==5.9.0
uri-template==1.2.0
urllib3==1.26.14
wcwidth==0.2.6
webcolors==1.12
webencodings==0.5.1
websocket-client==1.5.1
zipp==3.12.1

Plik binarny nie jest wyświetlany.