kopia lustrzana https://github.com/saubury/mastodon-stream
update
rodzic
2e8667b9c3
commit
d7bb25fa00
|
@ -36,6 +36,8 @@ kafka-avro-console-consumer --bootstrap-server localhost:9092 --topic mastodon-t
|
|||
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3/config -d '@./config/mastodon-sink-s3-minio.json'
|
||||
```
|
||||
|
||||
# Open s3 browser
|
||||
http://localhost:9001/
|
||||
|
||||
|
||||
# Kafka Connect OLD
|
||||
|
|
|
@ -178,23 +178,23 @@ services:
|
|||
exit 0;
|
||||
"
|
||||
|
||||
jupyter:
|
||||
image: jupyter/scipy-notebook
|
||||
ports:
|
||||
- "8888:8888"
|
||||
healthcheck:
|
||||
test: nc -z localhost 8888 || exit -1
|
||||
start_period: 15s
|
||||
interval: 5s
|
||||
timeout: 10s
|
||||
retries: 10
|
||||
volumes:
|
||||
- ./notebooks:/home/jovyan/
|
||||
user: root
|
||||
environment:
|
||||
JUPYTER_ENABLE_LAB: "yes"
|
||||
JUPYTER_RUNTIME_DIR: "/tmp"
|
||||
NB_USER: simonaubury
|
||||
CHOWN_HOME: 'yes'
|
||||
CHOWN_HOME_OPTS: '-R'
|
||||
command: "start-notebook.sh --allow-root --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''"
|
||||
# jupyter:
|
||||
# image: jupyter/scipy-notebook
|
||||
# ports:
|
||||
# - "8888:8888"
|
||||
# healthcheck:
|
||||
# test: nc -z localhost 8888 || exit -1
|
||||
# start_period: 15s
|
||||
# interval: 5s
|
||||
# timeout: 10s
|
||||
# retries: 10
|
||||
# volumes:
|
||||
# - ./notebooks:/home/jovyan/
|
||||
# user: root
|
||||
# environment:
|
||||
# JUPYTER_ENABLE_LAB: "yes"
|
||||
# JUPYTER_RUNTIME_DIR: "/tmp"
|
||||
# NB_USER: simonaubury
|
||||
# CHOWN_HOME: 'yes'
|
||||
# CHOWN_HOME_OPTS: '-R'
|
||||
# command: "start-notebook.sh --allow-root --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''"
|
||||
|
|
|
@ -49,6 +49,14 @@
|
|||
"LOAD httpfs;"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Establish s3 endpoint\n",
|
||||
"Set the s3 endpoint settings. Here we're using a local [MinIO](https://min.io/) as an Open Source, Amazon S3 compatible server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -64,6 +72,13 @@
|
|||
"set s3_url_style='path';"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And you can now query the parquet files directly from s3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -118,11 +133,12 @@
|
|||
", tags\n",
|
||||
", characters\n",
|
||||
", mastodon_text\n",
|
||||
"FROM read_parquet('../data_tmp/all_toots.parquet');\n",
|
||||
"from read_parquet('s3://mastodon/topics/mastodon-topic/partition=0/*');\n",
|
||||
"\n",
|
||||
"create table mastodon_toot as\n",
|
||||
"select mr.*, ln.language_name\n",
|
||||
"from mastodon_toot_raw mr left outer join language ln on (mr.language = ln.lang_iso);"
|
||||
"from mastodon_toot_raw mr \n",
|
||||
"left outer join language ln on (mr.language = ln.lang_iso);"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -152,7 +168,6 @@
|
|||
", mode(case when bot='True' then username end) as \"Most freq bot\"\n",
|
||||
", mode(base_url) as \"Most freq host\"\n",
|
||||
"from mastodon_toot\n",
|
||||
"where created_tz between TIMESTAMP '2023-02-07 13:00:00' and TIMESTAMP '2023-02-18 12:59:59' \n",
|
||||
"group by 1\n",
|
||||
"order by 1\n",
|
||||
";"
|
||||
|
@ -235,9 +250,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# sns.lineplot(data=mastodon_usage_df, x=\"created_hour\", y=\"num\")\n",
|
||||
"df = mastodon_usage_df[mastodon_usage_df['created_day'].isin(['2023/02/04 Sat', '2023/02/05 Sun', '2023/02/11 Sat', '2023/02/12 Sun', '2023/02/15 Wed', '2023/02/16 Thu', '2023/02/17 Fri'] ) ]\n",
|
||||
"sns.lineplot(data=df , x=\"created_hour\", y=\"num\", hue=\"created_day\").set_xticks(range(24))"
|
||||
"sns.lineplot(data=mastodon_usage_df, x=\"created_hour\", y=\"num\", hue=\"created_day\").set_xticks(range(24))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -277,10 +290,10 @@
|
|||
"source": [
|
||||
"%%sql\n",
|
||||
"mastodon_lang_df << \n",
|
||||
"select *\n",
|
||||
"from mastodon_toot\n",
|
||||
"where characters < 200\n",
|
||||
"and language not in ('unknown');"
|
||||
" select *\n",
|
||||
" from mastodon_toot\n",
|
||||
" where characters < 200\n",
|
||||
" and language not in ('unknown');"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
112
requirements.txt
112
requirements.txt
|
@ -1,4 +1,108 @@
|
|||
Mastodon.py
|
||||
BeautifulSoup4
|
||||
confluent_kafka
|
||||
avro
|
||||
anyio==3.6.2
|
||||
appnope==0.1.3
|
||||
argon2-cffi==21.3.0
|
||||
argon2-cffi-bindings==21.2.0
|
||||
arrow==1.2.3
|
||||
asttokens==2.2.1
|
||||
attrs==22.2.0
|
||||
avro==1.11.1
|
||||
backcall==0.2.0
|
||||
beautifulsoup4==4.11.1
|
||||
bleach==6.0.0
|
||||
blurhash==1.1.4
|
||||
certifi==2022.12.7
|
||||
cffi==1.15.1
|
||||
chardet==5.1.0
|
||||
charset-normalizer==3.0.1
|
||||
comm==0.1.2
|
||||
confluent-kafka==2.0.2
|
||||
contourpy==1.0.7
|
||||
cycler==0.11.0
|
||||
debugpy==1.6.6
|
||||
decorator==5.1.1
|
||||
defusedxml==0.7.1
|
||||
duckdb==0.6.1
|
||||
duckdb-engine==0.6.8
|
||||
executing==1.2.0
|
||||
fastavro==1.7.1
|
||||
fastjsonschema==2.16.2
|
||||
fonttools==4.38.0
|
||||
fqdn==1.5.1
|
||||
idna==3.4
|
||||
importlib-metadata==6.0.0
|
||||
ipykernel==6.21.1
|
||||
ipython==8.9.0
|
||||
ipython-genutils==0.2.0
|
||||
ipython-sql==0.4.1
|
||||
isoduration==20.11.0
|
||||
jedi==0.18.2
|
||||
Jinja2==3.1.2
|
||||
jsonpointer==2.3
|
||||
jsonschema==4.17.3
|
||||
jupyter-events==0.6.3
|
||||
jupyter_client==8.0.2
|
||||
jupyter_core==5.2.0
|
||||
jupyter_server==2.2.1
|
||||
jupyter_server_terminals==0.4.4
|
||||
jupyterlab-pygments==0.2.2
|
||||
kiwisolver==1.4.4
|
||||
MarkupSafe==2.1.2
|
||||
Mastodon.py==1.8.0
|
||||
matplotlib==3.6.3
|
||||
matplotlib-inline==0.1.6
|
||||
mistune==2.0.4
|
||||
nbclassic==0.5.1
|
||||
nbclient==0.7.2
|
||||
nbconvert==7.2.9
|
||||
nbformat==5.7.3
|
||||
nest-asyncio==1.5.6
|
||||
notebook==6.5.2
|
||||
notebook_shim==0.2.2
|
||||
numpy==1.24.2
|
||||
packaging==23.0
|
||||
pandas==1.5.3
|
||||
pandocfilters==1.5.0
|
||||
parso==0.8.3
|
||||
pexpect==4.8.0
|
||||
pickleshare==0.7.5
|
||||
Pillow==9.4.0
|
||||
platformdirs==2.6.2
|
||||
prettytable==0.7.2
|
||||
prometheus-client==0.16.0
|
||||
prompt-toolkit==3.0.36
|
||||
psutil==5.9.4
|
||||
ptyprocess==0.7.0
|
||||
pure-eval==0.2.2
|
||||
pycparser==2.21
|
||||
Pygments==2.14.0
|
||||
pyparsing==3.0.9
|
||||
pyrsistent==0.19.3
|
||||
python-dateutil==2.8.2
|
||||
python-json-logger==2.0.4
|
||||
python-magic==0.4.27
|
||||
pytz==2022.7.1
|
||||
PyYAML==6.0
|
||||
pyzmq==25.0.0
|
||||
requests==2.28.2
|
||||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
seaborn==0.12.2
|
||||
Send2Trash==1.8.0
|
||||
six==1.16.0
|
||||
sniffio==1.3.0
|
||||
soupsieve==2.3.2.post1
|
||||
SQLAlchemy==1.4.46
|
||||
sqlparse==0.4.3
|
||||
stack-data==0.6.2
|
||||
terminado==0.17.1
|
||||
timer==0.2.2
|
||||
tinycss2==1.2.1
|
||||
tornado==6.2
|
||||
traitlets==5.9.0
|
||||
uri-template==1.2.0
|
||||
urllib3==1.26.14
|
||||
wcwidth==0.2.6
|
||||
webcolors==1.12
|
||||
webencodings==0.5.1
|
||||
websocket-client==1.5.1
|
||||
zipp==3.12.1
|
||||
|
|
BIN
xx.parquet
BIN
xx.parquet
Plik binarny nie jest wyświetlany.
Ładowanie…
Reference in New Issue