stats: Add ECDF plot for message counts by user

pull/23/head v0.7.0
Michael DM Dryden 2023-01-14 03:24:59 -05:00
rodzic 5fd5c4ee08
commit 750427a841
5 zmienionych plików z 85 dodań i 4 usunięć

Wyświetl plik

@ -7,17 +7,18 @@ The format is based on `Keep a Changelog <https://keepachangelog.com/en/1.0.0/>`
and this project adheres to `Semantic Versioning <https://semver.org/spec/v2.0.0.html>`_.
----------
Unreleased
`0.7.0`_ - 2023-01-14
----------
Fixed
-----
- Sticker pack names save correctly now
- Explicitly add psycopg2-binary as dependency because sqlalchemy extra doesn't seem to work anymore.
- Explicitly add psycopg2-binary as dependency because sqlalchemy extra doesn't seem to work anymore
- Try to map user ids to names during json dump import. (#17)
Added
-----
- Add script to import data from desktop client json dumps
- Add ECDF plot for message counts by user with ``/stats count-dist``
-------------
`0.6.4`_ - 2022-02-27
@ -131,7 +132,7 @@ Fixed
----------------------
- Initial release
.. _Unreleased: https://github.com/mkdryden/telegram-stats-bot/compare/v0.6.2...HEAD
.. _Unreleased: https://github.com/mkdryden/telegram-stats-bot/compare/v0.7.0...HEAD
.. _0.1.1: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.1.1
.. _0.2.0: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.2.0
.. _0.3.0: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.3.0
@ -142,3 +143,4 @@ Fixed
.. _0.6.1: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.6.1
.. _0.6.2: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.6.2
.. _0.6.3: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.6.3
.. _0.7.0: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.7.0

Wyświetl plik

@ -45,6 +45,8 @@ Table of contents
- `counts`_
- `count-dist`_
- `hours`_
- `days`_
@ -251,6 +253,13 @@ counts
@WhereAreMyManners 30481 5.1
@TheWorstOfTheBest 28705 4.8
count-dist
----------
``/stats count-dist`` returns an ECDF plot of the users in the group by message count.
.. image:: examples/count-dist.png
:alt: Example of count-dist plot
hours
-----
``/stats hours`` returns a plot of message frequency for the hours of the day.

Plik binarny nie jest wyświetlany.

Po

Szerokość:  |  Wysokość:  |  Rozmiar: 17 KiB

Wyświetl plik

@ -1,6 +1,6 @@
[tool.poetry]
name = "telegram-stats-bot"
version = "0.6.4"
version = "0.7.0"
description = "A logging and statistics bot for Telegram based on python-telegram-bot."
authors = ["Michael DM Dryden <mk.dryden@utoronto.ca>"]
repository = "https://github.com/mkdryden/telegram-stats-bot"

Wyświetl plik

@ -72,6 +72,7 @@ class InternalParser(argparse.ArgumentParser):
class StatsRunner(object):
allowed_methods = {'counts': "get_chat_counts",
'count-dist': 'get_chat_ecdf',
'hours': "get_counts_by_hour",
'days': "get_counts_by_day",
'week': "get_week_by_hourday",
@ -205,6 +206,75 @@ class StatsRunner(object):
return f"```\n{text}\n```", None
def get_chat_ecdf(self, lquery: str = None, mtype: str = None, start: str = None, end: str = None,
                  log: bool = False) -> Tuple[Union[str, None], Union[None, BytesIO]]:
    """
    Get message counts by number of users as an ECDF plot.

    :param lquery: Limit results to lexical query (&, |, !, <n>)
    :param mtype: Limit results to message type (text, sticker, photo, etc.)
    :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
    :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
    :param log: Plot with log scale.
    """
    # NOTE(review): docstring wording is kept as-is; help text may be derived
    # from it elsewhere in the project -- confirm before rewording.

    # Message types accepted by the mtype filter; anything else is rejected
    # before it can reach the SQL text.
    valid_types = ('text', 'sticker', 'photo', 'animation', 'video', 'voice', 'location', 'video_note',
                   'audio', 'document', 'poll')

    params = {}    # bound parameters handed to the driver with the query
    filters = []   # individual WHERE conditions, AND-ed together below

    if lquery:
        # The lexical query is embedded through random_quote rather than bound.
        filters.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

    if mtype:
        if mtype not in valid_types:
            raise HelpException(f'mtype {mtype} is invalid.')
        # Interpolated directly; restricted to the whitelist checked above.
        filters.append(f"""type = '{mtype}'""")

    if start:
        params['start_dt'] = pd.to_datetime(start)
        filters.append("date >= %(start_dt)s")

    if end:
        # Upper bound is exclusive.
        params['end_dt'] = pd.to_datetime(end)
        filters.append("date < %(end_dt)s")

    where_clause = f"WHERE {' AND '.join(filters)}" if filters else ""

    # One row per sender with that sender's total message count.
    query = f"""
SELECT "from_user", COUNT(*) as "count"
FROM "messages_utc"
{where_clause}
GROUP BY "from_user"
ORDER BY "count" DESC;
"""

    with self.engine.connect() as con:
        df = pd.read_sql_query(query, con, params=params)

    if not len(df):
        return "No matching messages", None

    figure = Figure(constrained_layout=True)
    axes = figure.subplots()

    # Message counts go on the y axis; stat='count' puts the cumulative
    # number of users (not a proportion) on the x axis.
    sns.ecdfplot(df, y='count', stat='count', log_scale=log, ax=axes)
    axes.set_xlabel('User')
    axes.set_ylabel('Messages')

    title = f"Messages by User for {lquery}" if lquery else "Messages by User"
    axes.set_title(title)

    sns.despine(fig=figure)

    # Render the figure into an in-memory buffer and rewind it so the
    # caller can read it from the start.
    buffer = BytesIO()
    buffer.name = 'plot.png'
    figure.savefig(buffer, bbox_inches='tight')
    buffer.seek(0)

    return None, buffer
def get_counts_by_hour(self, user: Tuple[int, str] = None, lquery: str = None, start: str = None, end: str = None) \
-> Tuple[Union[str, None], Union[None, BytesIO]]:
"""