"""
Functions to access various data in the NFTs dataset.
"""
import sqlite3
from contextlib import closing
from typing import Union

import pandas as pd

from .datastore import event_tables, EventType

# NOTE: The patch that introduced this module also added ".analysis/" to datasets/nfts/.gitignore
# and added "pandas" to install_requires in datasets/nfts/setup.py (re-sorting "requests" there).

# TODO(zomglings): Make it so that table names are parametrized by importable variables. The way
# things are now, we have to be very careful if we ever rename a table in our dataset. We should
# also propagate the name change here.
NFTS = "nfts"
MINTS = event_tables[EventType.MINT]
TRANSFERS = event_tables[EventType.TRANSFER]
CURRENT_OWNERS = "current_owners"
CURRENT_MARKET_VALUES = "current_market_values"
TRANSFER_STATISTICS_BY_ADDRESS = "transfer_statistics_by_address"
# NOTE: The two names below have no entry in AVAILABLE_DATAFRAMES, so load_dataframe rejects them
# until a description is added there.
MINT_HOLDING_TIMES = "mint_holding_times"
TRANSFER_HOLDING_TIMES = "transfer_holding_times"

# Maps each loadable table name to the human-readable description printed by explain().
# Membership in this mapping is also what load_dataframe uses to validate a requested name.
AVAILABLE_DATAFRAMES = {
    NFTS: """Describes the NFT contracts represented in this dataset, with a name and symbol if they were available at time of crawl.

Columns:
1. address: The Ethereum address of the NFT contract.
2. name: The name of the collection of NFTs that the contract represents.
3. symbol: The symbol of the collection of NFTs that the contract represents.
""",
    MINTS: """All token mint events crawled in this dataset.

Columns:
1. event_id: A unique event ID associated with the event.
2. transaction_hash: The hash of the transaction which triggered the event.
3. block_number: The transaction block in which the transaction was mined.
4. nft_address: The address of the NFT collection containing the minted token.
5. token_id: The ID of the token that was minted.
6. from_address: The "from" address for the transfer event. For a mint, this should be the 0 address: 0x0000000000000000000000000000000000000000.
7. to_address: The "to" address for the transfer event. This represents the owner of the freshly minted token.
8. transaction_value: The amount of WEI that were sent with the transaction in which the token was minted.
9. timestamp: The time at which the mint operation was mined into the blockchain (this is the timestamp for the mined block).
""",
    TRANSFERS: """All token transfer events crawled in this dataset.

Columns:
1. event_id: A unique event ID associated with the event.
2. transaction_hash: The hash of the transaction which triggered the event.
3. block_number: The transaction block in which the transaction was mined.
4. nft_address: The address of the NFT collection containing the transferred token.
5. token_id: The ID of the token that was transferred.
6. from_address: The "from" address for the transfer event. This is the address that owned the token at the *start* of the transfer.
7. to_address: The "to" address for the transfer event. This is the address that owned the token at the *end* of the transfer.
8. transaction_value: The amount of WEI that were sent with the transaction in which the token was transferred.
9. timestamp: The time at which the transfer operation was mined into the blockchain (this is the timestamp for the mined block).
""",
    CURRENT_OWNERS: f"""This table is derived from the {NFTS}, {MINTS}, and {TRANSFERS} tables. It represents the current owner of each token in the dataset.

Columns:
1. nft_address: The address of the NFT collection containing the token whose ownership we are denoting.
2. token_id: The ID of the token (inside the collection) whose ownership we are denoting.
3. owner: The address that owned the token at the time of construction of this dataset.
""",
    CURRENT_MARKET_VALUES: f"""This table is derived from the {NFTS}, {MINTS}, and {TRANSFERS} tables. It represents the current market value (in WEI) of each token in the dataset.

Columns:
1. nft_address: The address of the NFT collection containing the token whose market value we are denoting.
2. token_id: The ID of the token (inside the collection) whose market value we are denoting.
3. market_value: The estimated market value of the token at the time of construction of this dataset.

For this dataset, we estimate the market value as the last non-zero transaction value for a transfer involving this token.
This estimate may be inaccurate for some transfers (e.g. multiple token transfers made by an escrow contract in a single transaction)
but ought to be reasonably accurate for a large majority of tokens.
""",
    TRANSFER_STATISTICS_BY_ADDRESS: f"""This table is derived from the {NFTS}, {MINTS}, and {TRANSFERS} tables. For each address that participated in
at least one NFT transfer between April 1, 2021 and September 25, 2021, this table shows exactly how many NFTs that address transferred to
other addresses and how many NFT transfers that address was the recipient of.

Columns:
1. address: An Ethereum address that participated in at least one NFT transfer between April 1, 2021 and September 25, 2021.
2. transfers_out: The number of NFTs that the given address transferred to any other address between April 1, 2021 and September 25, 2021.
3. transfers_in: The number of NFTs that any other address transferred to given address between April 1, 2021 and September 25, 2021.
""",
}


def explain() -> None:
    """
    Explains the structure of the dataset.

    Prints, for every entry in AVAILABLE_DATAFRAMES, the dataframe name, a copy-pasteable
    load_dataframe call for it, and its column-by-column description.
    """
    preamble = """
The Moonstream NFTs dataset
===========================

This dataset consists of the following dataframes:"""

    print(preamble)
    for name, explanation in AVAILABLE_DATAFRAMES.items():
        print(f"\nDataframe: {name}")
        # The first argument is a placeholder the reader must substitute; the table name is
        # quoted so that the printed line is valid Python once the placeholder is replaced.
        print(
            f"Load using:\n\t{name}_df = nfts.dataset.load_dataframe("
            f'SQLITE_CONNECTION_OR_DB_PATH, "{name}")'
        )
        print("")
        print(explanation)
        print("- - -")


def load_dataframe(db: Union[str, sqlite3.Connection], name: str) -> pd.DataFrame:
    """
    Loads one of the available dataframes. To learn more about the available dataframes, run:
    >>> nfts.dataset.explain()

    Args:
        db: Either an open sqlite3 connection, or a string. A string containing "://" is handed
            to pandas unchanged as a database URI (which requires SQLAlchemy); any other string
            is treated as the filesystem path of a SQLite database and is opened, read and
            closed by this function.
        name: The table to load. Must be one of the keys of AVAILABLE_DATAFRAMES.

    Returns:
        A dataframe containing every row and column of the requested table.

    Raises:
        ValueError: If name is not one of the keys of AVAILABLE_DATAFRAMES.
    """
    if name not in AVAILABLE_DATAFRAMES:
        raise ValueError(
            f"Invalid dataframe: {name}. Please choose from one of the available dataframes: {','.join(AVAILABLE_DATAFRAMES)}."
        )

    # pandas.read_sql_table only works through SQLAlchemy and rejects DBAPI connections such as
    # sqlite3.Connection, so we issue a plain query instead. Interpolating name is safe here
    # because it has just been checked against the fixed set of known table names.
    query = f'SELECT * FROM "{name}"'

    if isinstance(db, str) and "://" not in db:
        # A bare path: manage the connection ourselves so that it is always closed.
        with closing(sqlite3.connect(db)) as connection:
            return pd.read_sql_query(query, connection)

    # An already-open sqlite3 connection (left open for the caller) or a database URI string.
    return pd.read_sql_query(query, db)