Added "nfts.dataset" module

2021-10-07 20:32:03 -07:00 · 2021-10-07 20:32:03 -07:00 · ab04ae3917
commit ab04ae3917
--- a/datasets/nfts/.gitignore
+++ b/datasets/nfts/.gitignore
@ -164,3 +164,4 @@ cython_debug/
 .nfts/
 venv/
 .secrets/
+.analysis/
--- a/datasets/nfts/nfts/dataset.py
+++ b/datasets/nfts/nfts/dataset.py
@ -0,0 +1,118 @@
+"""
+Functions to access various data in the NFTs dataset.
+"""
+import sqlite3
+from typing import Union
+
+import pandas as pd
+
+from .datastore import event_tables, EventType
+
+# TODO(zomglings): Make it so that table names are parametrized by importable variables. The way
+# things are now, we have to be very careful if we ever rename a table in our dataset. We should
+# also propagate the name change here.
+NFTS = "nfts"
+MINTS = event_tables[EventType.MINT]
+TRANSFERS = event_tables[EventType.TRANSFER]
+CURRENT_OWNERS = "current_owners"
+CURRENT_MARKET_VALUES = "current_market_values"
+TRANSFER_STATISTICS_BY_ADDRESS = "transfer_statistics_by_address"
+MINT_HOLDING_TIMES = "mint_holding_times"
+TRANSFER_HOLDING_TIMES = "transfer_holding_times"
+
+AVAILABLE_DATAFRAMES = {
+    NFTS: """Describes the NFT contracts represented in this dataset, with a name and symbol if they were available at time of crawl.
+
+Columns:
+1. address: The Ethereum address of the NFT contract.
+2. name: The name of the collection of NFTs that the contract represents.
+3. symbol: The symbol of the collection of NFTs that the contract represents.
+""",
+    MINTS: """All token mint events crawled in this dataset.
+
+Columns:
+1. event_id: A unique event ID associated with the event.
+2. transaction_hash: The hash of the transaction which triggered the event.
+3. block_number: The transaction block in which the transaction was mined.
+4. nft_address: The address of the NFT collection containing the minted token.
+5. token_id: The ID of the token that was minted.
+6. from_address: The "from" address for the transfer event. For a mint, this should be the 0 address: 0x0000000000000000000000000000000000000000.
+7. to_address: The "to" address for the transfer event. This represents the owner of the freshly minted token.
+8. transaction_value: The amount of WEI that were sent with the transaction in which the token was minted.
+9. timestamp: The time at which the mint operation was mined into the blockchain (this is the timestamp for the mined block).
+""",
+    TRANSFERS: """All token transfer events crawled in this dataset.
+
+Columns:
+1. event_id: A unique event ID associated with the event.
+2. transaction_hash: The hash of the transaction which triggered the event.
+3. block_number: The transaction block in which the transaction was mined.
+4. nft_address: The address of the NFT collection containing the transferred token.
+5. token_id: The ID of the token that was transferred.
+6. from_address: The "from" address for the transfer event. This is the address that owned the token at the *start* of the transfer.
+7. to_address: The "to" address for the transfer event. This is the address that owned the token at the *end* of the transfer.
+8. transaction_value: The amount of WEI that were sent with the transaction in which the token was transferred.
+9. timestamp: The time at which the transfer operation was mined into the blockchain (this is the timestamp for the mined block).
+""",
+    CURRENT_OWNERS: f"""This table is derived from the {NFTS}, {MINTS}, and {TRANSFERS} tables. It represents the current owner of each token in the dataset.
+
+Columns:
+1. nft_address: The address of the NFT collection containing the token whose ownership we are denoting.
+2. token_id: The ID of the token (inside the collection) whose ownership we are denoting.
+3. owner: The address that owned the token at the time of construction of this dataset.
+""",
+    CURRENT_MARKET_VALUES: f"""This table is derived from the {NFTS}, {MINTS}, and {TRANSFERS} tables. It represents the current market value (in WEI) of each token in the dataset.
+
+Columns:
+1. nft_address: The address of the NFT collection containing the token whose market value we are denoting.
+2. token_id: The ID of the token (inside the collection) whose market value we are denoting.
+3. market_value: The estimated market value of the token at the time of construction of this dataset.
+
+For this dataset, we estimate the market value as the last non-zero transaction value for a transfer involving this token.
+This estimate may be inaccurate for some transfers (e.g. multiple token transfers made by an escrow contract in a single transaction)
+but ought to be reasonably accurate for a large majority of tokens.
+""",
+    TRANSFER_STATISTICS_BY_ADDRESS: f"""This table is derived from the {NFTS}, {MINTS}, and {TRANSFERS} tables. For each address that participated in
+at least one NFT transfer between April 1, 2021 and September 25, 2021, this table shows exactly how many NFTs that address transferred to
+other addresses and how many NFT transfers that address was the recipient of.
+
+Columns:
+1. address: An Ethereum address that participated in at least one NFT transfer between April 1, 2021 and September 25, 2021.
+2. transfers_out: The number of NFTs that the given address transferred to any other address between April 1, 2021 and September 25, 2021.
+3. transfers_in: The number of NFTs that any other address transferred to given address between April 1, 2021 and September 25, 2021.
+""",
+}
+
+
+def explain() -> None:
+    """
+    Explains the structure of the dataset.
+    """
+    preamble = """
+The Moonstream NFTs dataset
+===========================
+
+This dataset consists of the following dataframes:"""
+
+    print(preamble)
+    for name, explanation in AVAILABLE_DATAFRAMES.items():
+        print(f"\nDataframe: {name}")
+        print(
+            f"Load using:\n\t{name}_df = nfts.dataset.load_dataframe(<sqlite connection or path to sqlite db>, {name})"
+        )
+        print("")
+        print(explanation)
+        print("- - -")
+
+
+def load_dataframe(db: Union[str, sqlite3.Connection], name: str) -> pd.DataFrame:
+    """
+    Loads one of the available dataframes. To learn more about the available dataframes, run:
+    >>> nfts.dataset.explain()
+    """
+    if name not in AVAILABLE_DATAFRAMES:
+        raise ValueError(
+            f"Invalid dataframe: {name}. Please choose from one of the available dataframes: {','.join(AVAILABLE_DATAFRAMES)}."
+        )
+    df = pd.read_sql_table(name, db)
+    return df
--- a/datasets/nfts/setup.py
+++ b/datasets/nfts/setup.py
@ -33,9 +33,10 @@ setup(
    install_requires=[
        "moonstreamdb",
        "humbug",
+        "pandas",
+        "requests",
        "tqdm",
        "web3",
-        "requests",
    ],
    extras_require={
        "dev": ["black", "mypy", "types-requests"],