diff --git a/crawlers/4byteCrawler/.gitignore b/crawlers/4byteCrawler/.gitignore deleted file mode 100644 index 3095d4d8..00000000 --- a/crawlers/4byteCrawler/.gitignore +++ /dev/null @@ -1,197 +0,0 @@ - -# Created by https://www.toptal.com/developers/gitignore/api/node,linux,windows,osx -# Edit at https://www.toptal.com/developers/gitignore?templates=node,linux,windows,osx - -### Linux ### -*~ - -# temporary files which can be created if a process still has a handle open of a deleted file -.fuse_hidden* - -# KDE directory preferences -.directory - -# Linux trash folder which might appear on any partition or disk -.Trash-* - -# .nfs files are created when an open file is removed but is still being accessed -.nfs* - -### Node ### -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -lerna-debug.log* -.pnpm-debug.log* - -# Diagnostic reports (https://nodejs.org/api/report.html) -report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json - -# Runtime data -pids -*.pid -*.seed -*.pid.lock - -# Directory for instrumented libs generated by jscoverage/JSCover -lib-cov - -# Coverage directory used by tools like istanbul -coverage -*.lcov - -# nyc test coverage -.nyc_output - -# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) -.grunt - -# Bower dependency directory (https://bower.io/) -bower_components - -# node-waf configuration -.lock-wscript - -# Compiled binary addons (https://nodejs.org/api/addons.html) -build/Release - -# Dependency directories -node_modules/ -jspm_packages/ - -# Snowpack dependency directory (https://snowpack.dev/) -web_modules/ - -# TypeScript cache -*.tsbuildinfo - -# Optional npm cache directory -.npm - -# Optional eslint cache -.eslintcache - -# Microbundle cache -.rpt2_cache/ -.rts2_cache_cjs/ -.rts2_cache_es/ -.rts2_cache_umd/ - -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz - -# Yarn Integrity file -.yarn-integrity - -# dotenv environment variables file -.env -.env.test -.env.production - -# parcel-bundler cache (https://parceljs.org/) -.cache -.parcel-cache - -# Next.js build output -.next -out - -# Nuxt.js build / generate output -.nuxt -dist - -# Gatsby files -.cache/ -# Comment in the public line in if your project uses Gatsby and not Next.js -# https://nextjs.org/blog/next-9-1#public-directory-support -# public - -# vuepress build output -.vuepress/dist - -# Serverless directories -.serverless/ - -# FuseBox cache -.fusebox/ - -# DynamoDB Local files -.dynamodb/ - -# TernJS port file -.tern-port - -# Stores VSCode versions used for testing VSCode extensions -.vscode-test - -# yarn v2 -.yarn/cache -.yarn/unplugged -.yarn/build-state.yml -.yarn/install-state.gz -.pnp.* - -### OSX ### -# General -.DS_Store -.AppleDouble -.LSOverride - -# Icon must end with two \r -Icon - - -# Thumbnails -._* - -# Files that might appear in the root of a volume -.DocumentRevisions-V100 -.fseventsd -.Spotlight-V100 -.TemporaryItems -.Trashes -.VolumeIcon.icns -.com.apple.timemachine.donotpresent - -# Directories potentially created on remote AFP share -.AppleDB -.AppleDesktop -Network Trash Folder -Temporary Items -.apdisk - -### Windows ### -# Windows thumbnail cache files -Thumbs.db -Thumbs.db:encryptable -ehthumbs.db -ehthumbs_vista.db - -# Dump file -*.stackdump - -# Folder config file -[Dd]esktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Windows Installer files -*.cab -*.msi -*.msix -*.msm -*.msp - -# Windows shortcuts -*.lnk - -# End of https://www.toptal.com/developers/gitignore/api/node,linux,windows,osx - -package-lock.json diff --git a/crawlers/4byteCrawler/.nvmrc b/crawlers/4byteCrawler/.nvmrc deleted file mode 100644 index a5e323ec..00000000 --- a/crawlers/4byteCrawler/.nvmrc +++ /dev/null @@ -1 +0,0 @@ -v14.17.3 diff --git a/crawlers/4byteCrawler/4byteDirectoryCrawler.js b/crawlers/4byteCrawler/4byteDirectoryCrawler.js deleted file mode 100644 index ef782ee1..00000000 --- a/crawlers/4byteCrawler/4byteDirectoryCrawler.js +++ /dev/null @@ -1,42 +0,0 @@ -const fetch = require('node-fetch') -const fs = require("fs") - - -function sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -async function makeRequest(url, sleepTime) { - if (sleepTime) - sleep(sleepTime) - let response = await fetch(url); - let json = await response.json(); - return json; -} - -async function crawlFunctionSignatures() { - let url = "https://www.4byte.directory/api/v1/signatures/"; - let response = await makeRequest(url) - let signatures = response.results; - while (response.next) { - response = await makeRequest(response.next) - signatures = signatures.concat(response.results) - console.log(`already crawled : ${signatures.length}`) - } - fs.writeFileSync("function_signatures.json", JSON.stringify(signatures)) -} - -async function crawlEventSignatures() { - let url = "https://www.4byte.directory/api/v1/event-signatures/"; - let response = await makeRequest(url) - let signatures = response.results; - while (response.next) { - response = await makeRequest(response.next) - signatures = signatures.concat(response.results) - console.log(`already crawled : ${signatures.length}`) - } - fs.writeFileSync("event_signatures.json", JSON.stringify(signatures)) -} - -//crawlFunctionSignatures() -//crawlEventSignatures(); diff --git a/crawlers/4byteCrawler/README.md b/crawlers/4byteCrawler/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/crawlers/4byteCrawler/json2Sqlite.js b/crawlers/4byteCrawler/json2Sqlite.js deleted file mode 100644 index 062b6d58..00000000 --- a/crawlers/4byteCrawler/json2Sqlite.js +++ /dev/null @@ -1,57 +0,0 @@ -const sqlite3 = require('sqlite3').verbose() -const fs = require('fs') -let db = new sqlite3.Database('./signatures.db', sqlite3.OPEN_READWRITE, (err) => { - if (err) { - console.error(err.message); - } - -}); - -function put_func_signatures_to_db() { - db.serialize(() => { - db.run('create table if not exists ' - + 'function_signatures(' - + 'id numeric primary key,' - + 'text_signature text,' - + 'hex_signature text)') - - let stmt = db.prepare('insert into function_signatures values (?, ?, ?)') - let function_signatures = JSON.parse(fs.readFileSync("./function_signatures.json")) - function_signatures.forEach((item) => { - try { - stmt.run([item.id, item.text_signature, item.hex_signature]) - - } - catch(err) { - console.log(item) - console.log(err) - } - }) - }) -} - -function put_event_signatures_to_db() { - db.serialize(() => { - db.run('create table if not exists ' - + 'event_signatures(' - + 'id numeric primary key,' - + 'text_signature text,' - + 'hex_signature text)') - - let stmt = db.prepare('insert into event_signatures values (?, ?, ?)') - let function_signatures = JSON.parse(fs.readFileSync("./event_signatures.json")) - function_signatures.forEach((item) => { - try { - stmt.run([item.id, item.text_signature, item.hex_signature]) - - } - catch(err) { - console.log(item) - console.log(err) - } - }) - }) -} - -//put_func_signatures_to_db() -//put_event_signatures_to_db() \ No newline at end of file diff --git a/crawlers/4byteCrawler/package.json b/crawlers/4byteCrawler/package.json deleted file mode 100644 index 5ef3f3e9..00000000 --- a/crawlers/4byteCrawler/package.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": "@bugout/4byteCrawler", - "version": "1.0.0", - "description": "", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "author": "", - "license": "ISC", - "dependencies": { - "csv-parser": "^3.0.0", - "node-fetch": "^2.6.1", - "sqlite3": "^5.0.2" - } -} diff --git a/crawlers/4byteCrawler/signatures.db b/crawlers/4byteCrawler/signatures.db deleted file mode 100644 index 09f2ece5..00000000 Binary files a/crawlers/4byteCrawler/signatures.db and /dev/null differ diff --git a/crawlers/esd/.gitignore b/crawlers/esd/.gitignore new file mode 100644 index 00000000..3754e2b9 --- /dev/null +++ b/crawlers/esd/.gitignore @@ -0,0 +1,171 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode + +# Custom +dev.env +prod.env +alembic.dev.ini +alembic.prod.ini +.db/ +.venv/ +.esd/ +.secrets/ diff --git a/crawlers/esd/README.md b/crawlers/esd/README.md new file mode 100644 index 00000000..11340eb9 --- /dev/null +++ b/crawlers/esd/README.md @@ -0,0 +1,5 @@ +# Crawler: Ethereum Signature Database + +This crawler retrieves Ethereum function signatures from the Ethereum Signature Database at +[https://4byte.directory](https://4byte.directory). + diff --git a/crawlers/esd/esd.py b/crawlers/esd/esd.py new file mode 100644 index 00000000..e2be8c44 --- /dev/null +++ b/crawlers/esd/esd.py @@ -0,0 +1,68 @@ +import argparse +import sys +import time +from typing import Optional, Union + +from moonstreamdb.db import yield_db_session_ctx +from moonstreamdb.models import ESDEventSignature, ESDFunctionSignature +from sqlalchemy.orm import Session +import requests + +CRAWL_URLS = { + "functions": "https://www.4byte.directory/api/v1/signatures/", + "events": "https://www.4byte.directory/api/v1/event-signatures/", +} + +DB_MODELS = { + "functions": ESDFunctionSignature, + "events": ESDEventSignature, +} + +def crawl_step(db_session: Session, crawl_url: str, db_model: Union[ESDEventSignature, ESDFunctionSignature]) -> Optional[str]: + attempt = 0 + current_interval = 2 + success = False + + response: Optional[requests.Response] = None + while (not success) and attempt < 3: + attempt += 1 + try: + response = requests.get(crawl_url) + response.raise_for_status() + success = True + except: + current_interval *= 2 + time.sleep(current_interval) + + if response is None: + print(f"Could not process URL: {crawl_url}", file=sys.stderr) + return None + + page = response.json() + results = page.get("results", []) + + rows = [db_model(id=row.get("id"), text_signature=row.get("text_signature"), hex_signature=row.get("hex_signature"), created_at=row.get("created_at")) for row in results] + db_session.bulk_save_objects(rows) + db_session.commit() + + return page.get("next") + +def crawl(crawl_type: str, interval: float) -> None: + crawl_url: Optional[str] = CRAWL_URLS[crawl_type] + db_model = DB_MODELS[crawl_type] + with yield_db_session_ctx() as db_session: + while crawl_url is not None: + print(f"Crawling: {crawl_url}") + crawl_url = crawl_step(db_session, crawl_url, db_model) + time.sleep(interval) + +def main(): + parser = argparse.ArgumentParser(description="Crawls function and event signatures from the Ethereum Signature Database (https://www.4byte.directory/)") + parser.add_argument("crawl_type", choices=CRAWL_URLS, help="Specifies whether to crawl function signatures or event signatures") + parser.add_argument("--interval", type=float, default=0.1, help="Number of seconds to wait between requests to the Ethereum Signature Database API") + args = parser.parse_args() + + crawl(args.crawl_type, args.interval) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crawlers/esd/requirements.txt b/crawlers/esd/requirements.txt new file mode 100644 index 00000000..f7a8f3c5 --- /dev/null +++ b/crawlers/esd/requirements.txt @@ -0,0 +1,15 @@ +alembic==1.6.5 +certifi==2021.5.30 +charset-normalizer==2.0.3 +greenlet==1.1.0 +idna==3.2 +Mako==1.1.4 +MarkupSafe==2.0.1 +-e git+ssh://git@github.com/zomglings/moonstock.git@8acebb7c8a1872cd0a9c2b663f86be3877a20636#egg=moonstreamdb&subdirectory=db +psycopg2-binary==2.9.1 +python-dateutil==2.8.2 +python-editor==1.0.4 +requests==2.26.0 +six==1.16.0 +SQLAlchemy==1.4.22 +urllib3==1.26.6 diff --git a/crawlers/esd/sample.env b/crawlers/esd/sample.env new file mode 100644 index 00000000..e5402a4a --- /dev/null +++ b/crawlers/esd/sample.env @@ -0,0 +1 @@ +export EXPLORATION_DB_URI="postgresql://:@:/"