Python implementation of Ethereum Signature Database crawler

smart-contract-crawlers
Neeraj Kashyap 2021-07-27 00:52:14 -07:00
rodzic 8acebb7c8a
commit ef9c73c554
12 zmienionych plików z 260 dodań i 313 usunięć

Wyświetl plik

@ -1,197 +0,0 @@
# Created by https://www.toptal.com/developers/gitignore/api/node,linux,windows,osx
# Edit at https://www.toptal.com/developers/gitignore?templates=node,linux,windows,osx
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
.env.production
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
### OSX ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
# End of https://www.toptal.com/developers/gitignore/api/node,linux,windows,osx
package-lock.json

Wyświetl plik

@ -1 +0,0 @@
v14.17.3

Wyświetl plik

@ -1,42 +0,0 @@
const fetch = require('node-fetch')
const fs = require("fs")
// Returns a promise that resolves after `ms` milliseconds; await it to pause.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Fetches a URL and parses the response body as JSON.
 *
 * @param {string} url - URL to request.
 * @param {number} [sleepTime] - Optional delay in milliseconds applied before
 *   the request, used to rate-limit crawling.
 * @returns {Promise<object>} Parsed JSON response body.
 * @throws {Error} If the server responds with a non-2xx status.
 */
async function makeRequest(url, sleepTime) {
  if (sleepTime) {
    // BUG FIX: the original called sleep(sleepTime) without awaiting it,
    // so the rate-limiting delay never actually happened.
    await sleep(sleepTime);
  }
  const response = await fetch(url);
  // node-fetch does not reject on HTTP error statuses; fail loudly instead of
  // parsing an error page as if it were a results page.
  if (!response.ok) {
    throw new Error(`Request to ${url} failed with status ${response.status}`);
  }
  return response.json();
}
// Pages through the 4byte.directory function-signature API, accumulating every
// record, then dumps the full list to function_signatures.json.
async function crawlFunctionSignatures() {
  const firstPage = "https://www.4byte.directory/api/v1/signatures/";
  let page = await makeRequest(firstPage);
  let collected = page.results;
  while (page.next) {
    page = await makeRequest(page.next);
    collected = collected.concat(page.results);
    console.log(`already crawled : ${collected.length}`);
  }
  fs.writeFileSync("function_signatures.json", JSON.stringify(collected));
}
// Pages through the 4byte.directory event-signature API, accumulating every
// record, then dumps the full list to event_signatures.json.
async function crawlEventSignatures() {
  const firstPage = "https://www.4byte.directory/api/v1/event-signatures/";
  let page = await makeRequest(firstPage);
  let collected = page.results;
  while (page.next) {
    page = await makeRequest(page.next);
    collected = collected.concat(page.results);
    console.log(`already crawled : ${collected.length}`);
  }
  fs.writeFileSync("event_signatures.json", JSON.stringify(collected));
}
//crawlFunctionSignatures()
//crawlEventSignatures();

Wyświetl plik

@ -1,57 +0,0 @@
const sqlite3 = require('sqlite3').verbose()
const fs = require('fs')
// Open the local SQLite database that holds the crawled signatures.
// OPEN_READWRITE (without OPEN_CREATE) means ./signatures.db must already
// exist; the error callback only logs — callers below will still try to use
// `db` even if the open failed.
let db = new sqlite3.Database('./signatures.db', sqlite3.OPEN_READWRITE, (err) => {
if (err) {
console.error(err.message);
}
});
/**
 * Loads crawled function signatures from function_signatures.json into the
 * function_signatures table, creating the table first if it does not exist.
 *
 * BUG FIX: node-sqlite3 reports insert errors asynchronously through the
 * run() callback, so the original try/catch around stmt.run() could never
 * fire; errors are now handled in the callback. The prepared statement is
 * also finalized, which the original never did.
 */
function put_func_signatures_to_db() {
  db.serialize(() => {
    db.run('create table if not exists '
      + 'function_signatures('
      + 'id numeric primary key,'
      + 'text_signature text,'
      + 'hex_signature text)')
    const stmt = db.prepare('insert into function_signatures values (?, ?, ?)')
    const function_signatures = JSON.parse(fs.readFileSync("./function_signatures.json"))
    function_signatures.forEach((item) => {
      stmt.run([item.id, item.text_signature, item.hex_signature], (err) => {
        if (err) {
          // Log the offending row alongside the error, as before.
          console.log(item)
          console.log(err)
        }
      })
    })
    // Release the prepared statement once all inserts are queued.
    stmt.finalize()
  })
}
/**
 * Loads crawled event signatures from event_signatures.json into the
 * event_signatures table, creating the table first if it does not exist.
 *
 * BUG FIX: node-sqlite3 reports insert errors asynchronously through the
 * run() callback, so the original try/catch around stmt.run() could never
 * fire; errors are now handled in the callback. The prepared statement is
 * also finalized, which the original never did.
 */
function put_event_signatures_to_db() {
  db.serialize(() => {
    db.run('create table if not exists '
      + 'event_signatures('
      + 'id numeric primary key,'
      + 'text_signature text,'
      + 'hex_signature text)')
    const stmt = db.prepare('insert into event_signatures values (?, ?, ?)')
    const event_signatures = JSON.parse(fs.readFileSync("./event_signatures.json"))
    event_signatures.forEach((item) => {
      stmt.run([item.id, item.text_signature, item.hex_signature], (err) => {
        if (err) {
          // Log the offending row alongside the error, as before.
          console.log(item)
          console.log(err)
        }
      })
    })
    // Release the prepared statement once all inserts are queued.
    stmt.finalize()
  })
}
//put_func_signatures_to_db()
//put_event_signatures_to_db()

Wyświetl plik

@ -1,16 +0,0 @@
{
"name": "@bugout/4byteCrawler",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"csv-parser": "^3.0.0",
"node-fetch": "^2.6.1",
"sqlite3": "^5.0.2"
}
}

Plik binarny nie jest wyświetlany.

171
crawlers/esd/.gitignore vendored 100644
Wyświetl plik

@ -0,0 +1,171 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace
# Local History for Visual Studio Code
.history/
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
# Custom
dev.env
prod.env
alembic.dev.ini
alembic.prod.ini
.db/
.venv/
.esd/
.secrets/

Wyświetl plik

@ -0,0 +1,5 @@
# Crawler: Ethereum Signature Database
This crawler retrieves Ethereum function and event signatures from the Ethereum Signature Database at
[https://4byte.directory](https://4byte.directory).

Wyświetl plik

@ -0,0 +1,68 @@
import argparse
import sys
import time
from typing import Optional, Union
from moonstreamdb.db import yield_db_session_ctx
from moonstreamdb.models import ESDEventSignature, ESDFunctionSignature
from sqlalchemy.orm import Session
import requests
CRAWL_URLS = {
"functions": "https://www.4byte.directory/api/v1/signatures/",
"events": "https://www.4byte.directory/api/v1/event-signatures/",
}
DB_MODELS = {
"functions": ESDFunctionSignature,
"events": ESDEventSignature,
}
def crawl_step(
    db_session: Session,
    crawl_url: str,
    db_model: Union[ESDEventSignature, ESDFunctionSignature],
) -> Optional[str]:
    """Fetch one page of the Ethereum Signature Database API and persist its
    results, retrying the request up to 3 times with exponential backoff.

    Args:
        db_session: Active SQLAlchemy session used to save the signatures.
        crawl_url: URL of the API page to fetch.
        db_model: ORM model class (function or event signature) to populate.

    Returns:
        URL of the next page to crawl, or None if there is no next page or the
        request ultimately failed.
    """
    attempt = 0
    current_interval = 2
    success = False
    response: Optional[requests.Response] = None
    while (not success) and attempt < 3:
        attempt += 1
        try:
            response = requests.get(crawl_url)
            response.raise_for_status()
            success = True
        except requests.RequestException:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed. Back off exponentially before retrying.
            current_interval *= 2
            time.sleep(current_interval)

    # BUG FIX: the original only checked `response is None`, but
    # raise_for_status() assigns `response` *before* raising, so a page that
    # failed with an HTTP error status was still processed. Check `success`.
    if not success or response is None:
        print(f"Could not process URL: {crawl_url}", file=sys.stderr)
        return None

    page = response.json()
    results = page.get("results", [])
    rows = [
        db_model(
            id=row.get("id"),
            text_signature=row.get("text_signature"),
            hex_signature=row.get("hex_signature"),
            created_at=row.get("created_at"),
        )
        for row in results
    ]
    db_session.bulk_save_objects(rows)
    db_session.commit()
    return page.get("next")
def crawl(crawl_type: str, interval: float) -> None:
    """Crawl every page of the chosen signature API ("functions" or "events"),
    sleeping `interval` seconds between requests, until no pages remain."""
    db_model = DB_MODELS[crawl_type]
    next_url: Optional[str] = CRAWL_URLS[crawl_type]
    with yield_db_session_ctx() as db_session:
        while next_url is not None:
            print(f"Crawling: {next_url}")
            next_url = crawl_step(db_session, next_url, db_model)
            time.sleep(interval)
def main():
    """Parse command-line arguments and start the requested crawl."""
    parser = argparse.ArgumentParser(
        description="Crawls function and event signatures from the Ethereum Signature Database (https://www.4byte.directory/)"
    )
    parser.add_argument(
        "crawl_type",
        choices=CRAWL_URLS,
        help="Specifies whether to crawl function signatures or event signatures",
    )
    parser.add_argument(
        "--interval",
        type=float,
        default=0.1,
        help="Number of seconds to wait between requests to the Ethereum Signature Database API",
    )
    args = parser.parse_args()
    crawl(args.crawl_type, args.interval)


if __name__ == "__main__":
    main()

Wyświetl plik

@ -0,0 +1,15 @@
alembic==1.6.5
certifi==2021.5.30
charset-normalizer==2.0.3
greenlet==1.1.0
idna==3.2
Mako==1.1.4
MarkupSafe==2.0.1
-e git+ssh://git@github.com/zomglings/moonstock.git@8acebb7c8a1872cd0a9c2b663f86be3877a20636#egg=moonstreamdb&subdirectory=db
psycopg2-binary==2.9.1
python-dateutil==2.8.2
python-editor==1.0.4
requests==2.26.0
six==1.16.0
SQLAlchemy==1.4.22
urllib3==1.26.6

Wyświetl plik

@ -0,0 +1 @@
export EXPLORATION_DB_URI="postgresql://<username>:<password>@<db_host>:<db_port>/<db_name>"