Commit 19a100f: Initial commit
RedLicorice committed Nov 24, 2021 (0 parents)
Showing 25 changed files with 1,226 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.venv/
data/
.env
4 changes: 4 additions & 0 deletions config.yaml
@@ -0,0 +1,4 @@
database:
  sql:
    uri: !ENV 'sqlite:///feature_store.db'
    chunksize: 10
7 changes: 7 additions & 0 deletions readme.md
@@ -0,0 +1,7 @@
# CryptoML

Current functionality:
- Build datasets from bootstrap data

Backlog:
- Update datasets with new data records
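
Usage sketch (the commands are defined in `src/cli.py` below; the `btc`/`usd` arguments and the `0.8` training fraction are illustrative, and the relative `../data` paths mean the commands should be run from inside `src/`):
- `python cli.py bootstrap btc usd`
- `python cli.py selection btc usd 0.8`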
13 changes: 13 additions & 0 deletions requirements.txt
@@ -0,0 +1,13 @@
typer
xgboost
pandas
statsmodels
scikit-learn
scipy
numpy
requests
pyyaml
git+https://github.com/RedLicorice/pyti.git
sqlalchemy
confuse
python-dotenv
109 changes: 109 additions & 0 deletions src/cli.py
@@ -0,0 +1,109 @@
import typer

app = typer.Typer()


@app.command(name='bootstrap', help='Bootstrap dataset with data from zip files in data/bootstrap')
def build_dataset(symbol: str, currency: str):
    target_name = '../data/dataset-{symbol}{currency}'.format(symbol=symbol, currency=currency)
    from crawlers import kraken, coinmetrics
    _kraken = kraken.get_bootstrap_data(symbol, currency).fillna(method='ffill')
    _coinmetrics = coinmetrics.get_bootstrap_data(symbol)

    ohlcv = kraken.ticks_to_ohlcv(_kraken, '1D').fillna(method='ffill')

    from dataset import build, get_feature_metadata, make_target
    import pandas as pd
    result = build(ohlcv=ohlcv, coinmetrics=_coinmetrics, W=10)
    result.to_csv(target_name + '.csv', index_label='timestamp')

    _begin, _end, _features = get_feature_metadata(result)
    meta = pd.DataFrame.from_records(_features)
    meta.index = meta['name']
    meta.drop(labels='name', axis='columns', inplace=True)
    meta.to_csv(target_name + '.meta.csv', index_label='feature')

    target = make_target(ohlcv)
    target.to_csv(target_name + '.target.csv', index_label='timestamp')

    info = {
        'symbol': symbol,
        'currency': currency,
        'interval': '1D',
        'records': result.shape[0],
        'n_features': result.shape[1],  # distinct key so the 'features' enable-map below does not overwrite it
        'index_min': result.index.min().to_pydatetime().isoformat(),
        'index_max': result.index.max().to_pydatetime().isoformat(),
        'valid_index_min': _begin,
        'valid_index_max': _end,
        'targets': {str(k): k == 'class' for k in target.columns},
        'features': {str(k): True for k in meta.index}
    }
    with open(target_name + '.info.yaml', 'w') as f:
        import yaml
        yaml.dump(info, f, sort_keys=False)
    print('done')

@app.command(name='selection', help='Perform feature selection and update <dataset>.info.yaml with selected features')
def selection(symbol: str, currency: str, percent: float):
    target_name = '../data/dataset-{symbol}{currency}'.format(symbol=symbol, currency=currency)
    import copy
    import math
    import pandas as pd
    from crawlers import load_yaml

    info = load_yaml(target_name + '.info.yaml')
    dataset = pd.read_csv(target_name + '.csv', parse_dates=True, index_col='timestamp')
    target = pd.read_csv(target_name + '.target.csv', parse_dates=True, index_col='timestamp')
    first_valid_i = dataset.index.get_loc(info.valid_index_min)
    last_valid_i = dataset.index.get_loc(info.valid_index_max)

    training_records = math.floor((last_valid_i - first_valid_i) * percent)
    dataset['label'] = target['class']
    training_dataset = dataset.iloc[first_valid_i:first_valid_i + training_records]
    # testing_dataset = dataset.iloc[first_valid_i+training_records: last_valid_i+1]

    from xgboost import XGBClassifier
    from util.selection_pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    pipeline = Pipeline(steps=[
        ('s', StandardScaler()),
        ('i', SimpleImputer()),
        ('c', XGBClassifier(use_label_encoder=False))
    ])

    X_train = training_dataset.drop(labels=['label'], axis='columns')
    with pd.option_context('mode.use_inf_as_na', True):  # treat +/-inf as NaN while filling
        X_train.fillna(axis='columns', method='ffill', inplace=True)
    y_train = training_dataset['label']

    from sklearn.feature_selection import SelectFromModel
    sel = SelectFromModel(pipeline)
    sel.fit(X_train, y_train)
    support = sel.get_support()

    # Deep-copy before mutating, so the .bak written below keeps the original 'features' map
    dinfo = copy.deepcopy(info.to_dict())
    for c, mask in zip(X_train.columns, support):
        dinfo['features'][c] = bool(mask)

    import yaml
    with open(target_name + '.info.yaml', 'w') as f:
        yaml.dump(dinfo, f, sort_keys=False)
    with open(target_name + '.info.yaml.bak', 'w') as f:
        yaml.dump(info.to_dict(), f, sort_keys=False)
    print('done')



@app.command()
def test(symbol: str, currency: str):
    from dataset import make_ohlcv_ta
    from crawlers import kraken
    _kraken = kraken.get_bootstrap_data(symbol, currency).fillna(method='ffill')
    ohlcv = kraken.ticks_to_ohlcv(_kraken, '1D').fillna(method='ffill')
    ohlcv_ta = make_ohlcv_ta(ohlcv)
    print('It works')
    print(ohlcv_ta.head())


if __name__ == '__main__':
    app()
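
A note on the util.selection_pipeline import in the selection command: sklearn's SelectFromModel needs its estimator to expose feature_importances_ or coef_, which a stock sklearn.pipeline.Pipeline does not. util/selection_pipeline.py is not among the diffs loaded on this page, so the following is only a minimal sketch of such a wrapper, not the committed implementation:

from sklearn.pipeline import Pipeline as SkPipeline

class Pipeline(SkPipeline):
    # Expose the final step's importances so SelectFromModel can rank features.
    @property
    def feature_importances_(self):
        return self.steps[-1][1].feature_importances_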
32 changes: 32 additions & 0 deletions src/config.py
@@ -0,0 +1,32 @@
import confuse
from dotenv import load_dotenv
import re, os, yaml

# Load configuration; .env values become visible to the !ENV constructor below
env_variable_pattern = re.compile(r'.*?\$\{(\w+)\}.*?')
_configfile = None
load_dotenv()


def replace_env_variables(loader, node):
    """
    Expands ${VAR} references in the node's value with environment variables.
    :param yaml.Loader loader: the yaml loader
    :param node: the current node in the yaml
    :return: the parsed string with environment variables substituted
    """
    value = loader.construct_scalar(node)
    match = env_variable_pattern.findall(value)  # find all env variables in the line
    if match:
        full_value = value
        for g in match:
            full_value = full_value.replace(
                f'${{{g}}}', os.environ.get(g, g)
            )
        return full_value if not full_value.isnumeric() else int(full_value)
    return value


confuse.Loader.add_constructor('!ENV', replace_env_variables)
# config = confuse.Configuration('CryptoML-API', __name__)
config = confuse.LazyConfig('CryptoML', __name__)
config.set_file('../config.yaml')
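
To illustrate the !ENV constructor, here is a standalone sketch of the same substitution pattern; the DB_USER variable and the postgres URI are hypothetical (the committed config.yaml uses a plain sqlite URI with no ${} placeholders):

import os, re, yaml

pattern = re.compile(r'.*?\$\{(\w+)\}.*?')

def env_constructor(loader, node):
    value = loader.construct_scalar(node)
    for var in pattern.findall(value):
        value = value.replace(f'${{{var}}}', os.environ.get(var, var))
    return value

yaml.SafeLoader.add_constructor('!ENV', env_constructor)
os.environ['DB_USER'] = 'alice'  # hypothetical variable, set only for the demo
print(yaml.safe_load("uri: !ENV 'postgres://${DB_USER}@localhost/db'"))
# {'uri': 'postgres://alice@localhost/db'}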
90 changes: 90 additions & 0 deletions src/crawlers/__init__.py
@@ -0,0 +1,90 @@
import yaml
from util.bunch import Bunch


class Spec(dict):
    def __init__(self, base_url, endpoints, **kwargs):
        if not isinstance(endpoints, dict):
            raise ValueError('Endpoints must be a dictionary!')
        self.base_url = base_url
        self.endpoints = endpoints
        super().__init__(kwargs)

    @staticmethod
    def from_dict(spec):
        if not spec:
            raise ValueError('Provided spec is invalid.')
        if 'base_url' not in spec:
            raise ValueError('Provided spec does not describe a base url.')
        if 'endpoints' not in spec:
            raise ValueError('Provided spec does not describe any endpoints.')
        return Spec(base_url=spec['base_url'], endpoints=spec['endpoints'])

    def __dir__(self):
        return self.endpoints.keys()

    def __setstate__(self, state):
        pass

    def __setattr__(self, key, value):
        if key in ['endpoints', 'base_url']:
            self[key] = value
        else:
            self.endpoints[key] = value

    def __getattr__(self, key):
        if key in ['endpoints', 'base_url']:
            return self[key]
        try:
            query = self.endpoints[key]
            if '{' in query and '}' in query:
                # Parametrized endpoint: return the bound format method so the
                # caller can fill in the placeholders
                return (self.base_url + query).format
            return self.base_url + query
        except KeyError:
            raise AttributeError('Method not described in api spec: ' + key)


def load_api_spec(filename):
    with open(filename, 'r') as f:
        try:
            spec = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            print('error loading api spec: ' + str(exc))
            return None
    if spec:
        return Spec.from_dict(spec)


def load_yaml(filename):
    with open(filename, 'r') as f:
        try:
            data = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            print('error loading yaml: ' + str(exc))
            return None
    if data:
        return Bunch(**data)


def save_yaml(filename, data):
    with open(filename, 'w') as f:
        try:
            yaml.safe_dump(data=data, stream=f)
        except yaml.YAMLError as exc:
            print('error saving yaml: ' + str(exc))


def load_bootstrap(zip_file, csv_file):
    import zipfile
    import pandas as pd
    with zipfile.ZipFile(zip_file) as z:
        with z.open(csv_file) as f:
            train = pd.read_csv(f, delimiter=",", parse_dates=True, index_col='date')
    return train


def bootstrap_index(filename):
    index = load_yaml(filename)
    return index.bootstrap


def load_transformer(filename):
    import importlib.util
    import ntpath

    # Import the transformer module directly from its file path inside data/bootstrap
    spec = importlib.util.spec_from_file_location("bootstrap." + ntpath.basename(filename)[:-3], filename)
    transformer = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(transformer)
    return transformer
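
In practice, endpoints without placeholders resolve directly to URLs, while parametrized endpoints return a bound str.format to be called with the placeholder values. A short demonstration against a made-up spec:

spec = Spec.from_dict({
    'base_url': 'https://api.example.com',  # hypothetical API
    'endpoints': {
        'status': '/status',
        'ticker': '/ticker?pair={pair}',
    },
})
print(spec.status)                 # https://api.example.com/status
print(spec.ticker(pair='XBTUSD'))  # https://api.example.com/ticker?pair=XBTUSD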
6 changes: 6 additions & 0 deletions src/crawlers/coinmetrics-community.yaml
@@ -0,0 +1,6 @@
base_url: 'https://community-api.coinmetrics.io/v4'
api_key: ''
endpoints:
  asset_metadata: '/catalog/assets?assets={assets}&pretty=false'
  metrics_timeseries: '/timeseries/asset-metrics?assets={assets}&metrics={metrics}&frequency={frequency}&status=all&start_time={begin}&end_time={end}&end_inclusive=true&timezone=Europe/Rome'

62 changes: 62 additions & 0 deletions src/crawlers/coinmetrics.py
@@ -0,0 +1,62 @@
def api():
    from . import load_api_spec
    return load_api_spec('crawlers/coinmetrics-community.yaml')


def get_assets(assets):
    if isinstance(assets, list):
        assets = ','.join(assets)
    import requests
    resp = requests.get(api().asset_metadata(assets=assets))
    return resp.json()['data']


def get_asset_features(asset_name, frequency):
    assets = get_assets(asset_name)
    result = []
    idx = 1
    for asset in assets:
        for metric in asset['metrics']:
            for f in metric['frequencies']:
                if frequency == f['frequency']:
                    result.append({
                        'index': idx,
                        'dataset': 'coinmetrics',
                        'asset': asset['asset'],
                        'name': metric['metric'],
                        'min': f['min_time'],
                        'max': f['max_time'],
                        'enabled': True
                    })
                    idx += 1
    return result


def get_asset_metrics(asset_name, metrics, frequency, begin, end):
    if isinstance(metrics, list):
        metrics = ','.join(metrics)
    import requests
    resp = requests.get(api().metrics_timeseries(assets=asset_name, metrics=metrics, frequency=frequency, begin=begin, end=end))
    rj = resp.json()
    result = rj['data']
    import time
    # Follow pagination links, throttling requests to stay within rate limits
    while 'next_page_url' in rj:
        time.sleep(0.5)
        resp = requests.get(rj['next_page_url'])
        rj = resp.json()
        result += rj['data']
    return result


def get_bootstrap_data(symbol):
    symbol = symbol.lower()

    from . import bootstrap_index, load_transformer
    try:
        index = bootstrap_index('../data/bootstrap/index.yaml')
        transformer = load_transformer('../data/bootstrap/' + index.coinmetrics.transformer)
        if symbol not in index.coinmetrics.groups:
            filename = index.coinmetrics.name_format.format(symbol=symbol) + '.csv'
            return transformer.get_df('../data/bootstrap/' + index.coinmetrics.zipfile, filename)
        else:
            # Grouped symbols span multiple CSVs; load and concatenate them
            filenames = [index.coinmetrics.name_format.format(symbol=symbol) + '.csv']
            filenames += [name + '.csv' for name in index.coinmetrics.groups[symbol]]
            dataframes = [transformer.get_df('../data/bootstrap/' + index.coinmetrics.zipfile, filename) for filename in filenames]

            import pandas as pd
            return pd.concat(dataframes)
    except Exception as e:
        print('Exception occurred! ' + str(e))
        raise
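
Both bootstrap loaders read ../data/bootstrap/index.yaml, which is not among the diffs loaded on this page. Judging only from the attributes accessed here and in kraken.py (zipfile, transformer, name_format, groups), its shape would be roughly as below; every file name and group entry is a guess:

bootstrap:
  coinmetrics:
    zipfile: coinmetrics.zip          # hypothetical archive name
    transformer: coinmetrics_loader.py
    name_format: '{symbol}'
    groups: {}                        # symbol -> list of extra csv base names
  kraken:
    zipfile: kraken.zip
    transformer: kraken_loader.py
    name_format: '{symbol}{currency}'
    groups: null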
51 changes: 51 additions & 0 deletions src/crawlers/kraken.py
@@ -0,0 +1,51 @@
def api():
    from . import load_api_spec
    return load_api_spec('crawlers/kraken.yaml')


def get_pair_ohlc(pair, metrics, frequency, begin, end):
    if isinstance(metrics, list):
        metrics = ','.join(metrics)
    import requests
    resp = requests.get(api().ohlc_data(assets=pair, metrics=metrics, frequency=frequency, since=0))
    rj = resp.json()
    result = rj['data']
    import time
    # Follow pagination links, throttling requests to stay within rate limits
    while 'next_page_url' in rj:
        time.sleep(0.5)
        resp = requests.get(rj['next_page_url'])
        rj = resp.json()
        result += rj['data']
    return result


def get_bootstrap_data(symbol, currency):
    from . import bootstrap_index, load_transformer
    # Kraken uses legacy symbols for some assets
    _convert_map = {
        'btc': 'xbt',
        'doge': 'xdg'
    }
    if symbol in _convert_map:
        symbol = _convert_map[symbol]
    if currency in _convert_map:
        currency = _convert_map[currency]
    try:
        index = bootstrap_index('../data/bootstrap/index.yaml')
        transformer = load_transformer('../data/bootstrap/' + index.kraken.transformer)
        if index.kraken.groups:
            raise ValueError('Groups are not supported for the kraken loader')
        filename = index.kraken.name_format.format(symbol=symbol.upper(), currency=currency.upper()) + '.csv'
        return transformer.get_df('../data/bootstrap/' + index.kraken.zipfile, filename)
    except Exception as e:
        print('Exception occurred! ' + str(e))
        raise


def ticks_to_ohlcv(ticks, interval):
    # Resample raw trades into OHLC candles, summing traded amount as volume
    resample = ticks.resample(interval)
    ohlcv = resample['price'].ohlc()
    ohlcv['volume'] = resample['amount'].sum()
    return ohlcv
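
ticks_to_ohlcv expects trades indexed by timestamp with price and amount columns, as the bootstrap transformer is assumed to provide. A tiny synthetic example of the resampling:

import pandas as pd

ticks = pd.DataFrame(
    {'price': [100.0, 101.5, 99.8, 102.0], 'amount': [0.5, 1.2, 0.3, 0.9]},
    index=pd.to_datetime(['2021-11-01 09:00', '2021-11-01 15:00',
                          '2021-11-02 10:00', '2021-11-02 18:00']),
)
print(ticks_to_ohlcv(ticks, '1D'))
#              open   high    low  close  volume
# 2021-11-01  100.0  101.5  100.0  101.5     1.7
# 2021-11-02   99.8  102.0   99.8  102.0     1.2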
(Diffs for the remaining files in this commit were not loaded on the page.)